Example #1
    def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
        # Given
        vectorizer_config = CooccurrenceVectorizerConfig(
            filter_stop_words=False)
        config = FeaturizerConfig(
            added_cooccurrence_feature_ratio=.3,
            cooccurrence_vectorizer_config=vectorizer_config)
        featurizer = Featurizer(config)
        mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
        utterances = [
            text_to_utterance("a b c d e"),
            text_to_utterance("f g h i j"),
            text_to_utterance("none"),
        ]

        mocked_vectorizer = MagicMock()
        mocked_vectorizer.idf_diag = range(10)

        featurizer.tfidf_vectorizer = mocked_vectorizer
        classes = [0, 0, 1]

        # When
        mocked_chi2.return_value = (None, [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] +
                                    [1.0 for _ in range(100)])
        featurizer._fit_cooccurrence_vectorizer(utterances, classes, 1,
                                                mocked_dataset)

        # Then
        expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
        self.assertDictEqual(expected_pairs,
                             featurizer.cooccurrence_vectorizer.word_pairs)
Example #2
    def test_limit_vocabulary(self):
        # Given
        vectorizer = TfidfVectorizer()
        dataset = get_empty_dataset("en")

        utterances = [
            text_to_utterance("5 55 6 66 666"),
            text_to_utterance("55 66")
        ]

        voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
        kept_unigrams = ["5", "6", "666"]
        vectorizer.fit(utterances, dataset)
        self.assertDictEqual(voca, vectorizer.vocabulary)
        diag = vectorizer.idf_diag.copy()

        # When
        vectorizer.limit_vocabulary(kept_unigrams)

        # Then
        expected_voca = {"5": 0, "6": 1, "666": 2}
        self.assertDictEqual(expected_voca, vectorizer.vocabulary)

        expected_diag = diag[[voca[u] for u in kept_unigrams]].tolist()
        self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())
Example #3
    def test_limit_vocabulary(self):
        # Given
        config = CooccurrenceVectorizerConfig(filter_stop_words=False)
        vectorizer = CooccurrenceVectorizer(config=config)
        train_data = [
            text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
        ]

        data = [text_to_utterance(t) for t in ("a c e", "a d e")]
        vectorizer.fit(train_data, get_empty_dataset("en"))
        x_0 = vectorizer.transform(data)
        pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
        kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
        self.assertDictEqual(pairs, vectorizer.word_pairs)

        # When
        kept_pairs_indexes = [pairs[p] for p in kept_pairs]
        vectorizer.limit_word_pairs(kept_pairs)

        # Then
        expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
        x_1 = vectorizer.transform(data)
        self.assertListEqual(x_0[:, kept_pairs_indexes].todense().tolist(),
                             x_1.todense().tolist())
Example #4
    def test_log_activation_weights(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo bar

---
type: intent
name: intent2
utterances:
  - lorem ipsum""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        shared = self.get_shared_data(dataset)
        intent_classifier = LogRegIntentClassifier(**shared)

        text = "yo"
        utterances = [text_to_utterance(text)]
        self.assertIsNone(intent_classifier.log_activation_weights(text, None))

        # When
        intent_classifier.fit(dataset)
        x = intent_classifier.featurizer.transform(utterances)[0]
        log = intent_classifier.log_activation_weights(text, x, top_n=42)

        # Then
        self.assertIsInstance(log, str)
        self.assertIn("Top 42", log)
Example #5
    def _get_intents(self, text, intents_filter):
        if isinstance(intents_filter, str):
            intents_filter = {intents_filter}
        elif isinstance(intents_filter, list):
            intents_filter = set(intents_filter)

        if not text or not self.intent_list or not self.featurizer:
            results = [intent_classification_result(None, 1.0)]
            results += [intent_classification_result(i, 0.0)
                        for i in self.intent_list if i is not None]
            return results

        if len(self.intent_list) == 1:
            return [intent_classification_result(self.intent_list[0], 1.0)]

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X)
        logger.debug(
            "%s", DifferedLoggingMessage(self.log_activation_weights, text, X))
        results = [
            intent_classification_result(i, proba)
            for i, proba in zip(self.intent_list, proba_vec[0])
            if intents_filter is None or i is None or i in intents_filter]

        return sorted(results, key=lambda res: -res[RES_PROBA])
Example #6
    def test_generate_noise_utterances(self, mocked_get_noise):
        # Given
        language = LANGUAGE_EN
        num_intents = 2
        noise_factor = 1
        utterances_length = 5

        noise = [str(i) for i in range(utterances_length)]
        mocked_get_noise.return_value = noise

        augmented_utterances = [
            {
                "data": [
                    {
                        "text": " ".join(
                            "{}".format(i) for i in range(utterances_length))
                    }
                ]
            }
        ]
        num_utterances = 10
        random_state = np.random.RandomState(1)

        augmented_utterances = augmented_utterances * num_utterances
        config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor)
        # When
        noise_utterances = generate_noise_utterances(
            augmented_utterances, noise, num_intents, config, language,
            random_state)

        # Then
        joined_noise = text_to_utterance(" ".join(noise))
        for u in noise_utterances:
            self.assertEqual(u, joined_noise)
Example #7
    def test_cooccurrence_vectorizer_should_persist(self):
        # Given
        x = [text_to_utterance("yoo yoo")]
        dataset = get_empty_dataset("en")
        shared = self.get_shared_data(dataset)
        vectorizer = CooccurrenceVectorizer(**shared).fit(x, dataset)
        vectorizer.builtin_entity_scope = {"snips/entity"}

        # When
        vectorizer.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        expected_metadata = {"unit_name": "cooccurrence_vectorizer"}
        self.assertJsonContent(metadata_path, expected_metadata)

        vectorizer_path = self.tmp_file_path / "vectorizer.json"
        expected_vectorizer = {
            "word_pairs": {
                "0": ["yoo", "yoo"]
            },
            "language_code": "en",
            "config": vectorizer.config.to_dict(),
            "builtin_entity_scope": ["snips/entity"]
        }
        self.assertJsonContent(vectorizer_path, expected_vectorizer)
Example #8
    def test_enrich_utterance(self):
        # Given
        u = text_to_utterance("a b c d e f")
        builtin_ents = [{
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }]
        custom_ents = [{
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }]

        vectorizer = CooccurrenceVectorizer()
        vectorizer._language = "en"

        # When
        preprocessed = vectorizer._enrich_utterance(u, builtin_ents,
                                                    custom_ents)

        # Then
        expected = ["a", "b", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"]
        self.assertSequenceEqual(expected, preprocessed)
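
The expected output suggests that _enrich_utterance replaces each token covered by an entity match with a placeholder derived from the entity kind. A minimal sketch of that assumed convention:

    def entity_placeholder(entity_kind):
        # Assumed mapping, inferred from the expected output above:
        # "the_c_entity" -> "THE_C_ENTITY"
        return entity_kind.upper()
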
Example #9
    def test_featurizer_should_exclude_replacement_string(self):
        # Given
        language = LANGUAGE_EN
        dataset = {
            "entities": {
                "dummy1": {
                    "utterances": {
                        "unknownword": "unknownword",
                        "what": "what"
                    }
                }
            }
        }
        replacement_string = "unknownword"
        featurizer = Featurizer(
            language,
            unknown_words_replacement_string=replacement_string,
            config=FeaturizerConfig())
        utterances = [text_to_utterance("hello dude")]
        y = np.array([1])

        # When
        featurizer.fit(dataset, utterances, y)

        # Then
        self.assertNotIn(replacement_string,
                         featurizer.entity_utterances_to_feature_names)
Example #10
    def test_empty_vocabulary_should_fit_and_return_none_intent(
            self, mocked_build_training):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent_1
utterances:
  - "[dummy_slot_name:dummy_entity_1](...)"
  
---
type: entity
name: dummy_entity_1
automatically_extensible: true
use_synonyms: false
matching_strictness: 1.0
values:
  - ...
""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        text = " "
        noise_size = 6
        utterances = [text] + [text] * noise_size
        utterances = [text_to_utterance(t) for t in utterances]
        labels = [0] + [1] * noise_size
        intent_list = ["dummy_intent_1", None]
        mocked_build_training.return_value = utterances, labels, intent_list

        # When / Then
        intent_classifier = LogRegIntentClassifier().fit(dataset)
        intent = intent_classifier.get_intent("no intent there")
        self.assertEqual(intent_classification_result(None, 1.0), intent)
Example #11
    def test_should_be_serializable(self):
        # Given

        dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        pvalue_threshold = 0.42
        config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                  added_cooccurrence_feature_ratio=0.2)
        shared = self.get_shared_data(dataset)
        featurizer = Featurizer(config=config, **shared)
        utterances = [
            text_to_utterance("this is the number"),
            text_to_utterance("yo")
        ]
        classes = np.array([0, 1])
        featurizer.fit(dataset, utterances, classes, max(classes))

        # When
        featurizer.persist(self.tmp_file_path)

        # Then
        expected_featurizer_dict = {
            "language_code": "en",
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }
        featurizer_dict_path = self.tmp_file_path / "featurizer.json"
        self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

        expected_metadata = {"unit_name": "featurizer"}
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, expected_metadata)

        tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
        self.assertTrue(tfidf_vectorizer_path.exists())

        cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
        self.assertTrue(cooc_vectorizer_path.exists())
Example #12
    def test_should_build_training_data_with_noise(self,
                                                   mocked_augment_utterances,
                                                   mocked_get_noise):
        # Given
        mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
        mocked_get_noise.return_value = mocked_noises
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances

        num_intents = 3
        utterances_length = 5
        num_queries_per_intent = 3
        fake_utterance = {
            "data": [{
                "text": " ".join("1" for _ in range(utterances_length))
            }]
        }
        dataset = {
            "intents": {
                str(i): {
                    "utterances": [fake_utterance] * num_queries_per_intent
                }
                for i in range(num_intents)
            }
        }
        random_state = np.random.RandomState(1)

        # When
        np.random.seed(42)
        noise_factor = 2
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor,
            unknown_word_prob=0,
            unknown_words_replacement_string=None)
        utterances, _, intent_mapping = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, random_state)

        # Then
        expected_utterances = [
            utterance for intent in itervalues(dataset[INTENTS])
            for utterance in intent[UTTERANCES]
        ]
        np.random.seed(42)
        noise = list(mocked_noises)
        noise_size = int(min(noise_factor * num_queries_per_intent,
                             len(noise)))
        noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                                random_state)
        noisy_utterances = [
            text_to_utterance(next(noise_it)) for _ in range(noise_size)
        ]
        expected_utterances += noisy_utterances
        expected_intent_mapping = sorted(dataset["intents"])
        expected_intent_mapping.append(None)
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)
Example #13
    def test_training_should_be_reproducible(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        utterances = [
            text_to_utterance("please make me two hots cups of tea"),
            text_to_utterance("i want a cup of coffee"),
        ]
        classes = np.array([0, 1])
        shared = self.get_shared_data(dataset)
        shared["random_state"] = 42

        # When
        featurizer1 = Featurizer(**shared)
        featurizer1.fit(dataset, utterances, classes, max(classes))

        featurizer2 = Featurizer(**shared)
        featurizer2.fit(dataset, utterances, classes, max(classes))

        # Then
        with temp_dir() as tmp_dir:
            dir_featurizer1 = tmp_dir / "featurizer1"
            dir_featurizer2 = tmp_dir / "featurizer2"
            featurizer1.persist(dir_featurizer1)
            featurizer2.persist(dir_featurizer2)
            hash1 = dirhash(str(dir_featurizer1), 'sha256')
            hash2 = dirhash(str(dir_featurizer2), 'sha256')
            self.assertEqual(hash1, hash2)
Example #14
    def test_limit_vocabulary_should_raise(self):
        # Given
        vectorizer = TfidfVectorizer()
        dataset = {"language": "en", "entities": dict(), "intents": dict()}
        utterances = [text_to_utterance("5 55 6 66 666")]

        vectorizer.fit(utterances, dataset)

        # When / Then
        kept_indexes = ["7", "8"]
        with self.assertRaises(ValueError):
            vectorizer.limit_vocabulary(kept_indexes)
Example #15
    def test_fit_unordered(self, mocked_preprocess):
        t = "a b c d e f"
        u = text_to_utterance(t)
        builtin_ents = [
            {
                "value": "e",
                "resolved_value": "e",
                "range": {
                    "start": 8,
                    "end": 9
                },
                "entity_kind": "the_snips_e_entity"
            }
        ]
        custom_ents = [
            {
                "value": "c",
                "resolved_value": "c",
                "range": {
                    "start": 4,
                    "end": 5
                },
                "entity_kind": "the_c_entity"
            }
        ]
        mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False,
            keep_order=False,
        )
        dataset = get_empty_dataset("en")
        shared = self.get_shared_data(dataset)

        # When
        expected_pairs = {
            ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
            ("THE_C_ENTITY", "a"): 1,
            ("THE_C_ENTITY", "d"): 2,
            ("THE_C_ENTITY", "f"): 3,
            ("THE_SNIPS_E_ENTITY", "a"): 4,
            ("THE_SNIPS_E_ENTITY", "d"): 5,
            ("THE_SNIPS_E_ENTITY", "f"): 6,
            ("a", "d"): 7,
            ("d", "f"): 8,
        }
        vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset)

        # Then
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
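
The expected pairs are consistent with a simple windowing scheme: after entity enrichment, the unknown-words replacement string ("b") is dropped, each remaining token is paired with the window_size tokens that follow it, and with keep_order=False each pair is sorted lexicographically. A sketch of that assumed scheme, which reproduces the nine pairs above:

    def cooccurrence_pairs(tokens, window_size, keep_order=True):
        # Pair each token with the `window_size` tokens that follow it.
        pairs = set()
        for i, token in enumerate(tokens):
            for other in tokens[i + 1:i + 1 + window_size]:
                pair = (token, other) if keep_order \
                    else tuple(sorted((token, other)))
                pairs.add(pair)
        return {pair: index for index, pair in enumerate(sorted(pairs))}

    # "b" has already been removed as the replacement string:
    tokens = ["a", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"]
    assert len(cooccurrence_pairs(tokens, window_size=3,
                                  keep_order=False)) == 9
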
Example #16
    def test_fit_transform(self, mocked_preprocess):
        t = "a b c d e f"
        u = text_to_utterance(t)
        builtin_ents = [
            {
                "value": "e",
                "resolved_value": "e",
                "range": {
                    "start": 8,
                    "end": 9
                },
                "entity_kind": "the_snips_e_entity"
            }
        ]
        custom_ents = [
            {
                "value": "c",
                "resolved_value": "c",
                "range": {
                    "start": 4,
                    "end": 5
                },
                "entity_kind": "the_c_entity"
            }
        ]
        mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False
        )

        dataset = get_empty_dataset("en")

        builtin_parser = EntityParserMock({t: builtin_ents})
        custom_parser = EntityParserMock({t: custom_ents})
        resources = {STOP_WORDS: set()}
        vectorizer1 = CooccurrenceVectorizer(
            config, builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser, resources=resources)
        vectorizer2 = CooccurrenceVectorizer(
            config, builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser, resources=resources)

        # When
        x = [u]
        x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist()
        x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist()

        # Then
        self.assertListEqual(x_0, x_1)
Example #17
    def test_add_unknown_word_to_utterances_with_zero_max_unknownword(self):
        # Given
        utterances = [text_to_utterance("yo")]
        replacement_string = "yo"
        unknown_word_prob = 1
        max_unknown_words = 0
        random_state = np.random.RandomState(1)

        # When / Then
        with self.fail_if_exception("Failed to augment utterances with "
                                    "max_unknown_words=0"):
            add_unknown_word_to_utterances(utterances, replacement_string,
                                           unknown_word_prob,
                                           max_unknown_words, random_state)
Example #18
    def test_empty_vocabulary_should_fit_and_return_none_intent(
            self, mocked_build_training):
        # Given
        language = LANGUAGE_EN
        dataset = {
            "entities": {
                "dummy_entity_1": {
                    "automatically_extensible": True,
                    "use_synonyms": False,
                    "data": [
                        {
                            "value": "...",
                            "synonyms": [],
                        }
                    ],
                    "matching_strictness": 1.0
                }
            },
            "intents": {
                "dummy_intent_1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "...",
                                    "slot_name": "dummy_slot_name",
                                    "entity": "dummy_entity_1"
                                }
                            ]
                        }
                    ]
                }
            },
            "language": language
        }
        dataset = validate_and_format_dataset(dataset)

        text = " "
        noise_size = 6
        utterances = [text] + [text] * noise_size
        utterances = [text_to_utterance(t) for t in utterances]
        labels = [0] + [1] * noise_size
        intent_list = ["dummy_intent_1", None]
        mocked_build_training.return_value = utterances, labels, intent_list

        # When / Then
        intent_classifier = LogRegIntentClassifier().fit(dataset)
        intent = intent_classifier.get_intent("no intent there")
        self.assertEqual(None, intent)
Example #19
    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input text
            intents_filter (str or list of str): When defined, the most
                likely intent is searched within the given filter; otherwise,
                within the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            NotTrained: When the intent classifier is not fitted

        """
        if not self.fitted:
            raise NotTrained('LogRegIntentClassifier must be fitted')

        if isinstance(intents_filter, str):
            intents_filter = [intents_filter]

        if not text or not self.intent_list \
                or self.featurizer is None or self.classifier is None:
            return None

        if len(self.intent_list) == 1:
            if self.intent_list[0] is None:
                return None
            return intent_classification_result(self.intent_list[0], 1.0)

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X, intents_filter=intents_filter)
        intents_probas = sorted(zip(self.intent_list, proba_vec[0]),
                                key=lambda p: -p[1])
        for intent, proba in intents_probas:
            if intent is None:
                return None
            if intents_filter is None or intent in intents_filter:
                return intent_classification_result(intent, proba)
        return None
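
A hypothetical usage sketch of get_intent; the dataset and intent name are placeholders borrowed from Example #13, not part of this snippet's source:

    # Assumes `dataset` is the MakeTea/MakeCoffee dataset from Example #13.
    classifier = LogRegIntentClassifier().fit(dataset)
    result = classifier.get_intent("brew two cups of coffee",
                                   intents_filter=["MakeCoffee"])
    if result is not None:
        print(result)  # intent classification result with its probability
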
Example #20
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
            text_to_utterance("Bird birdy"),
        ]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language
        vectorizer.builtin_entity_scope = {"snips/number"}

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        u_0 = {"data": [{"text": "hello world entity_2"}]}

        u_1 = {"data": [{"text": "beauty world ent 1"}]}

        u_2 = {"data": [{"text": "bird bird"}]}

        u_3 = {"data": [{"text": "bird bird"}]}

        ent_0 = {
            "entity_kind": "entity_2",
            "value": "entity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            },
        }

        expected_data = [
            (u_0, [num_0], [ent_0], []),
            (u_1, [num_1], [ent_11, ent_12], ["cluster_1", "cluster_3"]),
            (u_2, [], [], []),
            (u_3, [], [], ["cluster_2"]),
        ]

        self.assertSequenceEqual(expected_data, processed_data)
Example #21
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
    """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
        u_1 = text_to_utterance("beauTiful World entity 1")
        u_2 = text_to_utterance("Bird bïrdy")
        u_3 = text_to_utterance("Bird birdy")
        utterances = [u_0, u_1, u_2, u_3]

        vectorizer = CooccurrenceVectorizer(
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)

        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        ent_0 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            }
        }

        expected_data = [
            (u_0, [num_0], [ent_0]),
            (u_1, [num_1], [ent_11, ent_12]),
            (u_2, [], []),
            (u_3, [], []),
        ]

        self.assertSequenceEqual(expected_data, processed_data)
Example #22
    def test_enrich_utterance(self):
        # Given
        utterances = [
            {
                "data": [
                    {
                        "text": "one",
                        "entity": "snips/number"
                    },
                    {
                        "text": "beauty world",
                    },
                    {
                        "text": "ent 1",
                        "entity": "dummy_entity_1"
                    },
                ]
            },
            text_to_utterance("one beauty world ent 1"),
            text_to_utterance("hello world entity_2"),
            text_to_utterance("bird bird"),
        ]

        builtin_ents = [
            [{
                "value": "one",
                "resolved_value": 1,
                "range": {
                    "start": 0,
                    "end": 3
                },
                "entity_kind": "snips/number"
            }],
            [{
                "value": "one",
                "resolved_value": 1,
                "range": {
                    "start": 0,
                    "end": 3
                },
                "entity_kind": "snips/number"
            }, {
                "value": "1",
                "resolved_value": 1,
                "range": {
                    "start": 27,
                    "end": 28
                },
                "entity_kind": "snips/number"
            }],
            [{
                "value": "2",
                "resolved_value": 2,
                "range": {
                    "start": 19,
                    "end": 20
                },
                "entity_kind": "snips/number"
            }],
            [],
        ]

        custom_ents = [
            [{
                "value": "ent 1",
                "resolved_value": "entity 1",
                "range": {
                    "start": 20,
                    "end": 28
                },
                "entity_kind": "dummy_entity_1"
            }],
            [{
                "value": "ent 1",
                "resolved_value": "entity 1",
                "range": {
                    "start": 20,
                    "end": 28
                },
                "entity_kind": "dummy_entity_1"
            }],
            [{
                "value": "entity_2",
                "resolved_value": "Éntity_2",
                "range": {
                    "start": 12,
                    "end": 20
                },
                "entity_kind": "dummy_entity_2"
            }],
            [],
        ]

        w_clusters = [["111", "112"], ["111", "112"], [], []]

        vectorizer = TfidfVectorizer()
        vectorizer._language = "en"

        # When
        enriched_utterances = [
            vectorizer._enrich_utterance(*data)
            for data in zip(utterances, builtin_ents, custom_ents, w_clusters)
        ]

        # Then
        expected_u0 = "beauty world ent 1 " \
                      "builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_1 111 112"

        expected_u1 = "one beauty world ent 1 " \
                      "builtinentityfeaturesnipsnumber " \
                      "builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_1 111 112"

        expected_u2 = "hello world entity_2 builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_2"

        expected_u3 = "bird bird"

        expected_utterances = [
            expected_u0, expected_u1, expected_u2, expected_u3
        ]

        self.assertEqual(expected_utterances, enriched_utterances)
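
The expected strings follow a recognizable naming convention: each builtin entity match contributes a "builtinentityfeature" token and each custom entity match an "entityfeature" token, both derived from the entity kind. A sketch of that assumed convention, inferred from the expected output above:

    def builtin_entity_feature(entity_kind):
        # Assumed: "snips/number" -> "builtinentityfeaturesnipsnumber"
        return "builtinentityfeature" + entity_kind.replace("/", "")

    def custom_entity_feature(entity_kind):
        # Assumed: "dummy_entity_1" -> "entityfeaturedummy_entity_1"
        return "entityfeature" + entity_kind
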
Example #23
    def test_transform(self):
        # Given
        config = CooccurrenceVectorizerConfig(
            filter_stop_words=True,
            window_size=3,
            unknown_words_replacement_string="d")

        t_0 = "yo a b c d e f yo"
        t_1 = "yo a b c d e"
        u_0 = text_to_utterance(t_0)
        u_1 = text_to_utterance(t_1)

        resources = {STOP_WORDS: {"b"}}

        builtin_ents = [{
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 11,
                "end": 12
            },
            "entity_kind": "the_snips_e_entity"
        }]
        custom_ents = [{
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 7,
                "end": 8
            },
            "entity_kind": "the_c_entity"
        }]

        builtin_parser = EntityParserMock({
            t_0: builtin_ents,
            t_1: builtin_ents
        })
        custom_parser = EntityParserMock({t_0: custom_ents, t_1: custom_ents})

        vectorizer = CooccurrenceVectorizer(
            config,
            builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser,
            resources=resources)

        vectorizer._language = "en"
        vectorizer._word_pairs = {
            ("THE_SNIPS_E_ENTITY", "f"): 0,
            ("a", "THE_C_ENTITY"): 1,
            ("a", "THE_SNIPS_E_ENTITY"): 2,
            ("b", "THE_SNIPS_E_ENTITY"): 3,
            ("yo", "yo"): 4,
            ("d", "THE_SNIPS_E_ENTITY"): 5
        }

        data = [u_0, u_1]

        # When
        x = vectorizer.transform(data)

        # Then
        expected = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]]
        self.assertEqual(expected, x.todense().tolist())
Example #24
    def test_preprocess_utterances(self, mocked_stem, mocked_word_cluster):
        # Given
        language = LANGUAGE_EN

        def _stem(t):
            if t == "beautiful":
                s = "beauty"
            elif t == "birdy":
                s = "bird"
            elif t == "entity":
                s = "ent"
            else:
                s = t
            return s

        def stem_function(text, language):
            return get_default_sep(language).join(
                [_stem(t) for t in tokenize_light(text, language)])

        mocked_word_cluster.return_value = {
            "beautiful": "cluster_1",
            "birdy": "cluster_2",
            "entity": "cluster_3"
        }

        mocked_stem.side_effect = stem_function

        dataset = {
            "intents": {
                "intent1": {
                    "utterances": []
                }
            },
            "entities": {
                "entity_1": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": ["alternative entity 1"]
                    }, {
                        "value": "éntity 1",
                        "synonyms": ["alternative entity 1"]
                    }],
                    "use_synonyms":
                    False,
                    "automatically_extensible":
                    False
                },
                "entity_2": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": []
                    }, {
                        "value": "Éntity 2",
                        "synonyms": ["Éntity_2", "Alternative entity 2"]
                    }],
                    "use_synonyms":
                    True,
                    "automatically_extensible":
                    False
                },
                "snips/number": {}
            },
            "language": "en",
        }

        dataset = validate_and_format_dataset(dataset)

        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
        ]

        labeled_utterance = {
            DATA: [{
                TEXT: "beauTiful éntity "
            }, {
                TEXT: "1",
                ENTITY: "snips/number",
                SLOT_NAME: "number"
            }, {
                TEXT: " bIrd Éntity_2"
            }]
        }
        utterances.append(labeled_utterance)
        labels = np.array([0, 0, 1, 1])

        featurizer = Featurizer(
            language,
            None,
            config=FeaturizerConfig(word_clusters_name="brown_clusters")).fit(
                dataset, utterances, labels)

        # When
        utterances = featurizer.preprocess_utterances(utterances)

        # Then
        expected_utterances = [
            "hello world entity_2 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_2",
            "beauty world ent 1 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_1 entityfeatureentity_2 "
            "cluster_1 cluster_3",
            "bird bird",
            "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
            "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
            "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
        ]

        self.assertListEqual(utterances, expected_utterances)
Example #25
    def test_should_be_serializable(self):
        # Given
        language = LANGUAGE_EN
        tfidf_vectorizer = _get_tfidf_vectorizer(language)

        pvalue_threshold = 0.42
        featurizer = Featurizer(language,
                                config=FeaturizerConfig(
                                    pvalue_threshold=pvalue_threshold,
                                    word_clusters_name="brown_clusters"),
                                unknown_words_replacement_string=None,
                                tfidf_vectorizer=tfidf_vectorizer)
        dataset = {
            "entities": {
                "entity2": {
                    "data": [{
                        "value": "entity1",
                        "synonyms": ["entity1"]
                    }],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "intents": {},
            "language": "en"
        }
        dataset = validate_and_format_dataset(dataset)

        utterances = [
            "hello world", "beautiful world", "hello here", "bird birdy",
            "beautiful bird"
        ]
        utterances = [text_to_utterance(u) for u in utterances]
        classes = np.array([0, 0, 0, 1, 1])

        featurizer.fit(dataset, utterances, classes)

        # When
        serialized_featurizer = featurizer.to_dict()

        # Then
        msg = "Featurizer dict should be json serializable to utf8."
        with self.fail_if_exception(msg):
            dumped = json_string(serialized_featurizer)

        msg = "SnipsNLUEngine should be deserializable from dict with unicode" \
              " values"
        with self.fail_if_exception(msg):
            _ = Featurizer.from_dict(json.loads(dumped))

        vocabulary = tfidf_vectorizer.vocabulary_
        # pylint: disable=W0212
        idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
        # pylint: enable=W0212

        best_features = featurizer.best_features
        entity_utterances_to_feature_names = {
            "entity1": ["entityfeatureentity2"]
        }

        expected_serialized = {
            "config": {
                'sublinear_tf': False,
                'pvalue_threshold': pvalue_threshold,
                'word_clusters_name': "brown_clusters"
            },
            "language_code": "en",
            "tfidf_vectorizer": {
                "idf_diag": idf_diag,
                "vocab": vocabulary
            },
            "best_features": best_features,
            "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
            "unknown_words_replacement_string": None
        }
        self.assertDictEqual(expected_serialized, serialized_featurizer)