Example #1
0
    def fit(self, dataset):
        """Train the intent classifier on a valid Snips dataset

        The dataset is validated and formatted, converted into training
        utterances and classes, featurized, and finally used to fit a
        :class:`SGDClassifier`. Training stops early, without building a
        classifier, when the intent list holds at most one entry or when
        fitting the featurizer returns ``None``.

        Args:
            dataset: The Snips dataset to train on

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        seeded_rng = check_random_state(self.config.random_seed)
        augmentation_config = self.config.data_augmentation_config

        utterances, classes, self.intent_list = build_training_data(
            dataset, language, augmentation_config, seeded_rng)

        # Nothing to discriminate between with a single entry (or none)
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            language,
            augmentation_config.unknown_words_replacement_string,
            self.config.featurizer_config)
        fitted_featurizer = self.featurizer.fit(dataset, utterances, classes)
        self.featurizer = fitted_featurizer
        if fitted_featurizer is None:
            return self

        features = self.featurizer.transform(utterances)
        alpha = get_regularization_factor(dataset)
        sgd_params = dict(LOG_REG_ARGS, random_state=seeded_rng, alpha=alpha)
        self.classifier = SGDClassifier(**sgd_params)
        self.classifier.fit(features, classes)

        # The message is only built if the debug level is actually enabled
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
Example #2
0
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        The dataset is validated, the resources and entity parsers are set up
        when needed, then training data is built, featurized and used to fit
        a class-weighted :class:`SGDClassifier`. Training stops early, without
        building a classifier, when the intent list holds at most one entry
        or when the featurizer finds no usable utterances (in which case
        ``self.featurizer`` is reset to ``None``).

        Args:
            dataset: The Snips dataset to train on

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        # Function-scope imports, following the existing sklearn imports
        import numpy as np
        from sklearn.linear_model import SGDClassifier
        from sklearn.utils import compute_class_weight

        logger.info("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            self.random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        self.featurizer.language = language

        # The highest class index is the one used for the noise class
        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(dataset, utterances, classes,
                                              none_class)
        except _EmptyDatasetUtterancesError:
            logger.warning("No (non-empty) utterances found in dataset")
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)

        # `classes` and `y` are passed by keyword, and the labels as an
        # ndarray: recent scikit-learn releases reject positional arguments
        # and non-array `classes`, while older releases accept this form too
        class_weights_arr = compute_class_weight(
            "balanced", classes=np.arange(none_class + 1), y=classes)
        # Re-weight the noise class
        class_weights_arr[-1] *= self.config.noise_reweight_factor
        class_weight = dict(enumerate(class_weights_arr))

        self.classifier = SGDClassifier(random_state=self.random_state,
                                        alpha=alpha,
                                        class_weight=class_weight,
                                        **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        # The message is only built if the debug level is actually enabled
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
    def test_should_build_training_data_with_noise(self,
                                                   mocked_augment_utterances,
                                                   mocked_get_noise):
        """Check the training data built when a noise factor is configured

        The utterances returned by ``build_training_data`` are expected to be
        the dataset utterances followed by the noise utterances, and the
        intent mapping the sorted intent names followed by ``None``.
        """
        # Given
        num_intents = 3
        utterances_length = 5
        num_queries_per_intent = 3
        noise_factor = 2

        mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
        mocked_get_noise.return_value = mocked_noises
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances

        fake_text = " ".join(["1"] * utterances_length)
        fake_utterance = {"data": [{"text": fake_text}]}
        dataset = {"intents": {}}
        for intent_idx in range(num_intents):
            dataset["intents"][str(intent_idx)] = {
                "utterances": [fake_utterance] * num_queries_per_intent
            }
        random_state = np.random.RandomState(1)

        # When
        np.random.seed(42)
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor,
            unknown_word_prob=0,
            unknown_words_replacement_string=None)
        training_data = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, random_state)
        utterances, _, intent_mapping = training_data

        # Then
        expected_utterances = []
        for intent in itervalues(dataset[INTENTS]):
            expected_utterances.extend(intent[UTTERANCES])

        np.random.seed(42)
        # The amount of noise is capped by the number of available noises
        noise_size = int(min(noise_factor * num_queries_per_intent,
                             len(mocked_noises)))
        noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                                random_state)
        for _ in range(noise_size):
            expected_utterances.append(text_to_utterance(next(noise_it)))

        expected_intent_mapping = sorted(dataset["intents"]) + [None]
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)
    def test_should_build_training_data_with_noise(
            self, mocked_augment_utterances, mocked_get_noises):
        """Check the training data built when a noise factor is configured

        The utterances returned by ``build_training_data`` are expected to be
        the texts of the dataset utterances followed by the noise texts, and
        the intent mapping the sorted intent names followed by ``None``.
        """
        # Given
        num_intents = 3
        utterances_length = 5
        num_queries_per_intent = 3
        noise_factor = 2

        mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
        mocked_get_noises.return_value = mocked_noises
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances

        fake_text = " ".join(["1"] * utterances_length)
        fake_utterance = {"data": [{"text": fake_text}]}
        dataset = {"intents": {}}
        for intent_idx in range(num_intents):
            dataset["intents"][str(intent_idx)] = {
                "utterances": [fake_utterance] * num_queries_per_intent
            }
        random_state = np.random.RandomState(1)

        # When
        np.random.seed(42)
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=noise_factor, unknown_word_prob=0,
            unknown_words_replacement_string=None)
        training_data = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, random_state)
        utterances, _, intent_mapping = training_data

        # Then
        expected_utterances = []
        for intent in itervalues(dataset[INTENTS]):
            for utterance in intent[UTTERANCES]:
                expected_utterances.append(
                    get_text_from_chunks(utterance[DATA]))

        np.random.seed(42)
        # The amount of noise is capped by the number of available noises
        noise_size = int(min(noise_factor * num_queries_per_intent,
                             len(mocked_noises)))
        noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                                random_state)
        for _ in range(noise_size):
            expected_utterances.append(next(noise_it))

        expected_intent_mapping = sorted(dataset["intents"]) + [None]
        self.assertListEqual(utterances, expected_utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)
Example #5
0
    def test_should_build_training_data_with_no_data(self):
        """An empty dataset must yield no utterances and no intent mapping"""
        # Given
        language = LANGUAGE_EN
        empty_dataset = get_empty_dataset(language)
        dataset = validate_and_format_dataset(empty_dataset)
        random_state = np.random.RandomState(1)

        # When
        classifier_config = LogRegIntentClassifierConfig()
        data_augmentation_config = classifier_config.data_augmentation_config
        training_data = build_training_data(
            dataset, language, data_augmentation_config, random_state)
        utterances, _, intent_mapping = training_data

        # Then
        self.assertListEqual(utterances, [])
        self.assertListEqual(intent_mapping, [])
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        The dataset is validated, the resources and entity parsers are set up
        when needed, then training data is built, featurized and used to fit
        a :class:`SGDClassifier`. Training stops early, without building a
        classifier, when the intent list holds at most one entry or when the
        featurizer finds no usable utterances (in which case
        ``self.featurizer`` is reset to ``None``).

        Args:
            dataset: The Snips dataset to train on

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.info("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            self.random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        self.featurizer.language = language

        # The highest class index is handed to the featurizer as the
        # "none" class
        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(dataset, utterances, classes,
                                              none_class)
        except _EmptyDatasetUtterancesError:
            # Leave a trace instead of silently ending up with an untrained
            # classifier
            logger.warning("No (non-empty) utterances found in dataset")
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(random_state=self.random_state,
                                        alpha=alpha,
                                        **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        # The message is only built if the debug level is actually enabled
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
    def test_should_build_training_data_with_no_data(self):
        """Building training data from an empty dataset gives empty outputs"""
        # Given
        language = LANGUAGE_EN
        dataset = get_empty_dataset(language)
        dataset = validate_and_format_dataset(dataset)
        random_state = np.random.RandomState(1)

        # When
        config = LogRegIntentClassifierConfig()
        utterances, _, intent_mapping = build_training_data(
            dataset, language, config.data_augmentation_config, random_state)

        # Then
        no_utterances, no_intents = [], []
        self.assertListEqual(utterances, no_utterances)
        self.assertListEqual(intent_mapping, no_intents)
Example #8
0
    def test_should_build_training_data_with_no_stemming_no_noise(
            self, mocked_augment_utterances):
        """Without noise, training utterances are the dataset utterances

        With ``noise_factor=0`` the utterances returned by
        ``build_training_data`` are expected to match the utterances of the
        validated sample dataset, and the intent mapping its two intents.
        """
        # Given
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances
        random_state = np.random.RandomState(1)

        # When
        no_noise_config = IntentClassifierDataAugmentationConfig(
            noise_factor=0)
        training_data = build_training_data(
            dataset, LANGUAGE_EN, no_noise_config, random_state)
        utterances, _, intent_mapping = training_data

        # Then
        expected_utterances = []
        for intent in itervalues(dataset[INTENTS]):
            expected_utterances.extend(intent[UTTERANCES])
        expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(expected_intent_mapping, intent_mapping)
    def test_should_build_training_data_with_no_stemming_no_noise(
            self, mocked_augment_utterances):
        """Without noise, training utterances are the dataset texts

        With ``noise_factor=0`` the utterances returned by
        ``build_training_data`` are expected to match the texts of the sample
        dataset utterances, and the intent mapping its two intents.
        """
        # Given
        dataset = SAMPLE_DATASET
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances
        random_state = np.random.RandomState(1)

        # When
        no_noise_config = IntentClassifierDataAugmentationConfig(
            noise_factor=0)
        training_data = build_training_data(
            dataset, LANGUAGE_EN, no_noise_config, random_state)
        utterances, _, intent_mapping = training_data

        # Then
        expected_utterances = []
        for intent in itervalues(dataset[INTENTS]):
            for utterance in intent[UTTERANCES]:
                expected_utterances.append(
                    get_text_from_chunks(utterance[DATA]))
        expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']
        self.assertListEqual(utterances, expected_utterances)
        self.assertListEqual(expected_intent_mapping, intent_mapping)
    def test_should_build_training_data_with_no_noise(
            self, mocked_augment_utterances):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up

---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        resources = self.get_resources(dataset[LANGUAGE])
        mocked_augment_utterances.side_effect = get_mocked_augment_utterances
        random_state = np.random.RandomState(1)

        # When
        data_augmentation_config = IntentClassifierDataAugmentationConfig(
            noise_factor=0)
        utterances, _, intent_mapping = build_training_data(
            dataset, LANGUAGE_EN, data_augmentation_config, resources,
            random_state)

        # Then
        expected_utterances = [
            utterance for _, intent in sorted(iteritems(dataset[INTENTS]))
            for utterance in intent[UTTERANCES]
        ]
        expected_intent_mapping = ["my_first_intent", "my_second_intent"]
        self.assertListEqual(expected_utterances, utterances)
        self.assertListEqual(expected_intent_mapping, intent_mapping)