def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
    # Given
    vectorizer_config = CooccurrenceVectorizerConfig(
        filter_stop_words=False)
    config = FeaturizerConfig(
        added_cooccurrence_feature_ratio=.3,
        cooccurrence_vectorizer_config=vectorizer_config)
    featurizer = Featurizer(config)
    mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
    utterances = [
        text_to_utterance("a b c d e"),
        text_to_utterance("f g h i j"),
        text_to_utterance("none"),
    ]
    mocked_vectorizer = MagicMock()
    mocked_vectorizer.idf_diag = range(10)

    featurizer.tfidf_vectorizer = mocked_vectorizer
    classes = [0, 0, 1]

    # When
    mocked_chi2.return_value = (
        None, [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] + [1.0 for _ in range(100)])
    featurizer._fit_cooccurrence_vectorizer(
        utterances, classes, 1, mocked_dataset)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
    self.assertDictEqual(
        expected_pairs, featurizer.cooccurrence_vectorizer.word_pairs)
def fit(self, dataset):
    """Fit the intent classifier with a valid Snips dataset

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    logger.debug("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    random_state = check_random_state(self.config.random_seed)

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        language,
        data_augmentation_config.unknown_words_replacement_string,
        self.config.featurizer_config)
    self.featurizer = self.featurizer.fit(dataset, utterances, classes)
    if self.featurizer is None:
        return self

    X = self.featurizer.transform(utterances)  # pylint: disable=C0103
    alpha = get_regularization_factor(dataset)
    self.classifier = SGDClassifier(random_state=random_state,
                                    alpha=alpha, **LOG_REG_ARGS)
    self.classifier.fit(X, classes)
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
def test_featurizer_should_exclude_replacement_string(self):
    # Given
    language = LANGUAGE_EN
    dataset = {
        "entities": {
            "dummy1": {
                "utterances": {
                    "unknownword": "unknownword",
                    "what": "what"
                }
            }
        }
    }
    replacement_string = "unknownword"
    featurizer = Featurizer(
        language,
        unknown_words_replacement_string=replacement_string,
        config=FeaturizerConfig())
    utterances = [text_to_utterance("hello dude")]
    y = np.array([1])

    # When
    featurizer.fit(dataset, utterances, y)

    # Then
    self.assertNotIn(replacement_string,
                     featurizer.entity_utterances_to_feature_names)
def test_should_be_serializable_before_fit(self):
    # Given
    pvalue_threshold = 0.42
    config = FeaturizerConfig(
        pvalue_threshold=pvalue_threshold,
        added_cooccurrence_feature_ratio=0.2)
    featurizer = Featurizer(config=config)

    # When
    featurizer.persist(self.tmp_file_path)

    # Then
    expected_featurizer_dict = {
        "language_code": None,
        "tfidf_vectorizer": None,
        "cooccurrence_vectorizer": None,
        "config": config.to_dict()
    }
    featurizer_dict_path = self.tmp_file_path / "featurizer.json"
    self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

    expected_metadata = {"unit_name": "featurizer"}
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, expected_metadata)

    tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
    self.assertFalse(tfidf_vectorizer_path.exists())

    cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
    self.assertFalse(cooc_vectorizer_path.exists())
def fit(self, dataset):
    """Fits the intent classifier with a valid Snips dataset

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils import compute_class_weight

    logger.info("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    self.load_resources_if_needed(dataset[LANGUAGE])
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    language = dataset[LANGUAGE]

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, self.resources,
        self.random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        config=self.config.featurizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources,
        random_state=self.random_state,
    )
    self.featurizer.language = language

    none_class = max(classes)
    try:
        x = self.featurizer.fit_transform(
            dataset, utterances, classes, none_class)
    except _EmptyDatasetUtterancesError:
        logger.warning("No (non-empty) utterances found in dataset")
        self.featurizer = None
        return self

    alpha = get_regularization_factor(dataset)

    class_weights_arr = compute_class_weight(
        "balanced", range(none_class + 1), classes)
    # Re-weight the noise class
    class_weights_arr[-1] *= self.config.noise_reweight_factor
    class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}

    self.classifier = SGDClassifier(
        random_state=self.random_state, alpha=alpha,
        class_weight=class_weight, **LOG_REG_ARGS)
    self.classifier.fit(x, classes)
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
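# Illustrative sketch (not part of the snips-nlu source): how the "balanced"
# class weights used in fit() behave, and how the noise (None) class is then
# re-weighted by noise_reweight_factor. Class labels and the factor value
# below are made up for the example.
import numpy as np

classes = np.array([0, 0, 0, 1, 2, 2])  # class 2 plays the None/noise role
noise_reweight_factor = 0.5             # hypothetical config value

n_samples = len(classes)
n_classes = len(np.unique(classes))
counts = np.bincount(classes)
# "balanced" weighting: n_samples / (n_classes * class_count), which mirrors
# sklearn.utils.class_weight.compute_class_weight("balanced", ...)
class_weights_arr = n_samples / (n_classes * counts)
# down-weight the noise class, which is always the last one
class_weights_arr[-1] *= noise_reweight_factor
class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}
print(class_weight)  # {0: 0.666..., 1: 2.0, 2: 0.5}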
def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                  mocked_tfidf_load):
    # Given
    mocked_tfidf_load.return_value = "tfidf_vectorizer"
    mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

    language = LANGUAGE_EN
    config = FeaturizerConfig()

    featurizer_dict = {
        "language_code": language,
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }

    self.tmp_file_path.mkdir()
    featurizer_path = self.tmp_file_path / "featurizer.json"
    with featurizer_path.open("w", encoding="utf-8") as f:
        f.write(json_string(featurizer_dict))

    # When
    featurizer = Featurizer.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(language, featurizer.language)
    self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
    self.assertEqual("cooccurrence_vectorizer",
                     featurizer.cooccurrence_vectorizer)
    self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
def test_should_be_deserializable(self, mock_from_dict):
    # Given
    mocked_featurizer = Featurizer(LANGUAGE_EN, None)
    mock_from_dict.return_value = mocked_featurizer

    intent_list = ["MakeCoffee", "MakeTea", None]

    coeffs = [
        [1.23, 4.5],
        [6.7, 8.90],
        [1.01, 2.345],
    ]

    intercept = [
        0.34,
        0.41,
        -0.98
    ]

    t_ = 701.

    config = LogRegIntentClassifierConfig().to_dict()

    classifier_dict = {
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": t_,
        "intent_list": intent_list,
        "config": config,
        "featurizer": mocked_featurizer.to_dict(),
    }
    self.tmp_file_path.mkdir()
    metadata = {"unit_name": "log_reg_intent_classifier"}
    self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.writeJsonContent(self.tmp_file_path / "intent_classifier.json",
                          classifier_dict)

    # When
    classifier = LogRegIntentClassifier.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(classifier.intent_list, intent_list)
    self.assertIsNotNone(classifier.featurizer)
    self.assertListEqual(classifier.classifier.coef_.tolist(), coeffs)
    self.assertListEqual(classifier.classifier.intercept_.tolist(),
                         intercept)
    self.assertDictEqual(classifier.config.to_dict(), config)
def fit(self, dataset):
    """Fits the intent classifier with a valid Snips dataset

    Returns:
        :class:`LogRegIntentClassifier`: The same instance, trained
    """
    logger.info("Fitting LogRegIntentClassifier...")
    dataset = validate_and_format_dataset(dataset)
    self.load_resources_if_needed(dataset[LANGUAGE])
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    language = dataset[LANGUAGE]

    data_augmentation_config = self.config.data_augmentation_config
    utterances, classes, intent_list = build_training_data(
        dataset, language, data_augmentation_config, self.resources,
        self.random_state)

    self.intent_list = intent_list
    if len(self.intent_list) <= 1:
        return self

    self.featurizer = Featurizer(
        config=self.config.featurizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources,
        random_state=self.random_state,
    )
    self.featurizer.language = language

    none_class = max(classes)
    try:
        x = self.featurizer.fit_transform(
            dataset, utterances, classes, none_class)
    except _EmptyDatasetUtterancesError:
        self.featurizer = None
        return self

    alpha = get_regularization_factor(dataset)
    self.classifier = SGDClassifier(random_state=self.random_state,
                                    alpha=alpha, **LOG_REG_ARGS)
    self.classifier.fit(x, classes)
    logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
    return self
def test_feature_index_to_feature_name(self):
    # Given
    config = FeaturizerConfig(added_cooccurrence_feature_ratio=.75)
    featurizer = Featurizer(config=config)

    # When
    mocked_cooccurrence_vectorizer = MagicMock()
    mocked_cooccurrence_vectorizer.word_pairs = {("a", "b"): 0}

    mocked_tfidf_vectorizer = MagicMock()
    mocked_tfidf_vectorizer.vocabulary = {"a": 0}

    featurizer.cooccurrence_vectorizer = mocked_cooccurrence_vectorizer
    featurizer.tfidf_vectorizer = mocked_tfidf_vectorizer

    # Then
    expected = {0: "ngram:a", 1: "pair:a+b"}
    self.assertDictEqual(expected, featurizer.feature_index_to_feature_name)
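# Illustrative sketch (not part of the snips-nlu source): how the
# feature_index_to_feature_name mapping checked above is laid out. Tf-idf
# n-gram features come first, indexed by the vectorizer vocabulary, and
# co-occurrence pair features follow, offset by the vocabulary size. The
# vocabulary and word pairs below are made up.
tfidf_vocabulary = {"a": 0, "b": 1}
word_pairs = {("a", "b"): 0}

index_to_name = {i: "ngram:%s" % w for w, i in tfidf_vocabulary.items()}
offset = len(index_to_name)
index_to_name.update(
    {offset + i: "pair:%s+%s" % pair for pair, i in word_pairs.items()})
print(index_to_name)  # {0: 'ngram:a', 1: 'ngram:b', 2: 'pair:a+b'}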
def test_should_be_deserializable(self, mock_from_dict):
    # Given
    mocked_featurizer = Featurizer(LANGUAGE_EN, None)
    mock_from_dict.return_value = mocked_featurizer

    intent_list = ["MakeCoffee", "MakeTea", None]

    coeffs = [
        [1.23, 4.5],
        [6.7, 8.90],
        [1.01, 2.345],
    ]

    intercept = [
        0.34,
        0.41,
        -0.98
    ]

    t_ = 701.

    config = LogRegIntentClassifierConfig().to_dict()

    classifier_dict = {
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": t_,
        "intent_list": intent_list,
        "config": config,
        "featurizer": mocked_featurizer.to_dict(),
    }

    # When
    classifier = LogRegIntentClassifier.from_dict(classifier_dict)

    # Then
    self.assertEqual(classifier.intent_list, intent_list)
    self.assertIsNotNone(classifier.featurizer)
    self.assertListEqual(classifier.classifier.coef_.tolist(), coeffs)
    self.assertListEqual(classifier.classifier.intercept_.tolist(),
                         intercept)
    self.assertDictEqual(classifier.config.to_dict(), config)
def test_fit_with_no_utterance_should_raise(self):
    # Given
    utterances = []
    classes = []
    dataset = get_empty_dataset("en")

    # When/Then
    with self.assertRaises(_EmptyDatasetUtterancesError) as ctx:
        Featurizer().fit_transform(dataset, utterances, classes, None)
    self.assertEqual("Tokenized utterances are empty", str(ctx.exception))
def test_should_be_deserializable(self):
    # Given
    language = LANGUAGE_EN
    idf_diag = [1.52, 1.21, 1.04]
    vocabulary = {"hello": 0, "beautiful": 1, "world": 2}

    best_features = [0, 1]
    config = {
        "pvalue_threshold": 0.4,
        "sublinear_tf": False,
        "word_clusters_name": "brown_clusters"
    }

    entity_utterances_to_feature_names = {
        "entity_1": ["entityfeatureentity_1"]
    }

    featurizer_dict = {
        "config": config,
        "language_code": language,
        "tfidf_vectorizer": {
            "idf_diag": idf_diag,
            "vocab": vocabulary
        },
        "best_features": best_features,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }

    # When
    featurizer = Featurizer.from_dict(featurizer_dict)

    # Then
    self.assertEqual(featurizer.language, language)
    # pylint: disable=W0212
    self.assertListEqual(
        featurizer.tfidf_vectorizer._tfidf._idf_diag.data.tolist(),
        idf_diag)
    # pylint: enable=W0212
    self.assertDictEqual(featurizer.tfidf_vectorizer.vocabulary_, vocabulary)
    self.assertListEqual(featurizer.best_features, best_features)
    self.assertEqual(config, featurizer.config.to_dict())
    self.assertDictEqual(
        featurizer.entity_utterances_to_feature_names,
        {
            k: set(v)
            for k, v in iteritems(entity_utterances_to_feature_names)
        })
def from_path(cls, path, **shared):
    """Loads a :class:`LogRegIntentClassifier` instance from a path

    The data at the given path must have been generated using
    :func:`~LogRegIntentClassifier.persist`
    """
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    path = Path(path)
    model_path = path / "intent_classifier.json"
    if not model_path.exists():
        raise LoadingError("Missing intent classifier model file: %s"
                           % model_path.name)

    with model_path.open(encoding="utf8") as f:
        model_dict = json.load(f)

    # Create the classifier
    config = LogRegIntentClassifierConfig.from_dict(model_dict["config"])
    intent_classifier = cls(config=config, **shared)
    intent_classifier.intent_list = model_dict['intent_list']

    # Create the underlying SGD classifier
    sgd_classifier = None
    coeffs = model_dict['coeffs']
    intercept = model_dict['intercept']
    t_ = model_dict["t_"]
    if coeffs is not None and intercept is not None:
        sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
        sgd_classifier.coef_ = np.array(coeffs)
        sgd_classifier.intercept_ = np.array(intercept)
        sgd_classifier.t_ = t_
    intent_classifier.classifier = sgd_classifier

    # Add the featurizer
    featurizer = model_dict['featurizer']
    if featurizer is not None:
        featurizer_path = path / featurizer
        intent_classifier.featurizer = Featurizer.from_path(
            featurizer_path, **shared)

    return intent_classifier
def test_should_be_serializable(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    pvalue_threshold = 0.42
    config = FeaturizerConfig(
        pvalue_threshold=pvalue_threshold,
        added_cooccurrence_feature_ratio=0.2)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        text_to_utterance("this is the number"),
        text_to_utterance("yo")
    ]
    classes = np.array([0, 1])
    featurizer.fit(dataset, utterances, classes, max(classes))

    # When
    featurizer.persist(self.tmp_file_path)

    # Then
    expected_featurizer_dict = {
        "language_code": "en",
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }
    featurizer_dict_path = self.tmp_file_path / "featurizer.json"
    self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

    expected_metadata = {"unit_name": "featurizer"}
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, expected_metadata)

    tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
    self.assertTrue(tfidf_vectorizer_path.exists())

    cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
    self.assertTrue(cooc_vectorizer_path.exists())
def from_dict(cls, unit_dict):
    """Creates a :class:`LogRegIntentClassifier` instance from a dict

    The dict must have been generated with
    :func:`~LogRegIntentClassifier.to_dict`
    """
    config = LogRegIntentClassifierConfig.from_dict(unit_dict["config"])
    intent_classifier = cls(config=config)
    sgd_classifier = None
    coeffs = unit_dict['coeffs']
    intercept = unit_dict['intercept']
    t_ = unit_dict["t_"]
    if coeffs is not None and intercept is not None:
        sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
        sgd_classifier.coef_ = np.array(coeffs)
        sgd_classifier.intercept_ = np.array(intercept)
        sgd_classifier.t_ = t_
    intent_classifier.classifier = sgd_classifier
    intent_classifier.intent_list = unit_dict['intent_list']
    featurizer = unit_dict['featurizer']
    if featurizer is not None:
        intent_classifier.featurizer = Featurizer.from_dict(featurizer)
    return intent_classifier
class LogRegIntentClassifier(IntentClassifier):
    """Intent classifier which uses a Logistic Regression underneath"""

    config_type = LogRegIntentClassifierConfig

    def __init__(self, config=None, **shared):
        """The LogReg intent classifier can be configured by passing a
        :class:`.LogRegIntentClassifierConfig`"""
        super(LogRegIntentClassifier, self).__init__(config, **shared)
        self.classifier = None
        self.intent_list = None
        self.featurizer = None

    @property
    def fitted(self):
        """Whether or not the intent classifier has already been fitted"""
        return self.intent_list is not None

    @log_elapsed_time(logger, logging.INFO,
                      "LogRegIntentClassifier in {elapsed_time}")
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        from sklearn.linear_model import SGDClassifier
        from sklearn.utils import compute_class_weight

        logger.info("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            self.random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        self.featurizer.language = language

        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(
                dataset, utterances, classes, none_class)
        except _EmptyDatasetUtterancesError:
            logger.warning("No (non-empty) utterances found in dataset")
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)

        class_weights_arr = compute_class_weight(
            "balanced", range(none_class + 1), classes)
        # Re-weight the noise class
        class_weights_arr[-1] *= self.config.noise_reweight_factor
        class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}

        self.classifier = SGDClassifier(
            random_state=self.random_state, alpha=alpha,
            class_weight=class_weight, **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self

    @fitted_required
    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input
            intents_filter (str or list of str): When defined, it will find
                the most likely intent among the list, otherwise it will use
                the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: When the intent
                classifier is not fitted
        """
        return self._get_intents(text, intents_filter)[0]

    @fitted_required
    def get_intents(self, text):
        """Performs intent classification on the provided *text* and returns
        the list of intents ordered by decreasing probability

        The length of the returned list is exactly the number of intents in
        the dataset + 1 for the None intent

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: when the intent
                classifier is not fitted
        """
        return self._get_intents(text, intents_filter=None)

    def _get_intents(self, text, intents_filter):
        if isinstance(intents_filter, str):
            intents_filter = {intents_filter}
        elif isinstance(intents_filter, list):
            intents_filter = set(intents_filter)

        if not text or not self.intent_list or not self.featurizer:
            results = [intent_classification_result(None, 1.0)]
            results += [intent_classification_result(i, 0.0)
                        for i in self.intent_list if i is not None]
            return results

        if len(self.intent_list) == 1:
            return [intent_classification_result(self.intent_list[0], 1.0)]

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X)
        logger.debug(
            "%s",
            DifferedLoggingMessage(self.log_activation_weights, text, X))
        results = [
            intent_classification_result(i, proba)
            for i, proba in zip(self.intent_list, proba_vec[0])
            if intents_filter is None or i is None or i in intents_filter]

        return sorted(results, key=lambda res: -res[RES_PROBA])

    def _predict_proba(self, X):  # pylint: disable=C0103
        import numpy as np

        self.classifier._check_proba()  # pylint: disable=W0212

        prob = self.classifier.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        return prob

    @check_persisted_path
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()

        featurizer = None
        if self.featurizer is not None:
            featurizer = "featurizer"
            featurizer_path = path / featurizer
            self.featurizer.persist(featurizer_path)

        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_

        self_as_dict = {
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer
        }

        classifier_json = json_string(self_as_dict)
        with (path / "intent_classifier.json").open(mode="w") as f:
            f.write(classifier_json)
        self.persist_metadata(path)

    @classmethod
    def from_path(cls, path, **shared):
        """Loads a :class:`LogRegIntentClassifier` instance from a path

        The data at the given path must have been generated using
        :func:`~LogRegIntentClassifier.persist`
        """
        import numpy as np
        from sklearn.linear_model import SGDClassifier

        path = Path(path)
        model_path = path / "intent_classifier.json"
        if not model_path.exists():
            raise LoadingError("Missing intent classifier model file: %s"
                               % model_path.name)

        with model_path.open(encoding="utf8") as f:
            model_dict = json.load(f)

        # Create the classifier
        config = LogRegIntentClassifierConfig.from_dict(model_dict["config"])
        intent_classifier = cls(config=config, **shared)
        intent_classifier.intent_list = model_dict['intent_list']

        # Create the underlying SGD classifier
        sgd_classifier = None
        coeffs = model_dict['coeffs']
        intercept = model_dict['intercept']
        t_ = model_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier

        # Add the featurizer
        featurizer = model_dict['featurizer']
        if featurizer is not None:
            featurizer_path = path / featurizer
            intent_classifier.featurizer = Featurizer.from_path(
                featurizer_path, **shared)

        return intent_classifier

    def log_best_features(self, top_n=50):
        import numpy as np

        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "Top {} features weights by intent:".format(top_n)
        index_to_feature = self.featurizer.feature_index_to_feature_name
        for intent_ix in range(self.classifier.coef_.shape[0]):
            intent_name = self.intent_list[intent_ix]
            log += "\n\n\nFor intent {}\n".format(intent_name)
            top_features_idx = np.argsort(
                np.absolute(self.classifier.coef_[intent_ix]))[::-1][:top_n]
            for feature_ix in top_features_idx:
                feature_name = index_to_feature[feature_ix]
                feature_weight = self.classifier.coef_[intent_ix, feature_ix]
                log += "\n{} -> {}".format(feature_name, feature_weight)
        return log

    def log_activation_weights(self, text, x, top_n=50):
        import numpy as np

        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "\n\nTop {} feature activations for: \"{}\":\n".format(
            top_n, text)
        activations = np.multiply(
            self.classifier.coef_, np.asarray(x.todense()))
        abs_activation = np.absolute(activations).flatten().squeeze()

        if top_n > activations.size:
            top_n = activations.size

        top_n_activations_ix = np.argpartition(
            abs_activation, -top_n, axis=None)[-top_n:]
        top_n_activations_ix = np.unravel_index(
            top_n_activations_ix, activations.shape)

        index_to_feature = self.featurizer.feature_index_to_feature_name
        features_intent_and_activation = [
            (self.intent_list[i], index_to_feature[f], activations[i, f])
            for i, f in zip(*top_n_activations_ix)]

        features_intent_and_activation = sorted(
            features_intent_and_activation, key=lambda x: abs(x[2]),
            reverse=True)

        for intent, feature, activation in features_intent_and_activation:
            log += "\n\n\"{}\" -> ({}, {:.2f})".format(
                intent, feature, float(activation))
        log += "\n\n"
        return log
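# Illustrative sketch (not part of the snips-nlu source): the probability
# mapping performed by _predict_proba() above. Each decision_function score
# is squashed through a sigmoid in place, which is what the
# prob *= -1 / np.exp / prob += 1 / np.reciprocal sequence computes.
# The decision scores below are made up.
import numpy as np

decision_scores = np.array([[1.2, -0.3, 0.1]])  # one row per utterance

prob = decision_scores.copy()
prob *= -1
np.exp(prob, prob)         # exp(-score), computed in place
prob += 1                  # 1 + exp(-score)
np.reciprocal(prob, prob)  # 1 / (1 + exp(-score)) == sigmoid(score)

assert np.allclose(prob, 1 / (1 + np.exp(-decision_scores)))
print(prob)  # per-intent pseudo-probabilities, not normalized to sum to 1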
def test_preprocess_queries(self, mocked_stem, mocked_word_cluster):
    # Given
    language = LANGUAGE_EN

    def _stem(t):
        if t == "beautiful":
            s = "beauty"
        elif t == "birdy":
            s = "bird"
        elif t == "entity":
            s = "ent"
        else:
            s = t
        return s

    def stem_function(text, language):
        return get_default_sep(language).join(
            [_stem(t) for t in tokenize_light(text, language)])

    mocked_word_cluster.return_value = {
        "brown_clusters": {
            "beautiful": "cluster_1",
            "birdy": "cluster_2",
            "entity": "cluster_3"
        }
    }

    mocked_stem.side_effect = stem_function

    dataset = {
        "intents": {
            "intent1": {
                "utterances": []
            }
        },
        "entities": {
            "entity_1": {
                "data": [
                    {
                        "value": "entity 1",
                        "synonyms": ["alternative entity 1"]
                    },
                    {
                        "value": "éntity 1",
                        "synonyms": ["alternative entity 1"]
                    }
                ],
                "use_synonyms": False,
                "automatically_extensible": False
            },
            "entity_2": {
                "data": [
                    {
                        "value": "entity 1",
                        "synonyms": []
                    },
                    {
                        "value": "Éntity 2",
                        "synonyms": ["Éntity_2", "Alternative entity 2"]
                    }
                ],
                "use_synonyms": True,
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "0.0.1"
    }

    dataset = validate_and_format_dataset(dataset)

    queries = [
        "hÉllo wOrld Éntity_2",
        "beauTiful World entity 1",
        "Bird bïrdy",
        "beauTiful éntity 1 bIrd Éntity_2"
    ]
    labels = [0, 0, 1, 1]

    featurizer = Featurizer(language, None).fit(dataset, queries, labels)

    # When
    queries = featurizer.preprocess_queries(queries)

    # Then
    expected_queries = [
        "hello world entity_2 entityfeatureentity_2",
        "beauty world ent 1 entityfeatureentity_1 entityfeatureentity_2 "
        "cluster_1 cluster_3",
        "bird bird",
        "beauty ent 1 bird entity_2 entityfeatureentity_1 "
        "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
    ]
    self.assertListEqual(queries, expected_queries)
def test_featurizer_should_be_serialized_when_not_fitted(self):
    # Given
    featurizer = Featurizer()

    # When/Then
    featurizer.persist(self.tmp_file_path)
class LogRegIntentClassifier(IntentClassifier):
    """Intent classifier which uses a Logistic Regression underneath"""

    unit_name = "log_reg_intent_classifier"
    config_type = LogRegIntentClassifierConfig

    # pylint:disable=line-too-long
    def __init__(self, config=None):
        """The LogReg intent classifier can be configured by passing a
        :class:`.LogRegIntentClassifierConfig`"""
        if config is None:
            config = LogRegIntentClassifierConfig()
        super(LogRegIntentClassifier, self).__init__(config)
        self.classifier = None
        self.intent_list = None
        self.featurizer = None
    # pylint:enable=line-too-long

    @property
    def fitted(self):
        """Whether or not the intent classifier has already been fitted"""
        return self.intent_list is not None

    @log_elapsed_time(logger, logging.DEBUG,
                      "LogRegIntentClassifier in {elapsed_time}")
    def fit(self, dataset):
        """Fit the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            language,
            data_augmentation_config.unknown_words_replacement_string,
            self.config.featurizer_config)
        self.featurizer = self.featurizer.fit(dataset, utterances, classes)
        if self.featurizer is None:
            return self

        X = self.featurizer.transform(utterances)  # pylint: disable=C0103
        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(random_state=random_state,
                                        alpha=alpha, **LOG_REG_ARGS)
        self.classifier.fit(X, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self

    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input
            intents_filter (str or list of str): When defined, it will find
                the most likely intent among the list, otherwise it will use
                the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            NotTrained: When the intent classifier is not fitted
        """
        if not self.fitted:
            raise NotTrained('LogRegIntentClassifier must be fitted')

        if isinstance(intents_filter, str):
            intents_filter = [intents_filter]

        if not text or not self.intent_list \
                or self.featurizer is None or self.classifier is None:
            return None

        if len(self.intent_list) == 1:
            if self.intent_list[0] is None:
                return None
            return intent_classification_result(self.intent_list[0], 1.0)

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X, intents_filter=intents_filter)
        intents_probas = sorted(zip(self.intent_list, proba_vec[0]),
                                key=lambda p: -p[1])
        for intent, proba in intents_probas:
            if intent is None:
                return None
            if intents_filter is None or intent in intents_filter:
                return intent_classification_result(intent, proba)
        return None

    def _predict_proba(self, X, intents_filter):  # pylint: disable=C0103
        self.classifier._check_proba()  # pylint: disable=W0212

        filtered_out_indexes = None
        if intents_filter is not None:
            filtered_out_indexes = [
                i for i, intent in enumerate(self.intent_list)
                if intent not in intents_filter and intent is not None]

        prob = self.classifier.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            if filtered_out_indexes:  # not None and not empty
                prob[:, filtered_out_indexes] = 0.
                # OvR normalization, like LibLinear's predict_probability
                prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            # We do not normalize when there is no intents filter, to keep
            # the probabilities calibrated
            return prob

    def to_dict(self):
        """Returns a json-serializable dict"""
        featurizer_dict = None
        if self.featurizer is not None:
            featurizer_dict = self.featurizer.to_dict()
        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_

        return {
            "unit_name": self.unit_name,
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer_dict,
        }

    @classmethod
    def from_dict(cls, unit_dict):
        """Creates a :class:`LogRegIntentClassifier` instance from a dict

        The dict must have been generated with
        :func:`~LogRegIntentClassifier.to_dict`
        """
        config = LogRegIntentClassifierConfig.from_dict(unit_dict["config"])
        intent_classifier = cls(config=config)
        sgd_classifier = None
        coeffs = unit_dict['coeffs']
        intercept = unit_dict['intercept']
        t_ = unit_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier
        intent_classifier.intent_list = unit_dict['intent_list']
        featurizer = unit_dict['featurizer']
        if featurizer is not None:
            intent_classifier.featurizer = Featurizer.from_dict(featurizer)
        return intent_classifier

    def log_best_features(self, top_n=20):
        log = "Top {} features weights by intent:\n".format(top_n)
        voca = {
            v: k for k, v in
            iteritems(self.featurizer.tfidf_vectorizer.vocabulary_)
        }
        features = [voca[i] for i in self.featurizer.best_features]
        for intent_ix in range(self.classifier.coef_.shape[0]):
            intent_name = self.intent_list[intent_ix]
            log += "\n\n\nFor intent {}\n".format(intent_name)
            top_features_idx = np.argsort(
                np.absolute(self.classifier.coef_[intent_ix]))[::-1][:top_n]
            for feature_ix in top_features_idx:
                feature_name = features[feature_ix]
                feature_weight = self.classifier.coef_[intent_ix, feature_ix]
                log += "\n{} -> {}".format(feature_name, feature_weight)
        return log
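# Illustrative sketch (not part of the snips-nlu source): the intents_filter
# handling in the older _predict_proba() above. Filtered-out intents get a
# zero probability and the remaining columns are renormalized row-wise
# (OvR normalization); without a filter, probabilities are left untouched
# to keep them calibrated. The probability matrix below is made up.
import numpy as np

prob = np.array([[0.7, 0.2, 0.6],
                 [0.1, 0.8, 0.3]])
filtered_out_indexes = [1]  # e.g. intents not present in the filter

if filtered_out_indexes:  # not None and not empty
    prob[:, filtered_out_indexes] = 0.
    prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))

print(prob)  # each row now sums to 1 over the remaining intents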
def test_featurizer_should_be_serialized_when_not_fitted(self):
    # Given
    language = LANGUAGE_EN
    featurizer = Featurizer(language, None)

    # When/Then
    featurizer.to_dict()
def test_fit_transform_should_be_consistent_with_transform(self):
    # Here we mainly test that the output of fit_transform is
    # the same as the result of fit and then transform.
    # We're trying to avoid that for some reason indexes of features
    # get mixed up after feature selection

    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synononyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synononyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    config = FeaturizerConfig(added_cooccurrence_feature_ratio=.5)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        {
            "data": [
                {"text": "hÉllo wOrld "},
                {"text": "Éntity_2", "entity": "entity_2"}
            ]
        },
        {
            "data": [
                {"text": "beauTiful World "},
                {"text": "entity 1", "entity": "entity_1"}
            ]
        },
        {
            "data": [
                {"text": "Bird bïrdy"}
            ]
        },
        {
            "data": [
                {"text": "Bird bïrdy"}
            ]
        }
    ]
    classes = [0, 0, 1, 1]

    # When
    x_0 = featurizer.fit_transform(dataset, utterances, classes,
                                   max(classes))
    x_1 = featurizer.transform(utterances)

    # Then
    self.assertListEqual(x_0.todense().tolist(), x_1.todense().tolist())
def test_should_be_serializable(self):
    # Given
    language = LANGUAGE_EN
    tfidf_vectorizer = _get_tfidf_vectorizer(language)

    pvalue_threshold = 0.42
    featurizer = Featurizer(
        language,
        config=FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                word_clusters_name="brown_clusters"),
        unknown_words_replacement_string=None,
        tfidf_vectorizer=tfidf_vectorizer)
    dataset = {
        "entities": {
            "entity2": {
                "data": [
                    {
                        "value": "entity1",
                        "synonyms": ["entity1"]
                    }
                ],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "intents": {},
        "language": "en"
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        "hello world",
        "beautiful world",
        "hello here",
        "bird birdy",
        "beautiful bird"
    ]
    utterances = [text_to_utterance(u) for u in utterances]

    classes = np.array([0, 0, 0, 1, 1])

    featurizer.fit(dataset, utterances, classes)

    # When
    serialized_featurizer = featurizer.to_dict()

    # Then
    msg = "Featurizer dict should be json serializable to utf8."
    with self.fail_if_exception(msg):
        dumped = json_string(serialized_featurizer)

    msg = "SnipsNLUEngine should be deserializable from dict with unicode" \
          " values"
    with self.fail_if_exception(msg):
        _ = Featurizer.from_dict(json.loads(dumped))

    vocabulary = tfidf_vectorizer.vocabulary_
    # pylint: disable=W0212
    idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
    # pylint: enable=W0212

    best_features = featurizer.best_features
    entity_utterances_to_feature_names = {
        "entity1": ["entityfeatureentity2"]
    }

    expected_serialized = {
        "config": {
            'sublinear_tf': False,
            'pvalue_threshold': pvalue_threshold,
            'word_clusters_name': "brown_clusters"
        },
        "language_code": "en",
        "tfidf_vectorizer": {
            "idf_diag": idf_diag,
            "vocab": vocabulary
        },
        "best_features": best_features,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }
    self.assertDictEqual(expected_serialized, serialized_featurizer)
def test_preprocess_utterances(self, mocked_stem, mocked_word_cluster):
    # Given
    language = LANGUAGE_EN

    def _stem(t):
        if t == "beautiful":
            s = "beauty"
        elif t == "birdy":
            s = "bird"
        elif t == "entity":
            s = "ent"
        else:
            s = t
        return s

    def stem_function(text, language):
        return get_default_sep(language).join(
            [_stem(t) for t in tokenize_light(text, language)])

    mocked_word_cluster.return_value = {
        "beautiful": "cluster_1",
        "birdy": "cluster_2",
        "entity": "cluster_3"
    }

    mocked_stem.side_effect = stem_function

    dataset = {
        "intents": {
            "intent1": {
                "utterances": []
            }
        },
        "entities": {
            "entity_1": {
                "data": [
                    {
                        "value": "entity 1",
                        "synonyms": ["alternative entity 1"]
                    },
                    {
                        "value": "éntity 1",
                        "synonyms": ["alternative entity 1"]
                    }
                ],
                "use_synonyms": False,
                "automatically_extensible": False
            },
            "entity_2": {
                "data": [
                    {
                        "value": "entity 1",
                        "synonyms": []
                    },
                    {
                        "value": "Éntity 2",
                        "synonyms": ["Éntity_2", "Alternative entity 2"]
                    }
                ],
                "use_synonyms": True,
                "automatically_extensible": False
            },
            "snips/number": {}
        },
        "language": "en",
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
    ]
    labeled_utterance = {
        DATA: [
            {
                TEXT: "beauTiful éntity "
            },
            {
                TEXT: "1",
                ENTITY: "snips/number",
                SLOT_NAME: "number"
            },
            {
                TEXT: " bIrd Éntity_2"
            }
        ]
    }
    utterances.append(labeled_utterance)
    labels = np.array([0, 0, 1, 1])

    featurizer = Featurizer(
        language,
        None,
        config=FeaturizerConfig(word_clusters_name="brown_clusters")
    ).fit(dataset, utterances, labels)

    # When
    utterances = featurizer.preprocess_utterances(utterances)

    # Then
    expected_utterances = [
        "hello world entity_2 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_2",
        "beauty world ent 1 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_1 entityfeatureentity_2 "
        "cluster_1 cluster_3",
        "bird bird",
        "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
        "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
        "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
    ]
    self.assertListEqual(utterances, expected_utterances)
def test_training_should_be_reproducible(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    utterances = [
        text_to_utterance("please make me two hots cups of tea"),
        text_to_utterance("i want a cup of coffee"),
    ]
    classes = np.array([0, 1])
    shared = self.get_shared_data(dataset)
    shared["random_state"] = 42

    # When
    featurizer1 = Featurizer(**shared)
    featurizer1.fit(dataset, utterances, classes, max(classes))

    featurizer2 = Featurizer(**shared)
    featurizer2.fit(dataset, utterances, classes, max(classes))

    # Then
    with temp_dir() as tmp_dir:
        dir_featurizer1 = tmp_dir / "featurizer1"
        dir_featurizer2 = tmp_dir / "featurizer2"
        featurizer1.persist(dir_featurizer1)
        featurizer2.persist(dir_featurizer2)
        hash1 = dirhash(str(dir_featurizer1), 'sha256')
        hash2 = dirhash(str(dir_featurizer2), 'sha256')
        self.assertEqual(hash1, hash2)
class LogRegIntentClassifier(IntentClassifier):
    """Intent classifier which uses a Logistic Regression underneath"""

    unit_name = "log_reg_intent_classifier"
    config_type = LogRegIntentClassifierConfig

    # pylint:disable=line-too-long
    def __init__(self, config=None):
        """The LogReg intent classifier can be configured by passing a
        :class:`.LogRegIntentClassifierConfig`"""
        if config is None:
            config = LogRegIntentClassifierConfig()
        super(LogRegIntentClassifier, self).__init__(config)
        self.classifier = None
        self.intent_list = None
        self.featurizer = None
    # pylint:enable=line-too-long

    @property
    def fitted(self):
        """Whether or not the intent classifier has already been fitted"""
        return self.intent_list is not None

    def fit(self, dataset):
        """Fit the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            language,
            data_augmentation_config.unknown_words_replacement_string,
            self.config.featurizer_config)
        self.featurizer = self.featurizer.fit(dataset, utterances, classes)
        if self.featurizer is None:
            return self

        X = self.featurizer.transform(utterances)  # pylint: disable=C0103
        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(random_state=random_state,
                                        alpha=alpha, **LOG_REG_ARGS)
        self.classifier.fit(X, classes)
        return self

    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input
            intents_filter (str or list of str): When defined, it will find
                the most likely intent among the list, otherwise it will use
                the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            NotTrained: When the intent classifier is not fitted
        """
        if not self.fitted:
            raise NotTrained('LogRegIntentClassifier must be fitted')

        if isinstance(intents_filter, str):
            intents_filter = [intents_filter]

        if not text or not self.intent_list \
                or self.featurizer is None or self.classifier is None:
            return None

        if len(self.intent_list) == 1:
            if self.intent_list[0] is None:
                return None
            return intent_classification_result(self.intent_list[0], 1.0)

        X = self.featurizer.transform([text])  # pylint: disable=C0103
        proba_vec = self.classifier.predict_proba(X)[0]
        intents_probas = sorted(zip(self.intent_list, proba_vec),
                                key=lambda p: -p[1])
        for intent, proba in intents_probas:
            if intent is None:
                return None
            if intents_filter is None or intent in intents_filter:
                return intent_classification_result(intent, proba)
        return None

    def to_dict(self):
        """Returns a json-serializable dict"""
        featurizer_dict = None
        if self.featurizer is not None:
            featurizer_dict = self.featurizer.to_dict()
        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_

        return {
            "unit_name": self.unit_name,
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer_dict,
        }

    @classmethod
    def from_dict(cls, unit_dict):
        """Creates a :class:`LogRegIntentClassifier` instance from a dict

        The dict must have been generated with
        :func:`~LogRegIntentClassifier.to_dict`
        """
        config = LogRegIntentClassifierConfig.from_dict(unit_dict["config"])
        intent_classifier = cls(config=config)
        sgd_classifier = None
        coeffs = unit_dict['coeffs']
        intercept = unit_dict['intercept']
        t_ = unit_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier
        intent_classifier.intent_list = unit_dict['intent_list']
        featurizer = unit_dict['featurizer']
        if featurizer is not None:
            intent_classifier.featurizer = Featurizer.from_dict(featurizer)
        return intent_classifier