def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                  mocked_tfidf_load):
    # Given
    mocked_tfidf_load.return_value = "tfidf_vectorizer"
    mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

    language = LANGUAGE_EN
    config = FeaturizerConfig()

    featurizer_dict = {
        "language_code": language,
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }

    self.tmp_file_path.mkdir()
    featurizer_path = self.tmp_file_path / "featurizer.json"
    with featurizer_path.open("w", encoding="utf-8") as f:
        f.write(json_string(featurizer_dict))

    # When
    featurizer = Featurizer.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(language, featurizer.language)
    self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
    self.assertEqual("cooccurrence_vectorizer",
                     featurizer.cooccurrence_vectorizer)
    self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
def test_should_be_serializable_before_fit(self):
    # Given
    pvalue_threshold = 0.42
    config = FeaturizerConfig(
        pvalue_threshold=pvalue_threshold,
        added_cooccurrence_feature_ratio=0.2)
    featurizer = Featurizer(config=config)

    # When
    featurizer.persist(self.tmp_file_path)

    # Then
    expected_featurizer_dict = {
        "language_code": None,
        "tfidf_vectorizer": None,
        "cooccurrence_vectorizer": None,
        "config": config.to_dict()
    }
    featurizer_dict_path = self.tmp_file_path / "featurizer.json"
    self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

    expected_metadata = {"unit_name": "featurizer"}
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, expected_metadata)

    tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
    self.assertFalse(tfidf_vectorizer_path.exists())

    cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
    self.assertFalse(cooc_vectorizer_path.exists())
def test_featurizer_should_exclude_replacement_string(self):
    # Given
    language = LANGUAGE_EN
    dataset = {
        "entities": {
            "dummy1": {
                "utterances": {
                    "unknownword": "unknownword",
                    "what": "what"
                }
            }
        }
    }
    replacement_string = "unknownword"
    featurizer = Featurizer(
        language,
        unknown_words_replacement_string=replacement_string,
        config=FeaturizerConfig())
    utterances = [text_to_utterance("hello dude")]
    y = np.array([1])

    # When
    featurizer.fit(dataset, utterances, y)

    # Then
    self.assertNotIn(replacement_string,
                     featurizer.entity_utterances_to_feature_names)
def from_dict(cls, obj_dict):
    """Creates a :class:`Featurizer` instance from a :obj:`dict`

    The dict must have been generated with :func:`~Featurizer.to_dict`
    """
    language = obj_dict["language_code"]
    config = FeaturizerConfig.from_dict(obj_dict["config"])
    tfidf_vectorizer = _deserialize_tfidf_vectorizer(
        obj_dict["tfidf_vectorizer"], language, config)
    entity_utterances_to_entity_names = {
        k: set(v) for k, v in
        iteritems(obj_dict["entity_utterances_to_feature_names"])
    }
    self = cls(
        language=language,
        tfidf_vectorizer=tfidf_vectorizer,
        pvalue_threshold=obj_dict["pvalue_threshold"],
        entity_utterances_to_feature_names=entity_utterances_to_entity_names,
        best_features=obj_dict["best_features"],
        config=config,
        unknown_words_replacement_string=obj_dict[
            "unknown_words_replacement_string"]
    )
    return self
def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
    # Given
    vectorizer_config = CooccurrenceVectorizerConfig(
        filter_stop_words=False)
    config = FeaturizerConfig(
        added_cooccurrence_feature_ratio=.3,
        cooccurrence_vectorizer_config=vectorizer_config)
    featurizer = Featurizer(config)
    mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
    utterances = [
        text_to_utterance("a b c d e"),
        text_to_utterance("f g h i j"),
        text_to_utterance("none"),
    ]

    mocked_vectorizer = MagicMock()
    mocked_vectorizer.idf_diag = range(10)

    featurizer.tfidf_vectorizer = mocked_vectorizer
    classes = [0, 0, 1]

    # When
    mocked_chi2.return_value = (
        None, [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] + [1.0 for _ in range(100)])
    featurizer._fit_cooccurrence_vectorizer(
        utterances, classes, 1, mocked_dataset)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
    self.assertDictEqual(expected_pairs,
                         featurizer.cooccurrence_vectorizer.word_pairs)
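# A minimal sketch (an assumption, not the library implementation) of the
# selection logic the test above exercises: with 10 tfidf features and an
# added_cooccurrence_feature_ratio of 0.3, the 3 word pairs with the lowest
# chi2 p-values are kept and re-indexed from 0.
def select_best_pairs(word_pairs, pvalues, keep_count):
    # word_pairs maps (w1, w2) -> feature column; pvalues holds one
    # p-value per column
    best = sorted(word_pairs, key=lambda pair: pvalues[word_pairs[pair]])
    return {pair: i for i, pair in enumerate(sorted(best[:keep_count]))}

pvalues = [0.1, 1.0, 0.2, 1.0, 0.3, 1.0]
word_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2,
              ("a", "e"): 3, ("b", "c"): 4, ("b", "d"): 5}
# int(0.3 * 10) == 3 pairs survive, matching expected_pairs above
assert select_best_pairs(word_pairs, pvalues, 3) == \
    {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}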
def test_should_be_serializable(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    pvalue_threshold = 0.42
    config = FeaturizerConfig(
        pvalue_threshold=pvalue_threshold,
        added_cooccurrence_feature_ratio=0.2)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        text_to_utterance("this is the number"),
        text_to_utterance("yo")
    ]
    classes = np.array([0, 1])
    featurizer.fit(dataset, utterances, classes, max(classes))

    # When
    featurizer.persist(self.tmp_file_path)

    # Then
    expected_featurizer_dict = {
        "language_code": "en",
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }
    featurizer_dict_path = self.tmp_file_path / "featurizer.json"
    self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

    expected_metadata = {"unit_name": "featurizer"}
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, expected_metadata)

    tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
    self.assertTrue(tfidf_vectorizer_path.exists())

    cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
    self.assertTrue(cooc_vectorizer_path.exists())
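# Companion note, grounded in the two persistence tests above: `persist`
# writes a directory containing featurizer.json and metadata.json, plus a
# "tfidf_vectorizer" and a "cooccurrence_vectorizer" sub-directory once the
# featurizer has been fitted. Such a directory can then be reloaded with
# `Featurizer.from_path`, e.g.:
#
#     featurizer = Featurizer.from_path(self.tmp_file_path)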
def test_featurizer_config(self): # Given config_dict = { "sublinear_tf": True, } # When config = FeaturizerConfig.from_dict(config_dict) serialized_config = config.to_dict() # Then self.assertDictEqual(config_dict, serialized_config)
def test_featurizer_config(self): # Given config_dict = { "sublinear_tf": True, "pvalue_threshold": 0.4, "word_clusters_name": None } # When config = FeaturizerConfig.from_dict(config_dict) serialized_config = config.to_dict() # Then self.assertDictEqual(config_dict, serialized_config)
def test_intent_classifier_config(self):
    # Given
    config_dict = {
        "unit_name": LogRegIntentClassifier.unit_name,
        "data_augmentation_config":
            IntentClassifierDataAugmentationConfig().to_dict(),
        "featurizer_config": FeaturizerConfig().to_dict(),
        "random_seed": 42
    }

    # When
    config = LogRegIntentClassifierConfig.from_dict(config_dict)
    serialized_config = config.to_dict()

    # Then
    self.assertDictEqual(config_dict, serialized_config)
def __init__(self, language, unknown_words_replacement_string,
             config=FeaturizerConfig(), tfidf_vectorizer=None,
             best_features=None, builtin_entity_parser=None,
             custom_entity_parser=None):
    self.config = config
    self.language = language
    if tfidf_vectorizer is None:
        tfidf_vectorizer = _get_tfidf_vectorizer(
            self.language, sublinear_tf=self.config.sublinear_tf)
    self.tfidf_vectorizer = tfidf_vectorizer
    self.best_features = best_features
    self.unknown_words_replacement_string = \
        unknown_words_replacement_string
    self.builtin_entity_parser = builtin_entity_parser
    self.custom_entity_parser = custom_entity_parser
def test_should_be_deserializable(self):
    # Given
    language = LANGUAGE_EN
    idf_diag = [1.52, 1.21, 1.04]
    vocabulary = {"hello": 0, "beautiful": 1, "world": 2}

    best_features = [0, 1]
    pvalue_threshold = 0.4
    entity_utterances_to_feature_names = {
        "entity_1": ["entityfeatureentity_1"]
    }

    featurizer_dict = {
        "config": FeaturizerConfig().to_dict(),
        "language_code": language,
        "tfidf_vectorizer": {"idf_diag": idf_diag, "vocab": vocabulary},
        "best_features": best_features,
        "pvalue_threshold": pvalue_threshold,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }

    # When
    featurizer = Featurizer.from_dict(featurizer_dict)

    # Then
    self.assertEqual(featurizer.language, language)
    # pylint: disable=W0212
    self.assertListEqual(
        featurizer.tfidf_vectorizer._tfidf._idf_diag.data.tolist(),
        idf_diag)
    # pylint: enable=W0212
    self.assertDictEqual(featurizer.tfidf_vectorizer.vocabulary_,
                         vocabulary)
    self.assertListEqual(featurizer.best_features, best_features)
    self.assertEqual(featurizer.pvalue_threshold, pvalue_threshold)
    self.assertDictEqual(
        featurizer.entity_utterances_to_feature_names,
        {
            k: set(v) for k, v in
            iteritems(entity_utterances_to_feature_names)
        })
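# A hedged sketch of what a deserializer along the lines of
# _deserialize_tfidf_vectorizer could do with the "idf_diag"/"vocab"
# payload above. The private attributes (_tfidf, _idf_diag) are
# scikit-learn internals that the test above also inspects; they may
# change across sklearn versions, so this is illustrative only.
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

def rebuild_tfidf_vectorizer(vocab, idf_diag):
    vectorizer = TfidfVectorizer()
    transformer = TfidfTransformer()
    n = len(idf_diag)
    # store the serialized idf values as the sparse diagonal matrix
    # sklearn keeps internally
    transformer._idf_diag = sp.spdiags(
        idf_diag, diags=0, m=n, n=n, format="csr")
    vectorizer._tfidf = transformer
    vectorizer.vocabulary_ = vocab
    return vectorizer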
def test_featurizer_config(self):
    # Given
    tfidf_vectorizer_config = TfidfVectorizerConfig()
    cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
    config_dict = {
        "unit_name": "featurizer",
        "pvalue_threshold": 0.2,
        "added_cooccurrence_feature_ratio": 0.2,
        "tfidf_vectorizer_config": tfidf_vectorizer_config.to_dict(),
        "cooccurrence_vectorizer_config":
            cooccurrence_vectorizer_config.to_dict()
    }

    # When
    config = FeaturizerConfig.from_dict(config_dict)
    serialized_config = config.to_dict()

    # Then
    self.assertDictEqual(config_dict, serialized_config)
def __init__(self, language, unknown_words_replacement_string,
             config=FeaturizerConfig(), tfidf_vectorizer=None,
             best_features=None, entity_utterances_to_feature_names=None):
    self.config = config
    self.language = language
    if tfidf_vectorizer is None:
        tfidf_vectorizer = _get_tfidf_vectorizer(
            self.language, sublinear_tf=self.config.sublinear_tf)
    self.tfidf_vectorizer = tfidf_vectorizer
    self.best_features = best_features
    self.entity_utterances_to_feature_names = \
        entity_utterances_to_feature_names
    self.unknown_words_replacement_string = \
        unknown_words_replacement_string
def test_feature_index_to_feature_name(self):
    # Given
    config = FeaturizerConfig(added_cooccurrence_feature_ratio=.75)
    featurizer = Featurizer(config=config)

    # When
    mocked_cooccurrence_vectorizer = MagicMock()
    mocked_cooccurrence_vectorizer.word_pairs = {("a", "b"): 0}

    mocked_tfidf_vectorizer = MagicMock()
    mocked_tfidf_vectorizer.vocabulary = {"a": 0}

    featurizer.cooccurrence_vectorizer = mocked_cooccurrence_vectorizer
    featurizer.tfidf_vectorizer = mocked_tfidf_vectorizer

    # Then
    expected = {0: "ngram:a", 1: "pair:a+b"}
    self.assertDictEqual(expected,
                         featurizer.feature_index_to_feature_name)
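# A minimal sketch (assumed layout, not the library code) of the mapping
# asserted above: tfidf ngram features occupy the first indexes, and
# cooccurrence pair features are appended after them.
def index_to_feature_name(vocabulary, word_pairs):
    names = {i: "ngram:%s" % ngram for ngram, i in vocabulary.items()}
    offset = len(names)
    for (w1, w2), i in word_pairs.items():
        names[offset + i] = "pair:%s+%s" % (w1, w2)
    return names

assert index_to_feature_name({"a": 0}, {("a", "b"): 0}) \
    == {0: "ngram:a", 1: "pair:a+b"}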
def from_dict(cls, obj_dict, **shared):
    """Creates a :class:`Featurizer` instance from a :obj:`dict`

    The dict must have been generated with :func:`~Featurizer.to_dict`
    """
    language = obj_dict["language_code"]
    config = FeaturizerConfig.from_dict(obj_dict["config"])
    tfidf_vectorizer = _deserialize_tfidf_vectorizer(
        obj_dict["tfidf_vectorizer"], language, config.sublinear_tf)
    self = cls(
        language=language,
        tfidf_vectorizer=tfidf_vectorizer,
        best_features=obj_dict["best_features"],
        config=config,
        unknown_words_replacement_string=obj_dict[
            "unknown_words_replacement_string"],
        builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER),
        custom_entity_parser=shared.get(CUSTOM_ENTITY_PARSER)
    )
    return self
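# Hedged usage sketch for the method above, assuming `featurizer` is a
# fitted instance and that BUILTIN_ENTITY_PARSER / CUSTOM_ENTITY_PARSER
# are the keys expected in the `shared` dict:
#
#     shared = {
#         BUILTIN_ENTITY_PARSER: featurizer.builtin_entity_parser,
#         CUSTOM_ENTITY_PARSER: featurizer.custom_entity_parser,
#     }
#     restored = Featurizer.from_dict(featurizer.to_dict(), **shared)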
def __init__(self, language, unknown_words_replacement_string,
             config=FeaturizerConfig(), tfidf_vectorizer=None,
             best_features=None, entity_utterances_to_feature_names=None,
             pvalue_threshold=0.4):
    self.config = config
    self.language = language
    if tfidf_vectorizer is None:
        tfidf_vectorizer = _get_tfidf_vectorizer(
            self.language, self.config.to_dict())
    self.tfidf_vectorizer = tfidf_vectorizer
    self.best_features = best_features
    self.pvalue_threshold = pvalue_threshold
    self.entity_utterances_to_feature_names = \
        entity_utterances_to_feature_names
    self.unknown_words_replacement_string = \
        unknown_words_replacement_string
def from_dict(cls, obj_dict):
    """Creates a :class:`Featurizer` instance from a :obj:`dict`

    The dict must have been generated with :func:`~Featurizer.to_dict`
    """
    language = obj_dict["language_code"]
    config = FeaturizerConfig.from_dict(obj_dict["config"])
    tfidf_vectorizer = _deserialize_tfidf_vectorizer(
        obj_dict["tfidf_vectorizer"], language, config.sublinear_tf)
    entity_utterances_to_entity_names = {
        k: set(v) for k, v in iteritems(
            obj_dict["entity_utterances_to_feature_names"])
    }
    self = cls(
        language=language,
        tfidf_vectorizer=tfidf_vectorizer,
        entity_utterances_to_feature_names=entity_utterances_to_entity_names,
        best_features=obj_dict["best_features"],
        config=config,
        unknown_words_replacement_string=obj_dict[
            "unknown_words_replacement_string"])
    return self
def test_should_be_serializable(self):
    # Given
    language = LANGUAGE_EN
    tfidf_vectorizer = _get_tfidf_vectorizer(language)

    pvalue_threshold = 0.42
    featurizer = Featurizer(
        language,
        config=FeaturizerConfig(
            pvalue_threshold=pvalue_threshold,
            word_clusters_name="brown_clusters"),
        unknown_words_replacement_string=None,
        tfidf_vectorizer=tfidf_vectorizer)
    dataset = {
        "entities": {
            "entity2": {
                "data": [{
                    "value": "entity1",
                    "synonyms": ["entity1"]
                }],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "intents": {},
        "language": "en"
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        "hello world",
        "beautiful world",
        "hello here",
        "bird birdy",
        "beautiful bird"
    ]
    utterances = [text_to_utterance(u) for u in utterances]

    classes = np.array([0, 0, 0, 1, 1])
    featurizer.fit(dataset, utterances, classes)

    # When
    serialized_featurizer = featurizer.to_dict()

    # Then
    msg = "Featurizer dict should be json serializable to utf8."
    with self.fail_if_exception(msg):
        dumped = json_string(serialized_featurizer)

    msg = "Featurizer should be deserializable from dict with unicode" \
          " values"
    with self.fail_if_exception(msg):
        _ = Featurizer.from_dict(json.loads(dumped))

    vocabulary = tfidf_vectorizer.vocabulary_
    # pylint: disable=W0212
    idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
    # pylint: enable=W0212

    best_features = featurizer.best_features
    entity_utterances_to_feature_names = {
        "entity1": ["entityfeatureentity2"]
    }

    expected_serialized = {
        "config": {
            "sublinear_tf": False,
            "pvalue_threshold": pvalue_threshold,
            "word_clusters_name": "brown_clusters"
        },
        "language_code": "en",
        "tfidf_vectorizer": {"idf_diag": idf_diag, "vocab": vocabulary},
        "best_features": best_features,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }
    self.assertDictEqual(expected_serialized, serialized_featurizer)
def test_preprocess_utterances(self, mocked_stem, mocked_word_cluster):
    # Given
    language = LANGUAGE_EN

    def _stem(t):
        if t == "beautiful":
            s = "beauty"
        elif t == "birdy":
            s = "bird"
        elif t == "entity":
            s = "ent"
        else:
            s = t
        return s

    def stem_function(text, language):
        return get_default_sep(language).join(
            [_stem(t) for t in tokenize_light(text, language)])

    mocked_word_cluster.return_value = {
        "beautiful": "cluster_1",
        "birdy": "cluster_2",
        "entity": "cluster_3"
    }

    mocked_stem.side_effect = stem_function

    dataset = {
        "intents": {
            "intent1": {
                "utterances": []
            }
        },
        "entities": {
            "entity_1": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": ["alternative entity 1"]
                }, {
                    "value": "éntity 1",
                    "synonyms": ["alternative entity 1"]
                }],
                "use_synonyms": False,
                "automatically_extensible": False
            },
            "entity_2": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": []
                }, {
                    "value": "Éntity 2",
                    "synonyms": ["Éntity_2", "Alternative entity 2"]
                }],
                "use_synonyms": True,
                "automatically_extensible": False
            },
            "snips/number": {}
        },
        "language": "en",
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
    ]

    labeled_utterance = {
        DATA: [{
            TEXT: "beauTiful éntity "
        }, {
            TEXT: "1",
            ENTITY: "snips/number",
            SLOT_NAME: "number"
        }, {
            TEXT: " bIrd Éntity_2"
        }]
    }
    utterances.append(labeled_utterance)
    labels = np.array([0, 0, 1, 1])

    featurizer = Featurizer(
        language,
        None,
        config=FeaturizerConfig(word_clusters_name="brown_clusters")
    ).fit(dataset, utterances, labels)

    # When
    utterances = featurizer.preprocess_utterances(utterances)

    # Then
    expected_utterances = [
        "hello world entity_2 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_2",
        "beauty world ent 1 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_1 entityfeatureentity_2 "
        "cluster_1 cluster_3",
        "bird bird",
        "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
        "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
        "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
    ]
    self.assertListEqual(utterances, expected_utterances)
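# Reading the expected strings above: preprocessing lowercases and stems
# each utterance, then appends one synthetic token per detected feature,
# following (as far as this test shows) these conventions:
#   - "builtinentityfeature<entity>" for builtin entities, with
#     non-alphanumeric characters dropped (snips/number ->
#     builtinentityfeaturesnipsnumber)
#   - "entityfeature<entity>" for custom entities (entity_2 ->
#     entityfeatureentity_2)
#   - the word-cluster id of any word found in the configured clusters
#     ("beautiful" -> cluster_1)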
def test_fit_transform_should_be_consistent_with_transform(self):
    # Here we mainly test that the output of fit_transform is the same
    # as the result of fit followed by transform, to ensure that feature
    # indexes do not get mixed up after feature selection.
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    config = FeaturizerConfig(added_cooccurrence_feature_ratio=.5)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        {
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        },
        {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }]
        },
        {
            "data": [{
                "text": "Bird bïrdy"
            }]
        },
        {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }
    ]

    classes = [0, 0, 1, 1]

    # When
    x_0 = featurizer.fit_transform(
        dataset, utterances, classes, max(classes))
    x_1 = featurizer.transform(utterances)

    # Then
    self.assertListEqual(x_0.todense().tolist(), x_1.todense().tolist())