def test_fit_series_no_anonymize(self):
    """Check ``fit`` on a ``pandas.Series`` when ``anonymize`` is unset.

    ``CategoricalTransformer.fit`` is invoked unbound against a ``Mock``
    whose ``anonymize`` attribute is ``None``. The test asserts that
    ``_anonymize`` is never called and that ``_get_intervals`` is called
    exactly once with a series equal to the input.
    """
    # Setup
    series = pd.Series(['bar', 'foo', 'foo', 'tar'])
    instance = Mock()
    instance.anonymize = None

    # Run
    CategoricalTransformer.fit(instance, series)

    # Asserts
    self.assertEqual(
        instance._anonymize.call_count,
        0,
        "Anonymize must be called only when anonymize is something"
    )
    self.assertEqual(
        instance._get_intervals.call_count,
        1,
        "Get intervals will be called always in fit"
    )

    # call_args[0] holds the positional arguments of the last call.
    positional_args = instance._get_intervals.call_args[0]
    pd.testing.assert_series_equal(
        positional_args[0],
        pd.Series(['bar', 'foo', 'foo', 'tar'])
    )
def test_fit_series_anonymize(self):
    """Check ``fit`` on a ``pandas.Series`` when ``anonymize`` is set.

    ``CategoricalTransformer.fit`` is invoked unbound against a ``Mock``
    whose ``anonymize`` attribute is ``'email'`` and whose ``_anonymize``
    returns a prepared series. The test asserts that ``_anonymize`` and
    ``_get_intervals`` are each called exactly once, and that
    ``_get_intervals`` receives a series with the expected values.
    """
    # Setup
    series = pd.Series(['bar', 'foo', 'foo', 'tar'])
    # NOTE(review): the anonymized series has the same values as the input,
    # so the final check cannot tell which of the two was forwarded to
    # ``_get_intervals``.
    anonymized = pd.Series(['bar', 'foo', 'foo', 'tar'])

    instance = Mock()
    instance.anonymize = 'email'
    instance._anonymize.return_value = anonymized

    # Run
    CategoricalTransformer.fit(instance, series)

    # Asserts
    self.assertEqual(
        instance._anonymize.call_count,
        1,
        "Anonymize must be called only once"
    )
    self.assertEqual(
        instance._get_intervals.call_count,
        1,
        "Get intervals will be called always in fit"
    )

    # call_args[0] holds the positional arguments of the last call.
    positional_args = instance._get_intervals.call_args[0]
    pd.testing.assert_series_equal(
        positional_args[0],
        pd.Series(['bar', 'foo', 'foo', 'tar'])
    )
def test_categoricaltransformer_mixed_more_rows(psutil_mock):
    """Round-trip mixed-type data when little virtual memory is reported.

    The transformer is fitted on a short mixed-type series and then asked
    to transform and reverse transform a longer series built from the same
    values. The reverse-transformed result must equal the series that was
    transformed.

    ``psutil_mock`` is configured to return an object whose ``available``
    attribute is ``1``.
    NOTE(review): presumably this argument is a patched
    ``psutil.virtual_memory``; the patch decorator is not visible here.

    Input:
    - 4 rows of mixed data for fitting
    - 6 rows of mixed data for transform / reverse transform

    Output:
    - The reverse transformed data
    """
    # setup
    memory_info = Mock()
    memory_info.available = 1
    psutil_mock.return_value = memory_info

    fit_series = pd.Series([True, 'a', 1, None])
    longer_series = pd.Series(['a', 1, None, 'a', True, 1])
    transformer = CategoricalTransformer()

    # run
    transformer.fit(fit_series)
    encoded = transformer.transform(longer_series)
    decoded = transformer.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(longer_series, decoded)
def test_categorical_numerical_nans():
    """Round-trip a numeric series that also contains NaN values.

    Fits a ``CategoricalTransformer`` on the series, transforms it, reverse
    transforms the result and expects to get the original series back.
    """
    # setup
    series = pd.Series([1, 2, float('nan'), np.nan])
    transformer = CategoricalTransformer()

    # run
    transformer.fit(series)
    encoded = transformer.transform(series)
    decoded = transformer.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(decoded, series)
def test_fit(self):
    """Fit on a numpy array of strings and check the stored ``intervals``.

    After fitting, ``transformer.intervals`` must equal a dict keyed by
    category, each value being the expected four-number tuple.
    """
    # Setup
    categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
    transformer = CategoricalTransformer()

    # Run
    transformer.fit(categories)

    # Asserts
    foo_interval = (0, 0.5, 0.25, 0.5 / 6)
    bar_interval = (0.5, 0.8333333333333333, 0.6666666666666666, 0.05555555555555555)
    tar_interval = (
        0.8333333333333333,
        0.9999999999999999,
        0.9166666666666666,
        0.027777777777777776,
    )
    assert transformer.intervals == {
        'foo': foo_interval,
        'bar': bar_interval,
        'tar': tar_interval,
    }
def test_categoricaltransformer_pickle_nans():
    """Check a fitted transformer survives pickling when data has NaNs.

    A ``CategoricalTransformer`` is fitted on a numeric series containing
    NaN values and used to transform it. The transformer is then pickled
    into an in-memory buffer and loaded back; the loaded copy must produce
    the same transformed output for the same series.
    """
    # setup
    series = pd.Series([1, 2, float('nan'), np.nan])
    original = CategoricalTransformer()
    original.fit(series)
    expected = original.transform(series)

    # serialize into an in-memory buffer, then rewind so it can be read back
    buffer = BytesIO()
    pickle.dump(original, buffer)
    buffer.seek(0)

    # run
    restored = pickle.load(buffer)

    # assert
    actual = restored.transform(series)
    np.testing.assert_array_equal(actual, expected)
def test_reverse_transform_array(self):
    """Check ``reverse_transform`` when given a ``numpy.array``.

    The transformer is fitted on an array of string categories. The test
    first verifies the stored ``intervals``, then verifies that reverse
    transforming the numeric array yields a ``pandas.Series`` equal to the
    categories used for fitting.
    """
    # Setup
    categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
    encoded = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
    transformer = CategoricalTransformer()

    # Run
    transformer.fit(categories)
    decoded = transformer.reverse_transform(encoded)

    # Asserts
    foo_interval = (0, 0.5, 0.25, 0.5 / 6)
    bar_interval = (0.5, 0.8333333333333333, 0.6666666666666666, 0.05555555555555555)
    tar_interval = (
        0.8333333333333333,
        0.9999999999999999,
        0.9166666666666666,
        0.027777777777777776,
    )
    assert transformer.intervals == {
        'foo': foo_interval,
        'bar': bar_interval,
        'tar': tar_interval,
    }

    pd.testing.assert_series_equal(decoded, pd.Series(categories))