def test__reverse_transfrom_by_matrix(self, psutil_mock):
    """Check ``_reverse_transform_by_matrix`` on numerical categories.

    Expect the transformed values to be mapped back to the original data.

    Setup:
        A ``CategoricalTransformer`` is created and given ``means`` for
        4 categories plus the ``dtype`` of the original data.
        ``psutil_mock`` (the patched ``psutil.virtual_memory``) returns an
        object whose ``available`` value is intended to be large enough.

    Input:
        - transformed data with 4 rows

    Output:
        - the original data
    """
    # Setup
    original = pd.Series([1, 2, 3, 4])
    encoded = pd.Series([0.875, 0.625, 0.375, 0.125])

    transformer = CategoricalTransformer()
    transformer.means = pd.Series(
        [0.125, 0.375, 0.625, 0.875],
        index=[4, 3, 2, 1],
    )
    transformer.dtype = original.dtype

    memory_info = Mock(available=4 * 4 * 8 * 3 + 1)
    psutil_mock.return_value = memory_info

    # Run
    decoded = transformer._reverse_transform_by_matrix(encoded)

    # Assert
    pd.testing.assert_series_equal(original, decoded)
def test_categoricaltransformer_mixed_low_virtual_memory(psutil_mock):
    """Round-trip mixed type data through the transformer with low memory.

    The ``psutil_mock`` return value reports ``available = 1``. The
    transformer must still fit, transform and reverse transform the data,
    and the reverse transformed data must equal the input.

    Input:
        - 4 rows of mixed data

    Output:
        - The reverse transformed data
    """
    # setup
    data = pd.Series([True, 'a', 1, None])
    transformer = CategoricalTransformer()
    psutil_mock.return_value = Mock(available=1)

    # run
    transformed = transformer.fit_transform(data)
    reverse = transformer.reverse_transform(transformed)

    # assert
    pd.testing.assert_series_equal(data, reverse)
def test_fit_series_no_anonymize(self):
    """Fit on a ``pandas.Series`` with ``anonymize`` set to ``None``.

    ``_anonymize`` must not be called, while ``_get_intervals`` must be
    called exactly once with the unchanged input data.
    """
    # Setup
    data = pd.Series(['bar', 'foo', 'foo', 'tar'])

    # Run
    transformer = Mock(anonymize=None)
    CategoricalTransformer.fit(transformer, data)

    # Asserts
    self.assertEqual(
        transformer._anonymize.call_count,
        0,
        "Anonymize must be called only when anonymize is something"
    )
    self.assertEqual(
        transformer._get_intervals.call_count,
        1,
        "Get intervals will be called always in fit"
    )

    intervals_argument = transformer._get_intervals.call_args[0][0]
    pd.testing.assert_series_equal(
        intervals_argument,
        pd.Series(['bar', 'foo', 'foo', 'tar'])
    )
def test_fit_series_anonymize(self):
    """Fit on a ``pandas.Series`` with ``anonymize`` set to ``'email'``.

    ``_anonymize`` must be called once, and ``_get_intervals`` must be
    called once with a series equal to the mocked ``_anonymize`` output.
    """
    # Setup
    data = pd.Series(['bar', 'foo', 'foo', 'tar'])
    anonymized = pd.Series(['bar', 'foo', 'foo', 'tar'])

    # Run
    transformer = Mock(anonymize='email')
    transformer._anonymize.return_value = anonymized
    CategoricalTransformer.fit(transformer, data)

    # Asserts
    self.assertEqual(
        transformer._anonymize.call_count,
        1,
        "Anonymize must be called only once"
    )
    self.assertEqual(
        transformer._get_intervals.call_count,
        1,
        "Get intervals will be called always in fit"
    )

    intervals_argument = transformer._get_intervals.call_args[0][0]
    pd.testing.assert_series_equal(
        intervals_argument,
        pd.Series(['bar', 'foo', 'foo', 'tar'])
    )
def test__transform_by_row(self):
    """Check ``_transform_by_row`` on numerical data.

    Setup:
        A ``CategoricalTransformer`` is created and its ``intervals`` are
        set by hand for 4 categories.

    Input:
        - data with 4 rows

    Output:
        - the transformed data, expected to be
          ``[0.875, 0.625, 0.375, 0.125]``
    """
    # Setup
    categories = pd.Series([1, 2, 3, 4])
    transformer = CategoricalTransformer()
    transformer.intervals = {
        4: (0, 0.25, 0.125, 0.041666666666666664),
        3: (0.25, 0.5, 0.375, 0.041666666666666664),
        2: (0.5, 0.75, 0.625, 0.041666666666666664),
        1: (0.75, 1.0, 0.875, 0.041666666666666664),
    }

    # Run
    result = transformer._transform_by_row(categories)

    # Asserts
    matches = result == np.array([0.875, 0.625, 0.375, 0.125])
    assert matches.all()
def test__get_faker_anonymize_category_not_exist(self):
    """``_get_faker`` must raise ``ValueError`` for an unknown category."""
    # Setup
    transformer = Mock(anonymize='SuP3R-P1Th0N-P0w3R')

    # Run & assert
    with self.assertRaises(ValueError):
        CategoricalTransformer._get_faker(transformer)
def test__get_value_no_fuzzy(self):
    """Check ``_get_value`` when ``fuzzy`` is disabled.

    With the interval ``(0, 0.5, 0.25, 0.5 / 6)`` registered for ``'foo'``,
    the returned value must be exactly ``0.25`` (the third tuple entry).
    """
    # Setup
    transformer = CategoricalTransformer(fuzzy=False)
    transformer.fuzzy = False
    transformer.intervals = {'foo': (0, 0.5, 0.25, 0.5 / 6)}

    # Run
    value = transformer._get_value('foo')

    # Asserts
    assert value == 0.25
def test__normalize_no_clip(self):
    """Check ``_normalize`` on a transformer created with ``clip=False``.

    The expected output for ``[-0.43, 0.1234, 1.5, -1.31]`` is
    ``[0.57, 0.1234, 0.5, 0.69]``.
    """
    # Setup
    transformer = CategoricalTransformer(clip=False)
    values = pd.Series([-0.43, 0.1234, 1.5, -1.31])

    # Run
    normalized = transformer._normalize(values)

    # Asserts
    pd.testing.assert_series_equal(
        normalized,
        pd.Series([0.57, 0.1234, 0.5, 0.69], dtype=float)
    )
def test__get_value_fuzzy(self, rvs_mock):
    """Check ``_get_value`` when ``fuzzy`` is enabled.

    ``rvs_mock`` is configured to return ``0.2745`` and the value returned
    for ``'foo'`` must be that same number.
    """
    # Setup
    rvs_mock.return_value = 0.2745
    transformer = CategoricalTransformer(fuzzy=True)
    transformer.intervals = {'foo': (0, 0.5, 0.25, 0.5 / 6)}

    # Run
    value = transformer._get_value('foo')

    # Asserts
    assert value == 0.2745
def test___init__(self):
    """A transformer built without arguments has a falsy ``anonymize``."""
    # Run
    default_transformer = CategoricalTransformer()

    # Asserts
    self.assertFalse(
        default_transformer.anonymize,
        "Unexpected anonimyze default value"
    )
def test__transform_by_row_called(self):
    """Check that ``transform`` delegates to ``_transform_by_row``.

    When the number of rows is less than or equal to the number of
    categories, ``_transform_by_row`` is expected to be used.

    Setup:
        A mock stands in for the transformer, with ``means`` holding
        4 categories.

    Input:
        - data with 4 rows

    Output:
        - the output of ``_transform_by_row``

    Side effects:
        - ``_transform_by_row`` is called exactly once with the data
    """
    # Setup
    data = pd.Series([1, 2, 3, 4])
    transformer_mock = Mock(means=pd.Series([0.125, 0.375, 0.625, 0.875]))

    # Run
    transformed = CategoricalTransformer.transform(transformer_mock, data)

    # Asserts
    by_row = transformer_mock._transform_by_row
    by_row.assert_called_once_with(data)
    assert transformed == by_row.return_value
def test_fit(self):
    """Fit on an array of strings and check the resulting ``intervals``.

    The expected ``intervals`` entry for each of ``'foo'``, ``'bar'`` and
    ``'tar'`` is a 4-tuple, as spelled out below.
    """
    # Setup
    transformer = CategoricalTransformer()
    categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])

    # Run
    transformer.fit(categories)

    # Asserts
    assert transformer.intervals == {
        'foo': (0, 0.5, 0.25, 0.5 / 6),
        'bar': (
            0.5,
            0.8333333333333333,
            0.6666666666666666,
            0.05555555555555555
        ),
        'tar': (
            0.8333333333333333,
            0.9999999999999999,
            0.9166666666666666,
            0.027777777777777776
        ),
    }
def test_transform_array_no_anonymize(self, mock_maps):
    """Transform a ``numpy.array`` with ``anonymize`` set to ``None``.

    ``mock_maps`` must not be called at all in this case.
    """
    # Setup
    data = np.array(['bar', 'foo', 'foo', 'tar'])
    transformer = Mock(anonymize=None)

    # Run
    CategoricalTransformer.transform(transformer, data)

    # Asserts
    self.assertEqual(
        mock_maps.call_count,
        0,
        "Dont call to the map encoder when not anonymize"
    )
def _analyze(self, data):
    """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

    When ``self.dtypes`` is falsy (for example ``None``), the dtype of each
    column is inferred from its non-null values. Otherwise ``self.dtypes``
    is paired positionally with ``data.columns``.

    The transformer is chosen from the NumPy ``kind`` of each dtype:
        - ``'i'`` (signed integer): a ``NumericalTransformer`` is created
          with ``dtype=int``.
        - ``'f'`` (float): a ``NumericalTransformer`` is created with
          ``dtype=float``.
        - ``'O'`` (object): a ``CategoricalTransformer`` is created, using
          the ``anonymize`` setting registered for that column, if any.
        - ``'b'``: a ``BooleanTransformer`` is created.
        - ``'M'`` (datetime): a ``DatetimeTransformer`` is created.

    Any other ``dtype`` is not supported and raises a ``ValueError``.

    NOTE(review): earlier documentation also listed ``category`` as mapping
    to ``CategoricalTransformer``, but only kind ``'O'`` is matched here --
    confirm whether pandas categorical columns are converted by the caller.

    NOTE(review): NumPy's plain ``bool`` dtype appears to report kind
    ``'?'`` (``'b'`` is the signed-byte type code) -- confirm that boolean
    columns actually reach the ``BooleanTransformer`` branch.

    Args:
        data (pandas.DataFrame):
            Data used to analyze the ``pandas.DataFrame`` dtypes.

    Returns:
        dict:
            Mapping of column names and transformer instances.

    Raises:
        ValueError:
            if a ``dtype`` is not supported by the ``HyperTransformer``.
    """
    if self.dtypes:
        dtypes = self.dtypes
    else:
        # Infer from the non-null values only, and hand an actual dtype
        # (not the whole Series) to ``np.dtype`` below.
        dtypes = [
            data[column].dropna().infer_objects().dtype
            for column in data.columns
        ]

    transformers = {}
    for name, dtype in zip(data.columns, dtypes):
        dtype = np.dtype(dtype)
        kind = dtype.kind
        if kind == 'i':
            transformer = NumericalTransformer(dtype=int)
        elif kind == 'f':
            transformer = NumericalTransformer(dtype=float)
        elif kind == 'O':
            anonymize = self.anonymize.get(name)
            transformer = CategoricalTransformer(anonymize=anonymize)
        elif kind == 'b':
            transformer = BooleanTransformer()
        elif kind == 'M':
            transformer = DatetimeTransformer()
        else:
            raise ValueError('Unsupported dtype: {}'.format(dtype))

        transformers[name] = transformer

    return transformers
def test___init__(self):
    """Constructor arguments must be exposed as same-named attributes."""
    # Setup
    arguments = {
        'fuzzy': 'fuzzy_value',
        'clip': 'clip_value',
    }

    # Run
    transformer = CategoricalTransformer(**arguments)

    # Asserts
    assert transformer.fuzzy == 'fuzzy_value'
    assert transformer.clip == 'clip_value'
def test_categoricaltransformer_integers():
    """Round-trip integer data through the ``CategoricalTransformer``.

    The transformer must fit, transform and reverse transform the data,
    and the reverse transformed data must equal the input.

    Input:
        - 4 rows of int data

    Output:
        - The reverse transformed data
    """
    # setup
    data = pd.Series([1, 2, 3, 2])
    transformer = CategoricalTransformer()

    # run
    transformed = transformer.fit_transform(data)
    reverse = transformer.reverse_transform(transformed)

    # assert
    pd.testing.assert_series_equal(data, reverse)
def test_categoricaltransformer_mixed():
    """Round-trip mixed type data through the ``CategoricalTransformer``.

    The transformer must fit, transform and reverse transform the data,
    and the reverse transformed data must equal the input.

    Input:
        - 4 rows of mixed data

    Output:
        - The reverse transformed data
    """
    # setup
    data = pd.Series([True, 'a', 1, None])
    transformer = CategoricalTransformer()

    # run
    transformed = transformer.fit_transform(data)
    reverse = transformer.reverse_transform(transformed)

    # assert
    pd.testing.assert_series_equal(data, reverse)
def test_categoricaltransformer_pickle_nans():
    """A transformer fitted on data with nan values must survive pickling.

    The transformer is pickled into an in-memory buffer and loaded back;
    the loaded copy must transform the data to the same values as the
    original instance.
    """
    # setup
    data = pd.Series([1, 2, float('nan'), np.nan])
    transformer = CategoricalTransformer()
    transformer.fit(data)
    expected = transformer.transform(data)

    # serialize into an in-memory file, then rewind it for reading
    buffer = BytesIO()
    pickle.dump(transformer, buffer)
    buffer.seek(0)

    # run
    restored = pickle.load(buffer)

    # assert
    actual = restored.transform(data)
    np.testing.assert_array_equal(actual, expected)
def test__get_faker_anonymize_list_type(self):
    """Check ``_get_faker`` when ``anonymize`` is a two-element list.

    Calling the returned method must produce a ``str`` of length 16.
    """
    # Setup
    transformer = Mock(anonymize=['credit_card_number', 'visa'])

    # Run
    faker_method = CategoricalTransformer._get_faker(transformer)
    generated = faker_method()

    # Asserts
    assert isinstance(generated, str)
    assert len(generated) == 16
def test__normalize(self):
    """Check ``_normalize`` when called directly on the class.

    The expected output for ``[-0.43, 0.1234, 1.5, -1.31]`` is
    ``[0.43, 0.1234, 0.5, 0.31]``.
    """
    # Setup
    values = pd.Series([-0.43, 0.1234, 1.5, -1.31])

    # Run
    normalized = CategoricalTransformer._normalize(values)

    # Asserts
    pd.testing.assert_series_equal(
        normalized,
        pd.Series([0.43, 0.1234, 0.5, 0.31], dtype=float)
    )
def test__get_value(self, scipy_mock):
    """Check that a category is converted into a number between 0 and 1.

    With the interval ``(0, 0.5)`` registered for ``'foo'`` on a mocked
    transformer, the returned value must be ``0.25``.
    """
    # Setup
    transformer = Mock(intervals={'foo': (0, 0.5)})

    # Run
    value = CategoricalTransformer._get_value(transformer, 'foo')

    # Asserts
    assert value == 0.25
def test_categoricaltransformer_strings_2_categories():
    """Round-trip string data with two equally frequent categories.

    The transformer must fit, transform and reverse transform the data
    when both string categories have the same value counts, and the
    reverse transformed data must equal the input.

    Input:
        - 4 rows of string data

    Output:
        - The reverse transformed data
    """
    # setup
    data = pd.Series(['a', 'b', 'a', 'b'])
    transformer = CategoricalTransformer()

    # run
    transformed = transformer.fit_transform(data)
    reverse = transformer.reverse_transform(transformed)

    # assert
    pd.testing.assert_series_equal(data, reverse)
def test__reverse_transform_by_category(self, psutil_mock):
    """Check ``_reverse_transform_by_category`` on numerical categories.

    Expect the transformed values to be mapped back to the original data.

    Setup:
        A ``CategoricalTransformer`` is created and given ``means`` and
        ``intervals`` for 4 categories plus the ``dtype`` of the original
        data. ``psutil_mock`` (the patched ``psutil.virtual_memory``)
        returns an object whose ``available`` value is 1.

    Input:
        - transformed data with 5 rows

    Output:
        - the original data
    """
    # Setup
    original = pd.Series([1, 3, 3, 2, 1])
    encoded = pd.Series([0.875, 0.375, 0.375, 0.625, 0.875])

    transformer = CategoricalTransformer()
    transformer.means = pd.Series(
        [0.125, 0.375, 0.625, 0.875],
        index=[4, 3, 2, 1],
    )
    transformer.intervals = {
        4: (0, 0.25, 0.125, 0.041666666666666664),
        3: (0.25, 0.5, 0.375, 0.041666666666666664),
        2: (0.5, 0.75, 0.625, 0.041666666666666664),
        1: (0.75, 1.0, 0.875, 0.041666666666666664),
    }
    transformer.dtype = original.dtype

    psutil_mock.return_value = Mock(available=1)

    # Run
    decoded = transformer._reverse_transform_by_category(encoded)

    # Assert
    pd.testing.assert_series_equal(original, decoded)
def test__get_faker_anonymize_not_tuple_or_list(self):
    """Check ``_get_faker`` when ``anonymize`` is neither a tuple nor a list.

    With ``anonymize`` set to the string ``'email'``, the returned callable
    must be named ``faker``.
    """
    # Setup
    transformer = Mock(anonymize='email')

    # Run
    faker_function = CategoricalTransformer._get_faker(transformer)

    # Asserts
    self.assertEqual(
        faker_function.__name__,
        'faker',
        "Expected faker function"
    )
def test__get_faker_anonymize_list(self):
    """Check ``_get_faker`` when ``anonymize`` is a one-element list.

    With ``anonymize`` set to ``['email']``, the returned callable must be
    named ``faker``.
    """
    # Setup
    transformer = Mock(anonymize=['email'])

    # Run
    faker_function = CategoricalTransformer._get_faker(transformer)

    # Asserts
    self.assertEqual(
        faker_function.__name__,
        'faker',
        "Expected faker function"
    )
def test__get_intervals(self):
    """Check the intervals computed by ``_get_intervals``.

    The first element of the returned value must equal the expected
    mapping from each category to its 4-tuple.
    """
    # Setup
    categories = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])

    # Run
    result = CategoricalTransformer._get_intervals(categories)

    # Asserts
    intervals = result[0]
    assert intervals == {
        'foo': (0, 0.5, 0.25, 0.5 / 6),
        'bar': (
            0.5,
            0.8333333333333333,
            0.6666666666666666,
            0.05555555555555555
        ),
        'tar': (
            0.8333333333333333,
            0.9999999999999999,
            0.9166666666666666,
            0.027777777777777776
        ),
    }
def test_reverse_transform_array(self):
    """Reverse transform a ``numpy.array`` after fitting on string data.

    After fitting, ``intervals`` must match the expected mapping, and the
    reverse transformed values must equal the fitted data as a
    ``pandas.Series``.
    """
    # Setup
    categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
    encoded = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
    transformer = CategoricalTransformer()

    # Run
    transformer.fit(categories)
    decoded = transformer.reverse_transform(encoded)

    # Asserts
    assert transformer.intervals == {
        'foo': (0, 0.5, 0.25, 0.5 / 6),
        'bar': (
            0.5,
            0.8333333333333333,
            0.6666666666666666,
            0.05555555555555555
        ),
        'tar': (
            0.8333333333333333,
            0.9999999999999999,
            0.9166666666666666,
            0.027777777777777776
        ),
    }

    pd.testing.assert_series_equal(decoded, pd.Series(categories))
def test__get_intervals(self):
    """Check the category intervals computed by ``_get_intervals``.

    The returned value must equal the expected mapping from each category
    to its ``(start, end)`` pair.
    """
    # Setup
    categories = pd.Series(['bar', 'foo', 'foo', 'tar'])

    # Run
    intervals = CategoricalTransformer._get_intervals(categories)

    # Asserts
    assert intervals == {
        'foo': (0, 0.5),
        'tar': (0.5, 0.75),
        'bar': (0.75, 1),
    }
def test__get_faker_anonymize_tuple(self):
    """Check ``_get_faker`` when ``anonymize`` is a one-element tuple.

    With ``anonymize`` set to ``('email',)``, the returned callable must be
    named ``faker``.
    """
    # Setup
    transformer = Mock(anonymize=('email',))

    # Run
    faker_function = CategoricalTransformer._get_faker(transformer)

    # Asserts
    self.assertEqual(
        faker_function.__name__,
        'faker',
        "Expected faker function"
    )
def test_categorical_numerical_nans():
    """Round-trip a column that holds only numbers and nan values.

    After fit, transform and reverse transform, the result must equal the
    original series.
    """
    # setup
    original = pd.Series([1, 2, float('nan'), np.nan])
    transformer = CategoricalTransformer()

    # run
    transformer.fit(original)
    encoded = transformer.transform(original)
    decoded = transformer.reverse_transform(encoded)

    # assert
    pd.testing.assert_series_equal(decoded, original)