Beispiel #1
0
    def test_fit_series_no_anonymize(self):
        """Fit on a ``pandas.Series`` while ``anonymize`` is unset.

        ``CategoricalTransformer.fit`` is invoked unbound on a ``Mock`` so
        that only its calls to ``_anonymize`` and ``_get_intervals`` are
        observed.
        """
        # Setup
        categories = ['bar', 'foo', 'foo', 'tar']
        series = pd.Series(categories)

        instance = Mock()
        instance.anonymize = None

        # Run
        CategoricalTransformer.fit(instance, series)

        # Asserts
        self.assertEqual(
            instance._anonymize.call_count,
            0,
            "Anonymize must be called only when anonymize is something"
        )
        self.assertEqual(
            instance._get_intervals.call_count,
            1,
            "Get intervals will be called always in fit"
        )

        # First positional argument of the most recent _get_intervals call.
        forwarded = instance._get_intervals.call_args[0][0]
        pd.testing.assert_series_equal(forwarded, pd.Series(categories))
Beispiel #2
0
    def test_fit_series_anonymize(self):
        """Fit on a ``pandas.Series`` while ``anonymize`` is set.

        ``CategoricalTransformer.fit`` is invoked unbound on a ``Mock`` whose
        ``_anonymize`` returns a prepared series; the test then inspects the
        calls made to ``_anonymize`` and ``_get_intervals``.
        """
        # Setup
        categories = ['bar', 'foo', 'foo', 'tar']
        series = pd.Series(categories)
        # NOTE(review): the anonymized fixture holds the same values as the
        # input, so the final assertion cannot tell which of the two series
        # was handed to _get_intervals -- consider using distinct values.
        anonymized = pd.Series(['bar', 'foo', 'foo', 'tar'])

        instance = Mock()
        instance.anonymize = 'email'
        instance._anonymize.return_value = anonymized

        # Run
        CategoricalTransformer.fit(instance, series)

        # Asserts
        self.assertEqual(
            instance._anonymize.call_count,
            1,
            "Anonymize must be called only once"
        )
        self.assertEqual(
            instance._get_intervals.call_count,
            1,
            "Get intervals will be called always in fit"
        )

        # First positional argument of the most recent _get_intervals call.
        forwarded = instance._get_intervals.call_args[0][0]
        pd.testing.assert_series_equal(forwarded, pd.Series(categories))
Beispiel #3
0
def test_categoricaltransformer_mixed_more_rows(psutil_mock):
    """Round-trip mixed-type data through a ``CategoricalTransformer``.

    The transformer is fitted on a 4-row series of mixed types and then
    transforms / reverse-transforms a 6-row series built from the same
    values. The reverse-transformed series is expected to equal the series
    that was transformed.

    ``psutil_mock`` is configured so that the object it returns reports
    ``available == 1``.
    NOTE(review): presumably this is a patch of ``psutil.virtual_memory``
    injected by a decorator outside this view, simulating low memory --
    confirm against the decorator.

    Input:
        - 4 rows of mixed data for fitting
        - 6 rows of mixed data for transforming
    Output:
        - The reverse transformed data, equal to the 6-row input
    """
    # setup
    psutil_mock.return_value = Mock(available=1)

    fit_values = pd.Series([True, 'a', 1, None])
    to_transform = pd.Series(['a', 1, None, 'a', True, 1])
    instance = CategoricalTransformer()

    # run
    instance.fit(fit_values)
    round_tripped = instance.reverse_transform(instance.transform(to_transform))

    # assert
    pd.testing.assert_series_equal(to_transform, round_tripped)
Beispiel #4
0
def test_categorical_numerical_nans():
    """Round-trip a numeric series containing NaNs through the transformer.

    The same series is used to fit, transform and reverse-transform; the
    result must equal the original series.
    """
    values = pd.Series([1, 2, float('nan'), np.nan])
    instance = CategoricalTransformer()

    instance.fit(values)
    round_tripped = instance.reverse_transform(instance.transform(values))

    pd.testing.assert_series_equal(round_tripped, values)
Beispiel #5
0
    def test_fit(self):
        """Fit on a ``numpy`` array of strings and check ``intervals``.

        After fitting, ``transformer.intervals`` must equal the expected
        mapping from each category to its 4-tuple of floats.
        """
        # Setup
        samples = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
        instance = CategoricalTransformer()

        # Run
        instance.fit(samples)

        # Asserts
        foo_interval = (0, 0.5, 0.25, 0.5 / 6)
        bar_interval = (
            0.5,
            0.8333333333333333,
            0.6666666666666666,
            0.05555555555555555,
        )
        tar_interval = (
            0.8333333333333333,
            0.9999999999999999,
            0.9166666666666666,
            0.027777777777777776,
        )
        expected = {
            'foo': foo_interval,
            'bar': bar_interval,
            'tar': tar_interval,
        }
        assert instance.intervals == expected
Beispiel #6
0
def test_categoricaltransformer_pickle_nans():
    """Check a fitted transformer survives a pickle round trip with NaNs.

    The transformer is fitted on a numeric series containing NaNs, dumped to
    an in-memory buffer and loaded back. The reloaded transformer must
    produce the same transformed output as the original one.
    """
    # setup
    values = pd.Series([1, 2, float('nan'), np.nan])

    original = CategoricalTransformer()
    original.fit(values)
    expected = original.transform(values)

    # serialize into an in-memory buffer, then rewind it for reading
    buffer = BytesIO()
    pickle.dump(original, buffer)
    buffer.seek(0)

    # run
    restored = pickle.load(buffer)

    # assert
    actual = restored.transform(values)
    np.testing.assert_array_equal(actual, expected)
Beispiel #7
0
    def test_reverse_transform_array(self):
        """Reverse-transform a ``numpy`` array after fitting on an array.

        Checks both the ``intervals`` computed by ``fit`` and that
        ``reverse_transform`` returns a ``pandas.Series`` equal to the
        categories the transformer was fitted on.
        """
        # Setup
        categories = np.array(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
        encoded = np.array([-0.6, 0.5, 0.6, 0.2, 0.1, -0.2])
        instance = CategoricalTransformer()

        # Run
        instance.fit(categories)
        decoded = instance.reverse_transform(encoded)

        # Asserts
        foo_interval = (0, 0.5, 0.25, 0.5 / 6)
        bar_interval = (
            0.5,
            0.8333333333333333,
            0.6666666666666666,
            0.05555555555555555,
        )
        tar_interval = (
            0.8333333333333333,
            0.9999999999999999,
            0.9166666666666666,
            0.027777777777777776,
        )
        expected_intervals = {
            'foo': foo_interval,
            'bar': bar_interval,
            'tar': tar_interval,
        }
        assert instance.intervals == expected_intervals

        pd.testing.assert_series_equal(decoded, pd.Series(categories))