コード例 #1
0
ファイル: hyper_transformer.py プロジェクト: pythiac/RDT
    def _analyze(self, data):
        """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

        When ``self.dtypes`` is empty or ``None``, the dtype of each column is inferred
        from its non-null values.

        When ``dtype`` is:
            - ``int``: a ``NumericalTransformer`` is created with ``dtype=int``.
            - ``float``: a ``NumericalTransformer`` is created with ``dtype=float``.
            - ``object`` or ``category``: a ``CategoricalTransformer`` is created.
            - ``bool``: a ``BooleanTransformer`` is created.
            - ``datetime``: a ``DatetimeTransformer`` is created.

        Any other ``dtype`` is not supported and raises a ``ValueError``.

        Args:
            data (pandas.DataFrame):
                Data used to analyze the ``pandas.DataFrame`` dtypes.

        Returns:
            dict:
                Mapping of column names and transformer instances.

        Raises:
            ValueError:
                if a ``dtype`` is not supported by the ``HyperTransformer``.
            TypeError:
                if a ``dtype`` cannot be interpreted by numpy and is not ``category``.
        """
        if self.dtypes:
            dtypes = self.dtypes
        else:
            # Nulls are dropped first so that a column is not forced to ``object``
            # or ``float`` just because it has missing values.
            dtypes = [
                data[column].dropna().infer_objects().dtype
                for column in data.columns
            ]

        transformers = dict()
        for name, dtype in zip(data.columns, dtypes):
            try:
                dtype = np.dtype(dtype)
                kind = dtype.kind
            except TypeError:
                # numpy has no equivalent of the pandas ``category`` dtype; handle it
                # as a categorical column and let any other failure propagate.
                if getattr(dtype, 'name', dtype) != 'category':
                    raise

                kind = 'O'

            if kind == 'i':
                transformer = NumericalTransformer(dtype=int)
            elif kind == 'f':
                transformer = NumericalTransformer(dtype=float)
            elif kind == 'O':
                anonymize = self.anonymize.get(name)
                transformer = CategoricalTransformer(anonymize=anonymize)
            elif kind == 'b':
                transformer = BooleanTransformer()
            elif kind == 'M':
                transformer = DatetimeTransformer()
            else:
                raise ValueError('Unsupported dtype: {}'.format(dtype))

            transformers[name] = transformer

        return transformers
コード例 #2
0
    def test_transform_series(self):
        """Check ``transform`` on a ``pandas.Series`` calls the null transformer once."""
        # Setup
        series = pd.Series([1.5, None, 2.5])
        instance = Mock()

        # Run
        NumericalTransformer.transform(instance, series)

        # Asserts
        null_transform = instance.null_transformer.transform
        self.assertEqual(
            null_transform.call_count, 1, "Transform must be called only once")
コード例 #3
0
    def test_transform_array(self):
        """Check ``transform`` on a ``numpy.array`` calls the null transformer once."""
        # Setup
        values = np.array([1.5, None, 2.5])
        instance = Mock()

        # Run
        NumericalTransformer.transform(instance, values)

        # Asserts
        null_transform = instance.null_transformer.transform
        self.assertEqual(
            null_transform.call_count, 1, "Transform must be called only once")
コード例 #4
0
    def test_fit_nan_ignore_array(self):
        """Test fit with ``nan=None`` on a ``numpy.array``.

        The fitted null transformer must have no fill value and the transformer
        must keep the ``float`` dtype it was created with.
        """
        # Setup
        data = np.array([1.5, None, 2.5])

        # Run
        # The builtin ``float`` is used instead of the ``np.float`` alias, which
        # was removed from NumPy; both names referred to the same type.
        transformer = NumericalTransformer(dtype=float, nan=None)
        transformer.fit(data)

        # Asserts
        self.assertIsNone(transformer.null_transformer.fill_value,
                          "fill_value must be None when nan is None")

        self.assertEqual(transformer._dtype, float,
                         "Expected dtype: float")
コード例 #5
0
    def test_fit_nan_mode_series(self):
        """Test fit with ``nan='mode'`` on a ``pandas.Series``.

        The fitted null transformer is expected to use ``1.5`` as fill value and
        the transformer must keep the ``float`` dtype it was created with.
        """
        # Setup
        data = pd.Series([1.5, None, 2.5])

        # Run
        # The builtin ``float`` is used instead of the ``np.float`` alias, which
        # was removed from NumPy; both names referred to the same type.
        transformer = NumericalTransformer(dtype=float, nan='mode')
        transformer.fit(data)

        # Asserts
        expect_fill_value = 1.5

        self.assertEqual(transformer.null_transformer.fill_value,
                         expect_fill_value, "Data mode is wrong")

        self.assertEqual(transformer._dtype, float,
                         "Expected dtype: float")
コード例 #6
0
    def test___init__(self):
        """Check the attributes of a ``NumericalTransformer`` built without arguments."""
        # Run
        instance = NumericalTransformer()

        # Asserts
        self.assertEqual(instance.nan, 'mean', "Unexpected nan")
        self.assertIsNone(
            instance.null_column, "null_column is None by default")
        self.assertIsNone(
            instance.dtype, "dtype is None by default")
コード例 #7
0
    def test_reverse_transform_nan_not_ignore(self):
        """Test reverse_transform when ``nan`` is set (here ``'mean'``).

        The null transformer's ``reverse_transform`` must be invoked exactly once.
        """
        # Setup
        data = pd.Series([1.5, 2.0, 2.5])
        reversed_data = pd.Series([1.5, 2.0, 2.5])

        # Run
        transformer = Mock()
        transformer.nan = 'mean'
        # The builtin ``float`` replaces the ``np.float`` alias, which was
        # removed from NumPy; both names referred to the same type.
        transformer._dtype = float
        transformer.null_transformer.nulls = False
        transformer.null_transformer.reverse_transform.return_value = reversed_data

        NumericalTransformer.reverse_transform(transformer, data)

        # Asserts
        expected_reverse_transform_call_count = 1

        self.assertEqual(
            transformer.null_transformer.reverse_transform.call_count,
            expected_reverse_transform_call_count,
            "NullTransformer.reverse_transform must be called exactly once")
コード例 #8
0
    def test_reverse_transform_dtype_int(self, numpy_mock):
        """Test reverse_transform with dtype equal to int and ``nan=None``.

        Args:
            numpy_mock (Mock):
                Mock injected by the test harness; presumably supplied by a
                ``patch`` decorator that is not visible here -- confirm against
                the enclosing test module.
        """
        # Setup
        numpy_mock.return_value = pd.Series([3, 2, 3])
        data = pd.Series([3.0, 2.0, 3.0])

        # Run
        transformer = Mock()
        transformer.nan = None
        # The builtin ``int`` replaces the ``np.int`` alias, which was removed
        # from NumPy; both names referred to the same type.
        transformer._dtype = int

        result = NumericalTransformer.reverse_transform(transformer, data)

        # Asserts
        expect = pd.Series([3.0, 2.0, 3.0])
        expected_reverse_transform_call_count = 0

        pd.testing.assert_series_equal(result, expect)
        self.assertEqual(
            transformer.null_transformer.reverse_transform.call_count,
            expected_reverse_transform_call_count,
            "NullTransformer.reverse_transform must not be called when nan is None")
コード例 #9
0
    def test_reverse_transform_nan_ignore(self):
        """Test reverse_transform with ``nan=None`` (nulls are ignored).

        The data must come back unchanged and the null transformer's
        ``reverse_transform`` must not be invoked.
        """
        # Setup
        data = pd.Series([1.5, None, 2.5])

        # Run
        transformer = Mock()
        transformer.nan = None
        # The builtin ``float`` replaces the ``np.float`` alias, which was
        # removed from NumPy; both names referred to the same type.
        transformer._dtype = float

        result = NumericalTransformer.reverse_transform(transformer, data)

        # Asserts
        expect = pd.Series([1.5, None, 2.5])
        expected_reverse_transform_call_count = 0

        pd.testing.assert_series_equal(result, expect)
        self.assertEqual(
            transformer.null_transformer.reverse_transform.call_count,
            expected_reverse_transform_call_count,
            "NullTransformer.reverse_transform can't be called when nan is ignore"
        )
コード例 #10
0
class HyperTransformer:
    """HyperTransformer class.

    The ``HyperTransformer`` class contains a collection of ``transformers`` that can be
    used to transform and reverse transform one or more columns at once.

    Args:
        transformers (dict or None):
            dict associating column names with transformers, which can be either passed
            directly as an instance or as a dict specification. If ``None``, a simple
            ``transformers`` dict is built automatically from the data.
        copy (bool):
            Whether to make a copy of the input data or not. Defaults to ``True``.
        anonymize (dict or None):
            Dictionary specifying the names and ``faker`` categories of the categorical
            columns that need to be anonymized. Defaults to ``None``.
        dtypes (list or None):
            List of column data types to use when building the ``transformers`` dict
            automatically. If not passed, the dtypes are inferred from the non-null
            values of each column.
        dtype_transformers (dict or None):
            Transformer templates to use for each dtype. Passed as a dictionary of
            dtype kinds ('i', 'f', 'O', 'b', 'M') and transformer names, classes
            or instances. Entries given here override the defaults.

    Example:
        Create a simple ``HyperTransformer`` instance that will decide which transformers
        to use based on the fit data ``dtypes``.

        >>> ht = HyperTransformer()

        Create a ``HyperTransformer`` passing a list of dtypes.

        >>> ht = HyperTransformer(dtypes=[int, 'object', np.float64, 'datetime', 'bool'])

        Create a ``HyperTransformer`` passing a ``transformers`` dict.

        >>> transformers = {
        ...     'a': NumericalTransformer(dtype=float),
        ...     'b': {
        ...         'class': 'NumericalTransformer',
        ...         'kwargs': {
        ...             'dtype': int
        ...         }
        ...     }
        ... }
        >>> ht = HyperTransformer(transformers)
    """

    # Named templates. Entries are either transformer classes, which are
    # instantiated without arguments, or pre-configured instances, which are
    # deep-copied for every column so that no state is shared.
    _TRANSFORMER_TEMPLATES = {
        'numerical': NumericalTransformer,
        'integer': NumericalTransformer(dtype=int),
        'float': NumericalTransformer(dtype=float),
        'categorical': CategoricalTransformer,
        'categorical_fuzzy': CategoricalTransformer(fuzzy=True),
        'one_hot_encoding': OneHotEncodingTransformer(error_on_unknown=False),
        'label_encoding': LabelEncodingTransformer,
        'boolean': BooleanTransformer,
        'datetime': DatetimeTransformer,
    }

    # Default template name for each supported numpy dtype kind.
    _DTYPE_TRANSFORMERS = {
        'i': 'numerical',
        'f': 'numerical',
        'O': 'categorical',
        'b': 'boolean',
        'M': 'datetime',
    }

    def __init__(self,
                 transformers=None,
                 copy=True,
                 anonymize=None,
                 dtypes=None,
                 dtype_transformers=None):
        self.transformers = transformers
        self._transformers = dict()
        self.copy = copy
        self.anonymize = anonymize or dict()
        self.dtypes = dtypes

        # Work on a copy so that per-instance overrides never leak into the
        # class-level defaults.
        self.dtype_transformers = self._DTYPE_TRANSFORMERS.copy()
        if dtype_transformers:
            self.dtype_transformers.update(dtype_transformers)

    def _analyze(self, data):
        """Build a ``dict`` with column names and transformers from a given ``pandas.DataFrame``.

        When ``self.dtypes`` is empty or ``None``, the dtype of each column is inferred
        from its non-null values.

        For each column, the numpy ``kind`` of its dtype is looked up in
        ``self.dtype_transformers`` to obtain a transformer template. Dtypes that numpy
        cannot interpret are treated as kind ``'O'``. The template can be:
            - a ``str``: name of an entry in ``_TRANSFORMER_TEMPLATES``.
            - a transformer instance: a deep copy of it is used.
            - a transformer class: it is instantiated without arguments, except for
              ``CategoricalTransformer`` when ``self.anonymize`` is set, which is created
              with ``anonymize=self.anonymize`` and emits a ``DeprecationWarning``.

        Args:
            data (pandas.DataFrame):
                Data used to analyze the ``pandas.DataFrame`` dtypes.

        Returns:
            dict:
                Mapping of column names and transformer instances.

        Raises:
            ValueError:
                if a ``dtype`` is not supported by the ``HyperTransformer``.
        """
        transformers = dict()
        if self.dtypes:
            dtypes = self.dtypes
        else:
            # Nulls are dropped first so that a column is not forced to ``object``
            # or ``float`` just because it has missing values.
            dtypes = [
                data[column].dropna().infer_objects().dtype
                for column in data.columns
            ]

        for name, dtype in zip(data.columns, dtypes):
            try:
                kind = np.dtype(dtype).kind
            except TypeError:
                # probably category
                kind = 'O'

            # ``.get`` is used so that a kind without a configured template reaches
            # the documented ``ValueError`` below instead of raising ``KeyError``.
            transformer_template = self.dtype_transformers.get(kind)
            if not transformer_template:
                raise ValueError('Unsupported dtype: {}'.format(dtype))

            if isinstance(transformer_template, str):
                transformer_template = self._TRANSFORMER_TEMPLATES[
                    transformer_template]

            if not isinstance(transformer_template, type):
                # Template instances are shared; copy to keep columns independent.
                transformer = deepcopy(transformer_template)
            elif self.anonymize and transformer_template == CategoricalTransformer:
                warnings.warn(
                    'Categorical anonymization is deprecated and will be removed from RDT soon.',
                    DeprecationWarning)
                transformer = CategoricalTransformer(anonymize=self.anonymize)
            else:
                transformer = transformer_template()

            transformers[name] = transformer

        return transformers

    def fit(self, data):
        """Fit the transformers to the data.

        If ``self.transformers`` was given, the transformers are loaded from it;
        otherwise they are built from the data by ``_analyze``.

        Args:
            data (pandas.DataFrame):
                Data to fit the transformers to.
        """
        if self.transformers is not None:
            self._transformers = load_transformers(self.transformers)
        else:
            self._transformers = self._analyze(data)

        for column_name, transformer in self._transformers.items():
            column = data[column_name]
            transformer.fit(column)

    def transform(self, data):
        """Transform the data.

        If ``self.copy`` is ``True`` make a copy of the input data to avoid modifying it.

        Each transformed column is removed from the table and its transformed values
        are added back. When a transformer returns a 2-dimensional output, one column
        is added for each output column, named ``<column_name>#<index>``.

        The names of the transformed columns are stored in ``self.column_names``.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.column_names = []
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            column = data.pop(column_name)
            transformed = transformer.transform(column)
            self.column_names.append(column_name)

            shape = transformed.shape

            if len(shape) == 2:
                for index in range(shape[1]):
                    new_column = '{}#{}'.format(column_name, index)
                    data[new_column] = transformed[:, index]

            else:
                data[column_name] = transformed

        return data

    def fit_transform(self, data):
        """Fit the transformers to the data and then transform it.

        Args:
            data (pandas.DataFrame):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(data)
        return self.transform(data)

    @staticmethod
    def _get_columns(data, column_name):
        """Get one or more columns that match a given name.

        A column matches when its name is ``column_name``, optionally followed by a
        ``#<index>`` suffix. The matching columns are removed from ``data``.

        Args:
            data (pandas.DataFrame):
                Table to perform the matching.
            column_name (str):
                Name to match the columns.

        Returns:
            numpy.ndarray:
                values of the matching columns. 1-dimensional if only one column
                matches, otherwise the columns stacked side by side.

        Raises:
            ValueError:
                if no columns match.
        """
        # The name is escaped so that regex metacharacters in it are matched literally.
        regex = r'{}(#[0-9]+)?$'.format(re.escape(column_name))
        columns = data.columns[data.columns.str.match(regex)]
        if columns.empty:
            raise ValueError('No columns match_ {}'.format(column_name))

        values = [data.pop(column).values for column in columns]

        if len(values) == 1:
            return values[0]

        return np.column_stack(values)

    def reverse_transform(self, data):
        """Revert the transformations back to the original values.

        If ``self.copy`` is ``True`` make a copy of the input data to avoid modifying it.

        Args:
            data (pandas.DataFrame):
                Data to revert.

        Returns:
            pandas.DataFrame:
                reversed data.

        Raises:
            ValueError:
                if the data has no columns matching one of the fitted column names.
        """
        if self.copy:
            data = data.copy()

        for column_name, transformer in self._transformers.items():
            columns = self._get_columns(data, column_name)
            data[column_name] = transformer.reverse_transform(columns)

        return data