Example 1
def test_hypertransformer_with_transformers(faker_mock):
    faker_mock.return_value.first_name.side_effect = [
        'Jaime', 'Cersei', 'Tywin', 'Tyrion'
    ]
    data = get_input_data()
    transformers = get_transformers()

    ht = HyperTransformer(transformers)
    ht.fit(data)
    transformed = ht.transform(data)

    expected = get_transformed_data()

    np.testing.assert_allclose(
        transformed.sort_index(axis=1).values,
        expected.sort_index(axis=1).values)

    reversed_data = ht.reverse_transform(transformed)

    original_names = data.pop('names')
    reversed_names = reversed_data.pop('names')

    pd.testing.assert_frame_equal(data.sort_index(axis=1),
                                  reversed_data.sort_index(axis=1))

    for name in original_names:
        assert name not in reversed_names
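The helpers used above (get_input_data, get_transformers, get_transformed_data) are defined elsewhere in the RDT test suite and are not shown here. Purely as an illustration, and assuming the pre-1.0 RDT API in which HyperTransformer accepts a dict mapping column names to transformer instances (the same style as Example 6 below), get_transformers might return something along these lines; the column names and transformer choices are hypothetical, not the actual helper:

from rdt.transformers import (
    BooleanTransformer, CategoricalTransformer, DatetimeTransformer, NumericalTransformer)


def get_transformers():
    # Hypothetical mapping of column names to RDT transformer instances.
    return {
        'integer': NumericalTransformer(dtype=int),
        'float': NumericalTransformer(dtype=float),
        'bool': BooleanTransformer(),
        'datetime': DatetimeTransformer(),
        'names': CategoricalTransformer(anonymize='first_name'),
    }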
Example 2
    def compute(cls, real_data, synthetic_data, metadata=None):
        """Compute this metric.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        transformer = HyperTransformer()
        fields = cls._select_fields(metadata, cls.field_types)
        real_data = transformer.fit_transform(real_data[fields])
        synthetic_data = transformer.transform(synthetic_data[fields])

        values = []
        for column_name, real_column in real_data.items():
            real_column = real_column.values
            synthetic_column = synthetic_data[column_name].values

            score = cls.single_column_metric.compute(real_column,
                                                     synthetic_column)
            values.append(score)

        return np.nanmean(values)
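The key move above is that the HyperTransformer is fit once on the real columns and then reused on the synthetic columns, so both tables come out numeric and column-aligned, and the per-column scores can be averaged with np.nanmean (NaN scores are ignored). A minimal, self-contained sketch of that pattern with made-up data, assuming the pre-1.0 rdt.HyperTransformer API:

import numpy as np
import pandas as pd
from rdt import HyperTransformer

real = pd.DataFrame({'age': [23, 35, 41], 'city': ['NY', 'SF', 'NY']})
synthetic = pd.DataFrame({'age': [30, 28, 44], 'city': ['SF', 'NY', 'SF']})

ht = HyperTransformer()
real_numeric = ht.fit_transform(real)        # learn the transformers on the real data
synthetic_numeric = ht.transform(synthetic)  # reuse them so the columns line up

# np.nanmean ignores columns whose score could not be computed.
print(np.nanmean([0.75, np.nan, 0.25]))      # 0.5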
Example 3
    def _fit(self, table_data):
        """Fit the model to the table.

        Args:
            table_data (pandas.DataFrame):
                Data to be learned.
        """
        distributions = self._field_distributions
        fields = self._metadata.get_fields()

        transformers = {}
        for field in table_data:
            field_name = field.replace('.value', '')

            if field_name in fields and fields.get(
                    field_name,
                    dict(),
            ).get('type') != 'categorical':
                transformers[field] = GaussianCopulaTransformer(
                    distribution=distributions.get(field_name,
                                                   self._default_distribution))

        self._ht = HyperTransformer(field_transformers=transformers)
        table_data = self._ht.fit_transform(table_data)

        super()._fit(table_data)
Example 4
    def test__analyze(self):
        """Test _analyze"""
        # Setup
        hp = HyperTransformer(dtype_transformers={'O': 'one_hot_encoding'})

        # Run
        data = pd.DataFrame({
            'int': [1, 2, None],
            'float': [1.0, 2.0, None],
            'object': ['foo', 'bar', None],
            'category': [1, 2, None],
            'bool': [True, False, None],
            'datetime': pd.to_datetime(['1965-05-23', None, '1997-10-17']),
        })
        data['category'] = data['category'].astype('category')
        result = hp._analyze(data)

        # Asserts
        assert isinstance(result, dict)
        assert set(result.keys()) == {'int', 'float', 'object', 'category', 'bool', 'datetime'}

        assert isinstance(result['int'], NumericalTransformer)
        assert isinstance(result['float'], NumericalTransformer)
        assert isinstance(result['object'], OneHotEncodingTransformer)
        assert isinstance(result['category'], OneHotEncodingTransformer)
        assert isinstance(result['bool'], BooleanTransformer)
        assert isinstance(result['datetime'], DatetimeTransformer)
Example 5
    def test___init__(self):
        """Test create new instance of HyperTransformer"""
        # Run
        ht = HyperTransformer()

        # Asserts
        self.assertTrue(ht.copy)
        self.assertEqual(ht.dtypes, None)
Example 6
def test_single_category():
    ht = HyperTransformer(transformers={'a': OneHotEncodingTransformer()})
    data = pd.DataFrame({'a': ['a', 'a', 'a']})

    ht.fit(data)
    transformed = ht.transform(data)

    reverse = ht.reverse_transform(transformed)

    pd.testing.assert_frame_equal(data, reverse)
Example 7
def test_dtype_category():
    df = pd.DataFrame({'a': ['a', 'b', 'c']}, dtype='category')

    ht = HyperTransformer()
    ht.fit(df)

    trans = ht.transform(df)

    rever = ht.reverse_transform(trans)

    pd.testing.assert_frame_equal(df, rever)
Example 8
    def test__analyze_invalid_dtype(self):
        """Test _analyze when a list of dtypes containing an invalid dtype is passed."""
        # Setup
        hp = HyperTransformer(dtypes=['int', 'complex'])

        # Run
        data = pd.DataFrame({
            'int': [1, 2, None],
            'complex': [1.0 + 0j, 2.0 + 1j, None],
        })
        with pytest.raises(ValueError):
            hp._analyze(data)
Example 9
def test_empty_transformers_nan_data():
    """If transformers is an empty dict, do nothing."""
    data = get_input_data_with_nan()

    ht = HyperTransformer(transformers={})
    ht.fit(data)

    transformed = ht.transform(data)
    reverse = ht.reverse_transform(transformed)

    pd.testing.assert_frame_equal(data, transformed)
    pd.testing.assert_frame_equal(data, reverse)
Example 10
    def compute(cls, real_data, synthetic_data, metadata=None):
        """Compute this metric.

        This builds a Machine Learning Classifier that learns to tell the synthetic
        data apart from the real data; the classifier is then evaluated using Cross Validation.

        The output of the metric is one minus the average ROC AUC score obtained.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.

        Returns:
            float:
                One minus the ROC AUC Cross Validation Score obtained by the classifier.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
        transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': OneHotEncodingTransformer(
                    error_on_unknown=False),
            })
        real_data = transformer.fit_transform(real_data).to_numpy()
        synthetic_data = transformer.transform(synthetic_data).to_numpy()

        X = np.concatenate([real_data, synthetic_data])
        y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
        if np.isin(X, [np.inf, -np.inf]).any():
            X[np.isin(X, [np.inf, -np.inf])] = np.nan

        try:
            scores = []
            kf = StratifiedKFold(n_splits=3, shuffle=True)
            for train_index, test_index in kf.split(X, y):
                y_pred = cls._fit_predict(X[train_index], y[train_index],
                                          X[test_index])
                roc_auc = roc_auc_score(y[test_index], y_pred)

                scores.append(max(0.5, roc_auc) * 2 - 1)

            return 1 - np.mean(scores)
        except ValueError as err:
            raise IncomputableMetricError(
                f'DetectionMetric: Unable to be fit with error {err}')
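cls._fit_predict is left to the concrete metric class and is not shown here. A hedged sketch of what such a classifier could look like, written as a plain function for readability; the imputation step matters because the infinities above are replaced with NaN, and this specific pipeline is an assumption for illustration, not the actual SDMetrics implementation:

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler


def _fit_predict(X_train, y_train, X_test):
    """Fit a classifier on the train split and score the test split."""
    model = Pipeline([
        ('imputer', SimpleImputer()),          # fills the NaN cells created above
        ('scaler', RobustScaler()),
        ('classifier', LogisticRegression()),
    ])
    model.fit(X_train, y_train)
    return model.predict_proba(X_test)[:, 1]  # probability of the "real" class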
Example 11
def test_subset_of_columns_nan_data():
    """HyperTransform should be able to transform a subset of the training columns.

    See https://github.com/sdv-dev/RDT/issues/152
    """
    data = get_input_data_with_nan()

    ht = HyperTransformer()
    ht.fit(data)

    subset = data[[data.columns[0]]]
    transformed = ht.transform(subset)
    reverse = ht.reverse_transform(transformed)

    pd.testing.assert_frame_equal(subset, reverse)
Example 12
    def _load_hyper_transformer(self, table_name):
        """Create and return a new ``rdt.HyperTransformer`` instance for a table.

        First get the ``dtypes`` and ``pii fields`` from a given table, then use
        those to build a transformer dictionary to be used by the ``HyperTransformer``.

        Args:
            table_name (str):
                Table name for which to load the HyperTransformer.

        Returns:
            rdt.HyperTransformer:
                Instance of ``rdt.HyperTransformer`` for the given table.
        """
        dtypes = self.get_dtypes(table_name)
        pii_fields = self._get_pii_fields(table_name)
        transformers_dict = self._get_transformers(dtypes, pii_fields)
        return HyperTransformer(transformers=transformers_dict)
Example 13
    def fit(self, table_data):
        """Fit the ``ColumnsModel``.

        Fit a ``GaussianMultivariate`` model with ``GaussianUnivariate`` marginals to the
        ``self.constraint_columns`` columns in the ``table_data`` in order to sample those
        columns when missing.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        data_to_model = table_data[self.constraint_columns]
        self._hyper_transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': 'OneHotEncodingTransformer'
            })
        transformed_data = self._hyper_transformer.fit_transform(data_to_model)
        self._model = GaussianMultivariate(distribution=GaussianUnivariate)
        self._model.fit(transformed_data)
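For context, a rough sketch (an assumption, not the actual ColumnsModel code) of how the two fitted pieces could later be combined to sample the constraint columns: draw rows from the GaussianMultivariate in the transformed space, then map them back through the same HyperTransformer. The method name and its num_rows parameter are hypothetical:

    def sample_constraint_columns(self, num_rows):
        # Hypothetical helper: sample in transformed space, then reverse the transform.
        transformed_rows = self._model.sample(num_rows)
        return self._hyper_transformer.reverse_transform(transformed_rows)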
Example 14
    def _fit(self, table_data):
        """Fit the model to the table.

        Args:
            table_data (pandas.DataFrame):
                Data to be learned.
        """
        distributions = self._field_distributions
        default = self._default_distribution
        fields = self._metadata.get_fields()
        transformers = {
            field: GaussianCopulaTransformer(
                distribution=distributions.get(field, default))
            for field in table_data.columns
            if fields.get(field, dict()).get('type') != 'categorical'
        }
        self._ht = HyperTransformer(transformers=transformers)
        table_data = self._ht.fit_transform(table_data)

        super()._fit(table_data)
Example 15
    def _compute_auroc(self, real_table, synthetic_table):
        transformer = HyperTransformer()
        real_table = transformer.fit_transform(real_table).values
        synthetic_table = transformer.transform(synthetic_table).values

        X = np.concatenate([real_table, synthetic_table])
        y = np.hstack([np.ones(len(real_table)), np.zeros(len(synthetic_table))])
        X[np.isnan(X)] = 0.0

        if len(X) < 20:
            warnings.warn("Not enough data, skipping the detection tests.")

        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True)
        for train_index, test_index in kf.split(X, y):
            self.fit(X[train_index], y[train_index])
            y_pred = self.predict_proba(X[test_index])
            auroc = roc_auc_score(y[test_index], y_pred)
            if auroc < 0.5:
                auroc = 1.0 - auroc
            scores.append(auroc)
        return np.mean(scores)
Example 16
def test_hypertransformer_without_transformers_nan_data():
    data = get_input_data_with_nan()

    ht = HyperTransformer()
    ht.fit(data)
    transformed = ht.transform(data)

    expected = get_transformed_nan_data()

    np.testing.assert_allclose(
        transformed.sort_index(axis=1).values,
        expected.sort_index(axis=1).values)

    reversed_data = ht.reverse_transform(transformed)

    original_names = data.pop('names')
    reversed_names = reversed_data.pop('names')

    pd.testing.assert_frame_equal(data.sort_index(axis=1),
                                  reversed_data.sort_index(axis=1))

    for name in original_names:
        assert name not in reversed_names
Example 17
    def fit(self, table_data):
        """Fit ``Constraint`` class to data.

        If ``fit_columns_model`` is True, then this method will fit
        a ``GaussianCopula`` model to the relevant columns in ``table_data``.
        Subclasses can override this method, or override the ``_fit`` method
        if they do not need the model to handle conditional sampling.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        self._fit(table_data)

        if self.fit_columns_model and len(self.constraint_columns) > 1:
            data_to_model = table_data[list(self.constraint_columns)]
            self._hyper_transformer = HyperTransformer(dtype_transformers={
                'O': 'one_hot_encoding',
            })
            transformed_data = self._hyper_transformer.fit_transform(
                data_to_model)
            self._columns_model = GaussianMultivariate(
                distribution=GaussianUnivariate)
            self._columns_model.fit(transformed_data)
Example 18
    def compute(cls, real_data, synthetic_data, metadata=None, dtypes=None):
        """Compute this metric.

        This builds a Machine Learning Classifier that learns to tell the synthetic
        data apart from the real data; the classifier is then evaluated using Cross Validation.

        The output of the metric is one minus the average ROC AUC score obtained.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.

        Returns:
            float:
                One minus the ROC AUC Cross Validation Score obtained by the classifier.
        """
        metadata = cls._validate_inputs(real_data, synthetic_data, metadata)

        transformer = HyperTransformer(
            dtype_transformers={'O': 'one_hot_encoding'}, dtypes=dtypes)
        real_data = transformer.fit_transform(real_data).values
        synthetic_data = transformer.transform(synthetic_data).values

        X = np.concatenate([real_data, synthetic_data])
        y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
        if np.isin(X, [np.inf, -np.inf]).any():
            X[np.isin(X, [np.inf, -np.inf])] = np.nan

        try:
            scores = []
            kf = StratifiedKFold(n_splits=3, shuffle=True)
            for train_index, test_index in kf.split(X, y):
                y_pred, clf = cls._fit_predict(X[train_index], y[train_index],
                                               X[test_index])
                roc_auc = roc_auc_score(y[test_index], y_pred)
                scores.append(max(0.5, roc_auc) * 2 - 1)

            plot = False
            if plot:
                fpr, tpr, _ = roc_curve(y[test_index], y_pred)
                dummy_fpr = np.linspace(0, 1)
                dummy_tpr = np.linspace(0, 1)
                # plot the roc curve for the model
                plt.plot(dummy_fpr,
                         dummy_tpr,
                         linestyle='--',
                         label="Random Classifier")
                plt.plot(fpr, tpr, marker=',', label='ROC-curve')
                plt.fill_between(dummy_tpr, tpr)
                # axis labels
                plt.title("ROC-Curve Churn")
                plt.xlabel('False-Positive Rate')
                plt.ylabel('True-Positive Rate')
                # show the legend
                plt.legend()
                # show the plot
                plt.show()

            return 1 - np.mean(scores)

        except ValueError as err:
            LOGGER.info('DetectionMetric: Skipping due to %s', err)
            return np.nan