def test_fit_transform(self):
    """Test call fit_transform"""
    # Run
    transformer = Mock()

    HyperTransformer.fit_transform(transformer, pd.DataFrame())

    # Asserts
    expect_call_count_fit = 1
    expect_call_count_transform = 1
    expect_call_args_fit = pd.DataFrame()
    expect_call_args_transform = pd.DataFrame()

    self.assertEqual(transformer.fit.call_count, expect_call_count_fit)
    pd.testing.assert_frame_equal(
        transformer.fit.call_args[0][0], expect_call_args_fit)

    self.assertEqual(transformer.transform.call_count, expect_call_count_transform)
    pd.testing.assert_frame_equal(
        transformer.transform.call_args[0][0], expect_call_args_transform)
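# For reference, a minimal sketch of the method under test, assuming (as the
# mock assertions above imply) that ``HyperTransformer.fit_transform`` simply
# chains ``fit`` and ``transform`` on the same data. This illustrates the
# expected contract, not the library's actual implementation.
def fit_transform(self, data):
    """Fit the transformer to ``data`` and return the transformed data."""
    self.fit(data)
    return self.transform(data)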
def compute(cls, real_data, synthetic_data, metadata=None):
    """Compute this metric.

    Args:
        real_data (pandas.DataFrame):
            The values from the real dataset.
        synthetic_data (pandas.DataFrame):
            The values from the synthetic dataset.
        metadata (dict):
            Table metadata dict.

    Returns:
        Union[float, tuple[float]]:
            Metric output.
    """
    metadata = cls._validate_inputs(real_data, synthetic_data, metadata)

    transformer = HyperTransformer()
    fields = cls._select_fields(metadata, cls.field_types)

    real_data = transformer.fit_transform(real_data[fields])
    synthetic_data = transformer.transform(synthetic_data[fields])

    values = []
    for column_name, real_column in real_data.items():
        real_column = real_column.values
        synthetic_column = synthetic_data[column_name].values

        score = cls.single_column_metric.compute(real_column, synthetic_column)
        values.append(score)

    return np.nanmean(values)
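# Example usage of the classmethod above (a sketch; ``SingleColumnAggregate``
# is a hypothetical subclass that defines ``single_column_metric`` and
# ``field_types``, which this abstract base leaves to its children):
import pandas as pd

real = pd.DataFrame({'age': [23, 45, 31, 50], 'income': [40.0, 52.0, 61.0, 48.0]})
synthetic = pd.DataFrame({'age': [25, 40, 33, 49], 'income': [41.0, 55.0, 60.0, 47.0]})

# Both tables go through the same fitted HyperTransformer, every column is
# scored independently, and the scores are averaged ignoring NaNs.
score = SingleColumnAggregate.compute(real, synthetic)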
def compute(cls, real_data, synthetic_data, metadata=None):
    """Compute this metric.

    This builds a Machine Learning Classifier that learns to tell the
    synthetic data apart from the real data, which later on is evaluated
    using Cross Validation.

    The output of the metric is one minus the average ROC AUC score obtained.

    Args:
        real_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the real dataset.
        synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the synthetic dataset.
        metadata (dict):
            Table metadata dict. If not passed, it is built based on the
            real_data fields and dtypes.

    Returns:
        float:
            One minus the ROC AUC Cross Validation Score obtained by the
            classifier.
    """
    metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
    transformer = HyperTransformer(default_data_type_transformers={
        'categorical': OneHotEncodingTransformer(error_on_unknown=False),
    })
    real_data = transformer.fit_transform(real_data).to_numpy()
    synthetic_data = transformer.transform(synthetic_data).to_numpy()

    X = np.concatenate([real_data, synthetic_data])
    y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    if np.isin(X, [np.inf, -np.inf]).any():
        X[np.isin(X, [np.inf, -np.inf])] = np.nan

    try:
        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True)
        for train_index, test_index in kf.split(X, y):
            y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index])
            roc_auc = roc_auc_score(y[test_index], y_pred)

            scores.append(max(0.5, roc_auc) * 2 - 1)

        return 1 - np.mean(scores)

    except ValueError as err:
        raise IncomputableMetricError(f'DetectionMetric: Unable to be fit with error {err}')
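# The classmethod above delegates the actual modeling to ``cls._fit_predict``.
# A minimal sketch of what a subclass might plug in, assuming a scikit-learn
# pipeline (LogisticRegression is an illustrative choice, not mandated by the
# base class):
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler


def _fit_predict(X_train, y_train, X_test):
    """Fit a classifier on the train split and score the test split."""
    model = Pipeline([
        ('imputer', SimpleImputer()),       # handles the NaNs introduced for +/-inf above
        ('scaler', RobustScaler()),
        ('classifier', LogisticRegression(solver='lbfgs')),
    ])
    model.fit(X_train, y_train)

    # roc_auc_score expects scores for the positive ("real") class.
    return model.predict_proba(X_test)[:, 1]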
def _compute_auroc(self, real_table, synthetic_table):
    """Train this detector to tell the two tables apart and return the mean AUROC."""
    transformer = HyperTransformer()
    real_table = transformer.fit_transform(real_table).values
    synthetic_table = transformer.transform(synthetic_table).values

    X = np.concatenate([real_table, synthetic_table])
    y = np.hstack([np.ones(len(real_table)), np.zeros(len(synthetic_table))])
    X[np.isnan(X)] = 0.0

    if len(X) < 20:
        # Note: this only warns; the cross-validation folds below still run.
        warnings.warn('Not enough data, skipping the detection tests.')

    scores = []
    kf = StratifiedKFold(n_splits=3, shuffle=True)
    for train_index, test_index in kf.split(X, y):
        self.fit(X[train_index], y[train_index])
        y_pred = self.predict_proba(X[test_index])
        auroc = roc_auc_score(y[test_index], y_pred)
        if auroc < 0.5:
            auroc = 1.0 - auroc

        scores.append(auroc)

    return np.mean(scores)
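# ``_compute_auroc`` above assumes the surrounding object provides ``fit`` and
# ``predict_proba``. A minimal sketch of such a host class, assuming a
# scikit-learn classifier (an illustrative choice, not the actual detector):
from sklearn.linear_model import LogisticRegression


class SketchDetector:
    """Hypothetical host exposing the interface that _compute_auroc relies on."""

    def fit(self, X, y):
        self._model = LogisticRegression(solver='lbfgs')
        self._model.fit(X, y)

    def predict_proba(self, X):
        # Return scores for the positive ("real") class, as roc_auc_score expects.
        return self._model.predict_proba(X)[:, 1]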
class CopulaGAN(CTGAN):
    """Combination of GaussianCopula transformation and GANs.

    This model extends the ``CTGAN`` model to add the flexibility of the
    GaussianCopula transformations provided by the ``GaussianCopulaTransformer``
    from ``RDT``.

    Overall, the fitting process consists of the following steps:

    1. Transform each non categorical variable from the input data using a
       ``GaussianCopulaTransformer``:

       i. If not specified, find out the distribution which each one of the
          variables from the input dataset has.
       ii. Transform each variable to a standard normal space by applying
           the CDF of the corresponding distribution and later on applying
           an inverse CDF from a standard normal distribution.

    2. Fit CTGAN with the transformed table.

    And the process of sampling is:

    1. Sample using CTGAN.
    2. Reverse the previous transformation by applying the CDF of a standard
       normal distribution and then inverting the CDF of the distribution that
       corresponds to each variable.

    The arguments of this model are the same as for CTGAN except for two
    additional arguments, ``field_distributions`` and ``default_distribution``,
    which give the ability to define specific transformations for individual
    fields as well as which distribution to use by default if no specific
    distribution has been selected.

    Distributions can be passed as a ``copulas`` univariate instance or as one
    of the following string values:

    * ``univariate``: Let ``copulas`` select the optimal univariate distribution.
      This may result in non-parametric models being used.
    * ``parametric``: Let ``copulas`` select the optimal univariate distribution,
      but restrict the selection to parametric distributions only.
    * ``bounded``: Let ``copulas`` select the optimal univariate distribution,
      but restrict the selection to bounded distributions only.
      This may result in non-parametric models being used.
    * ``semi_bounded``: Let ``copulas`` select the optimal univariate distribution,
      but restrict the selection to semi-bounded distributions only.
      This may result in non-parametric models being used.
    * ``parametric_bounded``: Let ``copulas`` select the optimal univariate
      distribution, but restrict the selection to parametric and bounded
      distributions only.
    * ``parametric_semi_bounded``: Let ``copulas`` select the optimal univariate
      distribution, but restrict the selection to parametric and semi-bounded
      distributions only.
    * ``gaussian``: Use a Gaussian distribution.
    * ``gamma``: Use a Gamma distribution.
    * ``beta``: Use a Beta distribution.
    * ``student_t``: Use a Student T distribution.
    * ``gaussian_kde``: Use a GaussianKDE distribution. This model is
      non-parametric, so using this will make ``get_parameters`` unusable.
    * ``truncated_gaussian``: Use a Truncated Gaussian distribution.

    Args:
        field_names (list[str]):
            List of names of the fields that need to be modeled
            and included in the generated output data. Any additional
            fields found in the data will be ignored and will not be
            included in the generated output.
            If ``None``, all the fields found in the data are used.
        field_types (dict[str, dict]):
            Dictionary specifying the data types and subtypes
            of the fields that will be modeled. Field types and subtypes
            combinations must be compatible with the SDV Metadata Schema.
        field_transformers (dict[str, str]):
            Dictionary specifying which transformers to use for each field.
            Available transformers are:

            * ``integer``: Uses a ``NumericalTransformer`` of dtype ``int``.
            * ``float``: Uses a ``NumericalTransformer`` of dtype ``float``.
            * ``categorical``: Uses a ``CategoricalTransformer`` without
              gaussian noise.
            * ``categorical_fuzzy``: Uses a ``CategoricalTransformer``
              adding gaussian noise.
            * ``one_hot_encoding``: Uses a ``OneHotEncodingTransformer``.
            * ``label_encoding``: Uses a ``LabelEncodingTransformer``.
            * ``boolean``: Uses a ``BooleanTransformer``.
            * ``datetime``: Uses a ``DatetimeTransformer``.

        anonymize_fields (dict[str, str]):
            Dict specifying which fields to anonymize and what faker
            category they belong to.
        primary_key (str):
            Name of the field which is the primary key of the table.
        constraints (list[Constraint, dict]):
            List of Constraint objects or dicts.
        table_metadata (dict or metadata.Table):
            Table metadata instance or dict representation.
            If given alongside any other metadata-related arguments, an
            exception will be raised.
            If not given at all, it will be built using the other
            arguments or learned from the data.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A
            Residual Layer will be created for each one of the values provided.
            Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator
            Layers. A Linear Layer will be created for each one of the values
            provided. Defaults to (256, 256).
        batch_size (int):
            Number of data samples to process in each step.
        verbose (bool):
            Whether to print fit progress on stdout. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
        cuda (bool or str):
            If ``True``, use CUDA. If a ``str``, use the indicated device.
            If ``False``, do not use cuda at all.
        field_distributions (dict):
            Optionally specify a dictionary that maps the name of each field
            to the distribution that must be used in it. Fields that are not
            specified in the input ``dict`` will be modeled using the default
            distribution. Defaults to ``None``.
        default_distribution (copulas.univariate.Univariate or str):
            Distribution to use on the fields for which no specific
            distribution has been given. Defaults to ``parametric``.
""" DEFAULT_DISTRIBUTION = 'parametric' def __init__(self, field_names=None, field_types=None, field_transformers=None, anonymize_fields=None, primary_key=None, constraints=None, table_metadata=None, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256), generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4, discriminator_decay=1e-6, batch_size=500, discriminator_steps=1, log_frequency=True, verbose=False, epochs=300, cuda=True, field_distributions=None, default_distribution=None): super().__init__(field_names=field_names, primary_key=primary_key, field_types=field_types, field_transformers=field_transformers, anonymize_fields=anonymize_fields, constraints=constraints, table_metadata=table_metadata, embedding_dim=embedding_dim, generator_dim=generator_dim, discriminator_dim=discriminator_dim, generator_lr=generator_lr, generator_decay=generator_decay, discriminator_lr=discriminator_lr, discriminator_decay=discriminator_decay, batch_size=batch_size, discriminator_steps=discriminator_steps, log_frequency=log_frequency, verbose=verbose, epochs=epochs, cuda=cuda) self._field_distributions = field_distributions or dict() self._default_distribution = default_distribution or self.DEFAULT_DISTRIBUTION def get_distributions(self): """Get the marginal distributions used by this CopulaGAN. Returns: dict: Dictionary containing the distributions used or detected for each column. """ return { field: transformer._univariate.to_dict()['type'] for field, transformer in self._ht.transformers.items() } def _fit(self, table_data): """Fit the model to the table. Args: table_data (pandas.DataFrame): Data to be learned. """ distributions = self._field_distributions default = self._default_distribution fields = self._metadata.get_fields() transformers = { field: GaussianCopulaTransformer( distribution=distributions.get(field, default)) for field in table_data.columns if fields.get(field, dict()).get('type') != 'categorical' } self._ht = HyperTransformer(transformers=transformers) table_data = self._ht.fit_transform(table_data) super()._fit(table_data) def _sample(self, num_rows, conditions=None): """Sample the indicated number of rows from the model. Args: num_rows (int): Amount of rows to sample. conditions (dict): If specified, this dictionary maps column names to the column value. Then, this method generates `num_rows` samples, all of which are conditioned on the given variables. Returns: pandas.DataFrame: Sampled data. """ sampled = super()._sample(num_rows, conditions) return self._ht.reverse_transform(sampled)
class ColumnsModel:
    """ColumnsModel class.

    The ``ColumnsModel`` class enables the usage of conditional sampling when
    a column is a ``constraint``.
    """

    _columns_model = None

    def __init__(self, constraint, constraint_columns):
        if isinstance(constraint_columns, list):
            self.constraint_columns = constraint_columns
        else:
            self.constraint_columns = [constraint_columns]

        self.constraint = constraint

    def fit(self, table_data):
        """Fit the ``ColumnsModel``.

        Fit a ``GaussianUnivariate`` model to the ``self.constraint_columns``
        columns in the ``table_data`` in order to sample those columns when
        missing.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        data_to_model = table_data[self.constraint_columns]
        self._hyper_transformer = HyperTransformer(default_data_type_transformers={
            'categorical': 'OneHotEncodingTransformer',
        })
        transformed_data = self._hyper_transformer.fit_transform(data_to_model)
        self._model = GaussianMultivariate(distribution=GaussianUnivariate)
        self._model.fit(transformed_data)

    def _reject_sample(self, num_rows, conditions):
        sampled = self._model.sample(num_rows=num_rows, conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        valid_rows = sampled[self.constraint.is_valid(sampled)]
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)

            if counter >= 100:
                if len(valid_rows) == 0:
                    raise ValueError('Could not get enough valid rows within 100 trials.')
                else:
                    multiplier = num_rows // num_valid
                    num_rows_missing = num_rows % num_valid
                    remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                    valid_rows = pd.concat([valid_rows] * multiplier + [remainder_rows],
                                           ignore_index=True)
                    break

            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_sampled = self._model.sample(num_rows=num_to_sample, conditions=conditions)
            new_sampled = self._hyper_transformer.reverse_transform(new_sampled)
            new_valid_rows = new_sampled[self.constraint.is_valid(new_sampled)]
            valid_rows = pd.concat([valid_rows, new_valid_rows], ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def sample(self, table_data):
        """Sample any missing columns.

        Sample any missing columns, ``self.constraint_columns``, that
        ``table_data`` does not contain.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Table data with additional ``constraint_columns``.
        """
        condition_columns = [c for c in self.constraint_columns if c in table_data.columns]
        grouped_conditions = table_data[condition_columns].groupby(condition_columns)
        all_sampled_rows = list()
        for group, df in grouped_conditions:
            if not isinstance(group, tuple):
                group = [group]

            transformed_condition = self._hyper_transformer.transform(df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows, ignore_index=True)
        return sampled_data
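# Example usage (a sketch; ``Positive`` stands in for any constraint object that
# exposes ``is_valid``, and the toy data is an illustrative assumption):
import pandas as pd

full_table = pd.DataFrame({
    'age': [23, 45, 31, 50],
    'salary': [40000.0, 52000.0, 61000.0, 48000.0],
})

constraint = Positive(columns='salary')  # hypothetical constraint instance
columns_model = ColumnsModel(constraint, ['age', 'salary'])
columns_model.fit(full_table)

# Fill in the missing 'salary' values conditioned on the given 'age' values,
# rejecting any draws that the constraint marks invalid.
completed = columns_model.sample(pd.DataFrame({'age': [30, 40]}))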
def compute(cls, real_data, synthetic_data, metadata=None, dtypes=None):
    """Compute this metric.

    This builds a Machine Learning Classifier that learns to tell the
    synthetic data apart from the real data, which later on is evaluated
    using Cross Validation.

    The output of the metric is one minus the average ROC AUC score obtained.

    Args:
        real_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the real dataset.
        synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
            The values from the synthetic dataset.
        metadata (dict):
            Table metadata dict. If not passed, it is built based on the
            real_data fields and dtypes.
        dtypes (list):
            Data type hints for the columns, passed down to the
            ``HyperTransformer``. Optional.

    Returns:
        float:
            One minus the ROC AUC Cross Validation Score obtained by the
            classifier.
    """
    metadata = cls._validate_inputs(real_data, synthetic_data, metadata)
    transformer = HyperTransformer(dtype_transformers={'O': 'one_hot_encoding'},
                                   dtypes=dtypes)
    real_data = transformer.fit_transform(real_data).values
    synthetic_data = transformer.transform(synthetic_data).values

    X = np.concatenate([real_data, synthetic_data])
    y = np.hstack([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    if np.isin(X, [np.inf, -np.inf]).any():
        X[np.isin(X, [np.inf, -np.inf])] = np.nan

    try:
        scores = []
        kf = StratifiedKFold(n_splits=3, shuffle=True)
        for train_index, test_index in kf.split(X, y):
            y_pred, clf = cls._fit_predict(X[train_index], y[train_index], X[test_index])
            roc_auc = roc_auc_score(y[test_index], y_pred)
            scores.append(max(0.5, roc_auc) * 2 - 1)

            # Optional debug plotting of the ROC curve; disabled by default.
            plot = False
            if plot:
                fpr, tpr, _ = roc_curve(y[test_index], y_pred)
                dummy_fpr = np.linspace(0, 1)
                dummy_tpr = np.linspace(0, 1)

                # Plot the ROC curve for the model against a random classifier.
                plt.plot(dummy_fpr, dummy_tpr, linestyle='--', label='Random Classifier')
                plt.plot(fpr, tpr, marker=',', label='ROC-curve')
                plt.fill_between(dummy_tpr, tpr)

                # Axis labels, legend and display.
                plt.title('ROC-Curve Churn')
                plt.xlabel('False-Positive Rate')
                plt.ylabel('True-Positive Rate')
                plt.legend()
                plt.show()

        return 1 - np.mean(scores)

    except ValueError as err:
        LOGGER.info('DetectionMetric: Skipping due to %s', err)
        return np.nan
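# The per-fold normalization above, ``max(0.5, roc_auc) * 2 - 1``, maps AUC
# values in [0.5, 1.0] onto [0.0, 1.0] before averaging. A quick worked check:
#
#   roc_auc = 0.5  ->  max(0.5, 0.5)  * 2 - 1 = 0.0   (classifier can't tell: best case)
#   roc_auc = 0.75 ->  max(0.5, 0.75) * 2 - 1 = 0.5
#   roc_auc = 1.0  ->  max(0.5, 1.0)  * 2 - 1 = 1.0   (perfectly separable: worst case)
#
# so the returned ``1 - np.mean(scores)`` is 1.0 when real and synthetic data
# are indistinguishable and 0.0 when the classifier separates them perfectly.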
class Constraint(metaclass=ConstraintMeta):
    """Constraint base class.

    This class is not intended to be used directly and should rather be
    subclassed to create different types of constraints.

    If ``handling_strategy`` is passed with the value ``transform``
    or ``reject_sampling``, the ``filter_valid`` or ``transform`` and
    ``reverse_transform`` methods will be replaced respectively by a simple
    identity function.

    Attributes:
        constraint_columns (tuple[str]):
            The names of the columns used by this constraint.
        rebuild_columns (tuple[str]):
            The names of the columns that this constraint will rebuild during
            ``reverse_transform``.

    Args:
        handling_strategy (str):
            How this Constraint should be handled, which can be ``transform``,
            ``reject_sampling`` or ``all``.
        fit_columns_model (bool):
            If False, reject sampling will be used to handle conditional
            sampling. Otherwise, a model will be trained and used to sample
            other columns based on the conditioned column.
    """

    constraint_columns = ()
    rebuild_columns = ()
    _hyper_transformer = None
    _columns_model = None

    def _identity(self, table_data):
        return table_data

    def __init__(self, handling_strategy, fit_columns_model=False):
        self.fit_columns_model = fit_columns_model
        if handling_strategy == 'transform':
            self.filter_valid = self._identity
        elif handling_strategy == 'reject_sampling':
            self.rebuild_columns = ()
            self.transform = self._identity
            self.reverse_transform = self._identity
        elif handling_strategy != 'all':
            raise ValueError('Unknown handling strategy: {}'.format(handling_strategy))

    def _fit(self, table_data):
        del table_data

    def fit(self, table_data):
        """Fit ``Constraint`` class to data.

        If ``fit_columns_model`` is True, then this method will fit
        a ``GaussianCopula`` model to the relevant columns in ``table_data``.
        Subclasses can overwrite this method, or overwrite the ``_fit`` method
        if they will not be needing the model to handle conditional sampling.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        self._fit(table_data)

        if self.fit_columns_model and len(self.constraint_columns) > 1:
            data_to_model = table_data[list(self.constraint_columns)]
            self._hyper_transformer = HyperTransformer(dtype_transformers={
                'O': 'one_hot_encoding',
            })
            transformed_data = self._hyper_transformer.fit_transform(data_to_model)
            self._columns_model = GaussianMultivariate(distribution=GaussianUnivariate)
            self._columns_model.fit(transformed_data)

    def _transform(self, table_data):
        return table_data

    def _reject_sample(self, num_rows, conditions):
        sampled = self._columns_model.sample(num_rows=num_rows, conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        valid_rows = sampled[self.is_valid(sampled)]
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)

            if counter >= 100:
                if len(valid_rows) == 0:
                    error = 'Could not get enough valid rows within 100 trials.'
                    raise ValueError(error)
                else:
                    multiplier = num_rows // num_valid
                    num_rows_missing = num_rows % num_valid
                    remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                    valid_rows = pd.concat([valid_rows] * multiplier + [remainder_rows],
                                           ignore_index=True)
                    break

            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_sampled = self._columns_model.sample(num_rows=num_to_sample,
                                                     conditions=conditions)
            new_sampled = self._hyper_transformer.reverse_transform(new_sampled)
            new_valid_rows = new_sampled[self.is_valid(new_sampled)]
            valid_rows = pd.concat([valid_rows, new_valid_rows], ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def _sample_constraint_columns(self, table_data):
        condition_columns = [c for c in self.constraint_columns if c in table_data.columns]
        grouped_conditions = table_data[condition_columns].groupby(condition_columns)
        all_sampled_rows = list()
        for group, df in grouped_conditions:
            if not isinstance(group, tuple):
                group = [group]

            transformed_condition = self._hyper_transformer.transform(df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows, ignore_index=True)
        return sampled_data

    def _validate_constraint_columns(self, table_data):
        """Validate the columns in ``table_data``.

        If ``fit_columns_model`` is False and any columns in
        ``constraint_columns`` are not present in ``table_data``, this method
        will raise a ``MissingConstraintColumnError``. Otherwise it will return
        the ``table_data`` unchanged.

        If ``fit_columns_model`` is True, then this method will sample any
        missing ``constraint_columns`` from its model conditioned on the
        ``constraint_columns`` that ``table_data`` does contain. If
        ``table_data`` doesn't contain any of the ``constraint_columns`` then a
        ``MissingConstraintColumnError`` will be raised.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        missing_columns = [
            col for col in self.constraint_columns if col not in table_data.columns
        ]
        if missing_columns:
            if not self._columns_model:
                warning_message = (
                    'When `fit_columns_model` is False and we are conditioning on a subset '
                    'of the constraint columns, conditional sampling uses reject sampling '
                    'which can be slow. Changing `fit_columns_model` to True can improve '
                    'the performance.'
                )
                warnings.warn(warning_message, UserWarning)

            all_columns_missing = len(missing_columns) == len(self.constraint_columns)
            if self._columns_model is None or all_columns_missing:
                raise MissingConstraintColumnError()

            else:
                sampled_data = self._sample_constraint_columns(table_data)
                other_columns = [
                    c for c in table_data.columns if c not in self.constraint_columns
                ]
                sampled_data[other_columns] = table_data[other_columns]
                return sampled_data

        return table_data

    def transform(self, table_data):
        """Perform necessary transformations needed by constraint.

        Subclasses can optionally overwrite this method. If the transformation
        requires certain columns to be present in ``table_data``, then the
        subclass should overwrite the ``_transform`` method instead. This
        method raises a ``MissingConstraintColumnError`` if the ``table_data``
        is missing any columns needed to do the transformation. If columns are
        present, this method will call the ``_transform`` method.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        table_data = self._validate_constraint_columns(table_data)
        return self._transform(table_data)

    def fit_transform(self, table_data):
        """Fit this Constraint to the data and then transform it.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(table_data)
        return self.transform(table_data)

    def reverse_transform(self, table_data):
        """Identity method for completion. To be optionally overwritten by subclasses.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Input data unmodified.
        """
        return table_data

    def is_valid(self, table_data):
        """Say whether the given table rows are valid.

        This is a dummy version of the method that returns a series of ``True``
        values to avoid dropping any rows. This should be overwritten by all
        the subclasses that have a way to decide which rows are valid and which
        are not.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.Series:
                Series of ``True`` values.
        """
        return pd.Series(True, index=table_data.index)

    def filter_valid(self, table_data):
        """Get only the rows that are valid.

        The filtering is done by calling the method ``is_valid``, which should
        be overwritten by subclasses, while this method should stay untouched.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Input data with only the valid rows.
        """
        valid = self.is_valid(table_data)
        invalid = sum(~valid)
        if invalid:
            LOGGER.debug('%s: %s invalid rows out of %s.',
                         self.__class__.__name__, invalid, len(valid))

        if isinstance(valid, pd.Series):
            return table_data[valid.values]

        return table_data[valid]

    @classmethod
    def from_dict(cls, constraint_dict):
        """Build a Constraint object from a dict.

        Args:
            constraint_dict (dict):
                Dict containing the keyword ``constraint`` alongside any
                additional arguments needed to create the instance.

        Returns:
            Constraint:
                New constraint instance.
        """
        constraint_dict = constraint_dict.copy()
        constraint_class = constraint_dict.pop('constraint')
        subclasses = get_subclasses(cls)
        if isinstance(constraint_class, str):
            if '.' in constraint_class:
                constraint_class = import_object(constraint_class)
            else:
                constraint_class = subclasses[constraint_class]

        return constraint_class(**constraint_dict)

    def to_dict(self):
        """Return a dict representation of this Constraint.

        The dictionary will contain the Qualified Name of the constraint
        class in the key ``constraint``, as well as any other arguments
        that were passed to the constructor when the instance was created.

        Returns:
            dict:
                Dict representation of this Constraint.
        """
        constraint_dict = {
            'constraint': _get_qualified_name(self.__class__),
        }

        for key, obj in copy.deepcopy(self.__kwargs__).items():
            if callable(obj) and _module_contains_callable_name(obj):
                constraint_dict[key] = _get_qualified_name(obj)
            else:
                constraint_dict[key] = obj

        return constraint_dict
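# Serialization round trip for the methods above (a sketch; ``GreaterThan`` is
# a stand-in subclass name used for illustration, assumed to be registered via
# ``get_subclasses(Constraint)``):
constraint = Constraint.from_dict({
    'constraint': 'GreaterThan',
    'low': 'age',
    'high': 'retirement_age',
    'handling_strategy': 'transform',
})

# ``to_dict`` recovers the qualified class name plus the constructor kwargs
# that ``ConstraintMeta`` captured in ``__kwargs__``.
assert Constraint.from_dict(constraint.to_dict()).to_dict() == constraint.to_dict()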