def _fit_hyper_transformer(self, data, extra_columns):
    """Create and fit a new ``rdt.HyperTransformer`` instance.

    First get the ``dtypes`` and then use them to build a transformer
    dictionary to be used by the ``HyperTransformer``. The fitted
    instance is stored in ``self._hyper_transformer``; nothing is
    returned.

    Args:
        data (pandas.DataFrame):
            Data to transform.
        extra_columns (set):
            Names of columns that are not in the metadata but that
            should also be transformed. In most cases, these are the
            fields that were added by previous transformations which
            the data underwent.
    """
    meta_dtypes = self.get_dtypes(ids=False)
    dtypes = {}
    for column in data.columns:
        if column in meta_dtypes:
            dtypes[column] = meta_dtypes[column]
        elif column in extra_columns:
            # Columns added by earlier transformations have no metadata
            # entry, so fall back to the pandas dtype kind.
            dtypes[column] = data[column].dtype.kind

    transformers_dict = self._get_transformers(dtypes)
    self._hyper_transformer = rdt.HyperTransformer(transformers=transformers_dict)
    # Only fit on the columns we selected; id columns and anything not
    # covered by ``dtypes`` are left untouched.
    self._hyper_transformer.fit(data[list(dtypes.keys())])
def _compute_score(cls, real_data, synthetic_data, entity_columns, target):
    """Compare synthetic-trained vs real-trained prediction quality.

    A ``HyperTransformer`` is fitted on the real data without the entity
    and target columns. Feature/target matrices are built for both
    datasets, the real data is split into train and test partitions, and
    the score is the ratio between the accuracy obtained when training on
    synthetic data and the accuracy obtained when training on real data,
    both evaluated on the same held-out real rows.
    """
    transformer = rdt.HyperTransformer(
        default_data_type_transformers={
            'categorical': rdt.transformers.OneHotEncodingTransformer(error_on_unknown=False),
            'datetime': rdt.transformers.DatetimeTransformer(strip_constant=True),
        })
    transformer.fit(real_data.drop(entity_columns + [target], axis=1))

    real_x, real_y = cls._build_xy(transformer, real_data, entity_columns, target)
    synt_x, synt_y = cls._build_xy(transformer, synthetic_data, entity_columns, target)

    train_index, test_index = train_test_split(real_x.index, shuffle=True)
    real_x_train = real_x.loc[train_index]
    real_x_test = real_x.loc[test_index]
    real_y_train = real_y.loc[train_index]
    real_y_test = real_y.loc[test_index]

    real_acc = cls._scorer(real_x_train, real_x_test, real_y_train, real_y_test)
    synt_acc = cls._scorer(synt_x, real_x_test, synt_y, real_y_test)

    return synt_acc / real_acc
def _fit_predict(cls, synthetic_data, synthetic_target, real_data, real_target):
    """Fit a model in the synthetic data and make predictions for the real data."""
    del real_target  # delete argument which subclasses use but this method does not.

    labels = np.unique(synthetic_target)
    if len(labels) == 1:
        # Degenerate case: a single class in the training target means
        # every prediction must be that class.
        return np.full(len(real_data), labels[0])

    transformer = rdt.HyperTransformer(
        default_data_type_transformers={
            'categorical': rdt.transformers.OneHotEncodingTransformer(error_on_unknown=False),
        })
    real_data = transformer.fit_transform(real_data)
    synthetic_data = transformer.transform(synthetic_data)

    # Turn infinities into nulls so the imputer step can fill them in.
    real_data[np.isin(real_data, [np.inf, -np.inf])] = None
    synthetic_data[np.isin(synthetic_data, [np.inf, -np.inf])] = None

    model_kwargs = cls.MODEL_KWARGS.copy() if cls.MODEL_KWARGS else {}
    pipeline = Pipeline([
        ('imputer', SimpleImputer()),
        ('scaler', RobustScaler()),
        ('model', cls.MODEL(**model_kwargs)),
    ])
    pipeline.fit(synthetic_data, synthetic_target)

    return pipeline.predict(real_data)
def _fit_sample(self, real_data, table_metadata):
    """Fit the model on label-encoded real data and sample synthetic rows.

    Raises:
        UnsupportedDataset:
            If the encoded data still contains non-numeric dtypes or any
            null values.
    """
    columns, categoricals = self._get_columns(real_data, table_metadata)
    real_data = real_data[columns]

    hyper_transformer = rdt.HyperTransformer(dtype_transformers={
        'O': 'label_encoding',
    })
    # Fit only on the categorical columns; transform encodes them while
    # leaving the rest of the data as-is.
    hyper_transformer.fit(real_data.iloc[:, categoricals])
    model_data = hyper_transformer.transform(real_data)

    numeric_columns = set(model_data.select_dtypes(('number', 'bool')).columns)
    non_numeric = set(model_data.columns) - numeric_columns
    if non_numeric:
        bad_dtypes = model_data[non_numeric].dtypes.unique().tolist()
        raise UnsupportedDataset(f'Unsupported dtypes {bad_dtypes}')

    null_mask = model_data.isnull().any()
    if null_mask.any():
        null_columns = null_mask[null_mask].index.tolist()
        raise UnsupportedDataset(f'Null values found in columns {null_columns}')

    LOGGER.info("Fitting %s", self.__class__.__name__)
    self.fit(model_data.to_numpy(), categoricals, ())

    LOGGER.info("Sampling %s", self.__class__.__name__)
    sampled = pd.DataFrame(self.sample(len(model_data)), columns=columns)

    synthetic_data = real_data.copy()
    synthetic_data.update(hyper_transformer.reverse_transform(sampled))
    return synthetic_data
def _fit_hyper_transformer(self, data):
    """Create and fit a new ``rdt.HyperTransformer`` instance.

    First get the ``dtypes`` and then use them to build a transformer
    dictionary to be used by the ``HyperTransformer``. The fitted
    instance is stored in ``self._hyper_transformer``; nothing is
    returned.

    Args:
        data (pandas.DataFrame):
            Data to fit the transformer on.
    """
    dtypes = self.get_dtypes(ids=False)
    transformers_dict = self._get_transformers(dtypes)
    self._hyper_transformer = rdt.HyperTransformer(
        transformers=transformers_dict)
    # Fit only the columns covered by the metadata dtypes; id columns
    # are excluded by ``get_dtypes(ids=False)``.
    self._hyper_transformer.fit(data[list(dtypes.keys())])
def compute(cls, real_data, synthetic_data, metadata=None, entity_columns=None):
    """Compute this metric.

    Args:
        real_data (pandas.DataFrame):
            The values from the real dataset, passed as a pandas.DataFrame.
        synthetic_data (pandas.DataFrame):
            The values from the synthetic dataset, passed as a pandas.DataFrame.
        metadata (dict):
            TimeSeries metadata dict. If not passed, it is built based on the
            real_data fields and dtypes.
        entity_columns (list[str]):
            Names of the columns which identify different time series sequences.

    Returns:
        Union[float, tuple[float]]:
            Metric output.
    """
    _, entity_columns = cls._validate_inputs(real_data, synthetic_data, metadata, entity_columns)

    transformer = rdt.HyperTransformer(
        default_data_type_transformers={
            'categorical': rdt.transformers.OneHotEncodingTransformer(error_on_unknown=False),
            'datetime': rdt.transformers.DatetimeTransformer(strip_constant=True),
        })
    transformer.fit(real_data.drop(entity_columns, axis=1))

    real_x = cls._build_x(real_data, transformer, entity_columns)
    synt_x = cls._build_x(synthetic_data, transformer, entity_columns)

    # Label real rows 0 and synthetic rows 1; the score measures how well
    # a classifier can tell them apart.
    X = pd.concat([real_x, synt_x])
    y = pd.Series(np.array([0] * len(real_x) + [1] * len(synt_x)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y)

    # Invert the classification score so that 1 means indistinguishable
    # (best) and 0 means perfectly distinguishable (worst).
    return 1 - cls._compute_score(X_train, X_test, y_train, y_test)
def _transform_fit_sample(self, real_data, metadata):
    """Transform the real data, fit-and-sample, then reverse the transform.

    The id fields are excluded from the transformation and copied back
    verbatim from the real data into the synthetic output.
    """
    fields_metadata = metadata['fields']
    id_fields = [
        field for field in fields_metadata
        if fields_metadata.get(field).get('type') == 'id'
    ]
    columns_to_transform = [
        field for field in fields_metadata
        if fields_metadata.get(field).get('type') != 'id'
    ]

    hyper_transformer = rdt.HyperTransformer()
    hyper_transformer.fit(real_data[columns_to_transform])
    transformed_data = hyper_transformer.transform(real_data)

    synthetic_data = self._fit_sample(transformed_data, metadata)

    reversed_data = hyper_transformer.reverse_transform(synthetic_data)
    reversed_data[id_fields] = real_data[id_fields]
    return reversed_data
def _fit_sample(self, real_data, table_metadata):
    """Label-encode the data, fit the model, and sample synthetic rows."""
    columns, categoricals = self._get_columns(real_data, table_metadata)

    hyper_transformer = rdt.HyperTransformer(dtype_transformers={
        'O': 'label_encoding',
    })
    model_data = hyper_transformer.fit_transform(real_data[columns])

    LOGGER.info("Fitting %s", self.__class__.__name__)
    self.fit(model_data.to_numpy(), categoricals, ())

    LOGGER.info("Sampling %s", self.__class__.__name__)
    sampled = pd.DataFrame(self.sample(len(model_data)), columns=columns)

    # Start from a copy of the real data so untouched columns survive,
    # then overwrite the modeled columns with the decoded samples.
    synthetic_data = real_data.copy()
    synthetic_data.update(hyper_transformer.reverse_transform(sampled))
    return synthetic_data
def _fit_hyper_transformer(self, data):
    """Create and fit a new ``rdt.HyperTransformer`` instance.

    Infer a dtype kind for every column that is not an ``id`` field and
    use them to build a transformer dictionary to be used by the
    ``HyperTransformer``. The fitted instance is stored in
    ``self._hyper_transformer``; nothing is returned.

    Args:
        data (pandas.DataFrame):
            Data to fit the transformer on.
    """
    dtypes = {}
    fields = self._fields_metadata
    for column in data.columns:
        # Skip id fields; every other column (including ones absent from
        # the metadata) gets its pandas dtype kind.
        if column not in fields or fields[column]['type'] != 'id':
            dtypes[column] = data[column].dtype.kind

    transformers_dict = self._get_transformers(dtypes)
    self._hyper_transformer = rdt.HyperTransformer(
        transformers=transformers_dict)
    self._hyper_transformer.fit(data[list(dtypes.keys())])
def _compute_score(cls, real_data, synthetic_data, entity_columns, target):
    """Score synthetic data by comparing downstream prediction accuracy.

    Fits a ``HyperTransformer`` on the real data without the entity and
    target columns, builds feature/target matrices for both datasets,
    and returns the ratio of the accuracy achieved when training on
    synthetic data to the accuracy achieved when training on real data,
    evaluated on the same held-out real rows.
    """
    transformer = rdt.HyperTransformer(
        dtype_transformers={
            'O': 'one_hot_encoding',
            'M': rdt.transformers.DatetimeTransformer(strip_constant=True),
        })
    transformer.fit(real_data.drop(entity_columns + [target], axis=1))

    real_x, real_y = cls._build_xy(transformer, real_data, entity_columns, target)
    synt_x, synt_y = cls._build_xy(transformer, synthetic_data, entity_columns, target)

    train_index, test_index = train_test_split(real_x.index, shuffle=True)
    real_x_train = real_x.loc[train_index]
    real_x_test = real_x.loc[test_index]
    real_y_train = real_y.loc[train_index]
    real_y_test = real_y.loc[test_index]

    real_acc = cls._scorer(real_x_train, real_x_test, real_y_train, real_y_test)
    synt_acc = cls._scorer(synt_x, real_x_test, synt_y, real_y_test)

    return synt_acc / real_acc