def _calculate_privacy_tests(tgt_data: DataFrame, syn_data: DataFrame):
    """
    Compute privacy tests for a given target and synthetic data set.

    Both frames must have the same columns and column types. The sequence
    data is flattened, prepared for privacy metrics, and DCR/NNDR checks
    are computed.

    :param tgt_data: target (original) data set
    :param syn_data: synthetic data set with identical columns/types
    :return: the checks frame produced by `_calculate_dcr_nndr`
    """
    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"
    tgt_dict = _generate_column_type_dictionary(tgt_data)
    syn_dict = _generate_column_type_dictionary(syn_data)
    assert tgt_dict == syn_dict, "Target and Synthetic have different types"
    flat_table_target = _flatten_table(tgt_data.reset_index(), tgt_dict)
    flat_table_syn = _flatten_table(syn_data.reset_index(), tgt_dict)
    # columns now include 1st and 2nd record
    column_dict = _generate_column_type_dictionary(flat_table_target)
    smoothing_factor = 1e-8
    tgt_data_p, syn_data_p = _prepare_data_for_privacy_metrics(
        flat_table_target, flat_table_syn, column_dict, smoothing_factor
    )
    # second tuple element (detailed privacy tests) is intentionally discarded;
    # only the summary checks are returned
    checks, _ = _calculate_dcr_nndr(
        tgt_data_p, syn_data_p, column_dict, smoothing_factor
    )
    return checks
def _calculate_coherence(
    tgt_data: DataFrame,
    syn_data: DataFrame,
    type: str,  # users_per_category | categories_per_user
    number_of_bins: int = 10,
):
    """
    Compare per-column coherence between target and synthetic data.

    For every column, build either the share of distinct users per category
    ("users_per_category") or the distribution of distinct categories per
    user ("categories_per_user"), then report the max, L1 and L2 distances
    between the target and synthetic share vectors.
    """
    assert type in ["users_per_category", "categories_per_user"], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)
    binned_tgt, binned_syn = _bin_looped(
        tgt_data, syn_data, column_dict, number_of_bins
    )
    binned_tgt = binned_tgt.astype(str).reset_index()
    binned_syn = binned_syn.astype(str).reset_index()

    def _share_table(binned: DataFrame, column: str) -> DataFrame:
        # share vector for one column of one (binned) data set
        if type == "users_per_category":
            table = binned.groupby(by=column, as_index=False).agg(
                {"id": pd.Series.nunique}
            )
            table["id"] = table["id"] / binned["id"].nunique()
        else:  # categories_per_user — guaranteed by the assert above
            per_user = binned.groupby(by="id").agg({column: pd.Series.nunique})
            table = (
                per_user[column].value_counts(normalize=True).to_frame().reset_index()
            )
        return table

    rows = []
    for column in list(column_dict):
        tgt_shares = _share_table(binned_tgt, column)
        syn_shares = _share_table(binned_syn, column)
        tgt_shares.columns = ["value", "tgt"]
        syn_shares.columns = ["value", "syn"]
        merged = pd.merge(tgt_shares, syn_shares, on="value", how="left")
        # categories absent from the synthetic data count as zero share
        merged["syn"] = merged["syn"].fillna(0)
        gap = np.abs(merged["tgt"] - merged["syn"])
        rows.append(
            pd.DataFrame(
                {
                    "label": [column],
                    "type": type,
                    "max": np.max(gap),
                    "l1d": np.sum(gap),
                    "l2d": np.sqrt(np.sum(gap ** 2)),
                }
            )
        )
    return pd.concat(rows)
def _calculate_coherence(tgt_data: DataFrame,
                         syn_data: DataFrame,
                         type: str,  # users_per_category | categories_per_user
                         number_of_bins: int = 10):
    """
    Measure coherence differences between target and synthetic data.

    Depending on `type`, computes per column either the share of unique
    users falling into each category, or the distribution of unique
    categories seen per user, and returns TVD/L1/L2 distances between
    the two data sets.
    """
    assert type in ['users_per_category', 'categories_per_user'], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(syn_data.columns), \
        "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)
    tgt_binned, syn_binned = _bin_looped(tgt_data, syn_data, column_dict, number_of_bins)
    tgt_binned = tgt_binned.astype(str).reset_index()
    syn_binned = syn_binned.astype(str).reset_index()

    outputs = []
    for col in column_dict.keys():
        # compute the share vector for both sides with identical logic
        per_side = {}
        for side_label, frame in (('tgt', tgt_binned), ('syn', syn_binned)):
            if type == 'users_per_category':
                share = frame.groupby(by=col, as_index=False).agg({'id': pd.Series.nunique})
                share['id'] = share['id'] / frame['id'].nunique()
            elif type == 'categories_per_user':
                share = frame.groupby(by='id').agg({col: pd.Series.nunique})
                share = share[col].value_counts(normalize=True).to_frame().reset_index()
            share.columns = ['value', side_label]
            per_side[side_label] = share
        joined = pd.merge(per_side['tgt'], per_side['syn'], on='value', how='left')
        # values unseen in the synthetic data contribute zero share
        joined['syn'] = joined['syn'].fillna(0)
        gap = np.abs(joined['tgt'] - joined['syn'])
        outputs.append(pd.DataFrame({'label': [col],
                                     'type': type,
                                     'tvd': np.max(gap),
                                     'l1d': np.sum(gap),
                                     'l2d': np.sqrt(np.sum(gap ** 2))}))
    return pd.concat(outputs)
def generate(self, number_of_subjects):
    """Generate a naive synthetic data set by independently permuting each
    column of the target data and resampling subjects with replacement.

    :param number_of_subjects: number of ids to draw (with replacement)
        from the permuted target data
    :return: DataFrame indexed by ('id', 'sequence_pos') with the same
        columns and column types as the target data
    """
    # NOTE(review): `super().generate(self)` passes `self` twice (once via
    # the bound super proxy, once explicitly) — looks like it should be
    # `super().generate(...)` with the parent's expected arguments;
    # confirm against the base class definition.
    super().generate(self)
    df_copy = self.target_data_.copy(deep=True)
    # shuffle each column independently, destroying cross-column correlations
    for x in df_copy.columns:
        df_copy[[x]] = np.random.permutation(df_copy[[x]])
    # draw subjects (index level 0) with replacement
    unique_ids = df_copy.index.get_level_values(0).unique()
    sampled_ids = np.random.choice(unique_ids, size=number_of_subjects, replace=True)
    # map the sampled original ids onto a fresh contiguous id range
    grid = pd.DataFrame({'id': sampled_ids}).sort_values('id')
    grid['id_new_'] = range(0, number_of_subjects)
    df = pd.merge(df_copy.reset_index(), grid, on='id')
    df['id'] = df['id_new_']
    df = df.drop(['id_new_'], axis=1).sort_values('id')
    df = df.set_index(['id', 'sequence_pos'])
    # merge causes dataframes to lose their types — restore them from the target
    reference = _generate_column_type_dictionary(self.target_data_)
    df = _assign_column_type(df, reference)
    return df
def _calculate_statistical_distances(
    tgt_data: DataFrame,
    syn_data: DataFrame,
    type: str,  # 1dim | 2dim | 3dim | 4dim
    number_of_bins: int = 10,
    max_combinations: int = 100,
):
    """
    Compute distributional distances (max/L1/L2/Hellinger) between target
    and synthetic data over 1- to 4-dimensional column combinations.

    One event per id is sampled (reproducibly) from each data set, values
    are binned, and up to ``max_combinations`` column combinations are
    compared on their joint bin shares.

    :param tgt_data: target data indexed by ('id', 'sequence_pos')
    :param syn_data: synthetic data with identical columns
    :param type: dimensionality of the compared combinations
    :param number_of_bins: number of bins used by `_bin_looped`
    :param max_combinations: cap on the number of combinations evaluated
    :return: one row per combination with the distance metrics
    """
    assert type in ["1dim", "2dim", "3dim", "4dim"], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(
        syn_data.columns
    ), "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    def _sample_one_event(data: DataFrame) -> DataFrame:
        """
        Randomly sample one record for each id.
        """
        # fixed-seed LOCAL generator so the same rows are considered across
        # synthesizers without touching global RNG state.
        # BUG FIX: the previous `random.seed(a=123)` seeded the stdlib RNG,
        # but the draws below come from numpy — sampling was never actually
        # reproducible as the comment claimed.
        rng = np.random.default_rng(123)
        # determine sequence length for each id
        seq_lens = data.reset_index().groupby("id").size()
        # randomly draw a sequence_pos for each id
        draws = pd.DataFrame(
            {
                "sequence_pos": np.floor(
                    rng.random(len(seq_lens)) * seq_lens
                ).astype(int)
            }
        ).reset_index()
        # inner join with provided dataframe to filter to drawn records
        out = (
            pd.merge(draws, data, on=["id", "sequence_pos"])
            .drop(columns="sequence_pos")
            .set_index("id")
        )
        return out

    tgt_sample = _sample_one_event(tgt_data)
    syn_sample = _sample_one_event(syn_data)
    tgt_binned, syn_binned = _bin_looped(
        tgt_sample, syn_sample, column_dict, number_of_bins
    )
    tgt_binned = tgt_binned.astype(str)
    syn_binned = syn_binned.astype(str)

    def calc_shares(df_binned, label):
        # relative frequency of each observed bin combination
        shares = df_binned.value_counts().reset_index()
        shares.columns = ["bins", label]
        shares[label] = shares[label] / shares[label].sum()
        return shares

    # build the cross-product of column combinations; unused dimensions are
    # filled with the constant pseudo-column 'all'
    if type == "1dim":
        cross = pd.DataFrame({"col_1": tgt_data.columns.to_list()})
        cross["col_2"] = "all"
        cross["col_3"] = "all"
        cross["col_4"] = "all"
    elif type == "2dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(cols_1, cols_2, on="key").drop("key", axis=1)
        cross["col_3"] = "all"
        cross["col_4"] = "all"
    elif type == "3dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cols_3 = pd.DataFrame({"col_3": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(pd.merge(cols_1, cols_2, on="key"), cols_3, on="key").drop(
            "key", axis=1
        )
        cross["col_4"] = "all"
    elif type == "4dim":
        cols_1 = pd.DataFrame({"col_1": tgt_data.columns.to_list(), "key": "xyz"})
        cols_2 = pd.DataFrame({"col_2": tgt_data.columns.to_list(), "key": "xyz"})
        cols_3 = pd.DataFrame({"col_3": tgt_data.columns.to_list(), "key": "xyz"})
        cols_4 = pd.DataFrame({"col_4": tgt_data.columns.to_list(), "key": "xyz"})
        cross = pd.merge(
            pd.merge(pd.merge(cols_1, cols_2, on="key"), cols_3, on="key"),
            cols_4,
            on="key",
        ).drop("key", axis=1)
    # fixed random_state ensures the same variable combinations are considered
    # across synthesizers.
    # BUG FIX: DataFrame.sample draws from numpy's RNG, so the previous
    # `random.seed(a=123)` call had no effect on this selection.
    cross = cross.sample(min(cross.shape[0], max_combinations), random_state=123)
    tgt_binned["all"] = "all"
    syn_binned["all"] = "all"
    result = list()
    # itertuples avoids re-materializing each column as a list on every
    # iteration (the previous `list(cross.col_1)[i]` pattern was quadratic)
    for combo in cross.itertuples(index=False):
        col_label = combo.col_1 + "|" + combo.col_2 + "|" + combo.col_3 + "|" + combo.col_4
        tgt_values = (
            tgt_binned[combo.col_1]
            + "|"
            + tgt_binned[combo.col_2]
            + "|"
            + tgt_binned[combo.col_3]
            + "|"
            + tgt_binned[combo.col_4]
        )
        syn_values = (
            syn_binned[combo.col_1]
            + "|"
            + syn_binned[combo.col_2]
            + "|"
            + syn_binned[combo.col_3]
            + "|"
            + syn_binned[combo.col_4]
        )
        shares = pd.merge(
            calc_shares(tgt_values, "tgt"), calc_shares(syn_values, "syn"), how="left"
        )
        # bins unseen in the synthetic data contribute zero share
        shares["syn"] = shares["syn"].fillna(0)
        diff = np.abs(shares["tgt"] - shares["syn"])
        hell = np.sqrt(
            np.sum((np.sqrt(shares["tgt"]) - np.sqrt(shares["syn"])) ** 2)
        ) / np.sqrt(2)
        out = pd.DataFrame(
            {
                "label": [col_label],
                "type": type,
                "max": np.max(diff),
                "l1d": np.sum(diff),
                "l2d": np.sqrt(np.sum(diff ** 2)),
                "hellinger": hell,
            }
        )
        result.append(out)
    return pd.concat(result)
def _calculate_statistical_distances(tgt_data: DataFrame,
                                     syn_data: DataFrame,
                                     type: str,  # 1dim | 2dim | 3dim | 4dim
                                     number_of_bins: int = 10,
                                     max_combinations: int = 100):
    """
    Calculate distributional distances (TVD/L1/L2/Hellinger) between target
    and synthetic data over univariate up to 4-way column combinations.

    One event per id is sampled reproducibly from each data set, values are
    binned, and at most `max_combinations` combinations are compared.

    :param tgt_data: target data indexed by ('id', 'sequence_pos')
    :param syn_data: synthetic data with identical columns
    :param type: '1dim' | '2dim' | '3dim' | '4dim'
    :param number_of_bins: number of bins used by `_bin_looped`
    :param max_combinations: cap on evaluated column combinations
    :return: DataFrame with one row of metrics per combination
    """
    assert type in ['1dim', '2dim', '3dim', '4dim'], "type not recognized"
    # check if data is in expected format
    assert sorted(tgt_data.columns) == sorted(syn_data.columns), \
        "Target and Synthetic have different columns"
    column_dict = _generate_column_type_dictionary(tgt_data)

    def _sample_one_event(data: DataFrame) -> DataFrame:
        """ Randomly sample one record for each id. """
        # fixed-seed local generator so the same rows are drawn across
        # synthesizers; BUG FIX: the previous `random.seed(a=123)` seeded
        # the stdlib RNG while the draws below use numpy, so sampling was
        # never reproducible as intended
        rng = np.random.default_rng(123)
        # determine sequence length for each id
        seq_lens = data.reset_index().groupby('id').size()
        # randomly draw a sequence_pos for each id
        draws = pd.DataFrame({'sequence_pos': np.floor(rng.random(len(seq_lens)) * seq_lens).astype(
            int)}).reset_index()
        # inner join with provided dataframe to filter to drawn records
        out = pd.merge(draws, data, on=['id', 'sequence_pos']).drop(columns='sequence_pos').set_index('id')
        return out

    tgt_sample = _sample_one_event(tgt_data)
    syn_sample = _sample_one_event(syn_data)

    tgt_binned, syn_binned = _bin_looped(tgt_sample, syn_sample, column_dict, number_of_bins)
    tgt_binned = tgt_binned.astype(str)
    syn_binned = syn_binned.astype(str)

    def calc_shares(df_binned, label):
        # relative frequency of each observed bin combination
        shares = df_binned.value_counts().reset_index()
        shares.columns = ['bins', label]
        shares[label] = shares[label] / shares[label].sum()
        return shares

    def _col_frame(name):
        # helper frame for the cross-join of column combinations
        return pd.DataFrame({name: tgt_data.columns.to_list(), 'key': 'xyz'})

    # build the combinations for the requested dimensionality; unused slots
    # are filled with the constant pseudo-column 'all'
    dims = int(type[0])  # '1dim' -> 1, ..., '4dim' -> 4
    cross = _col_frame('col_1')
    for d in range(2, dims + 1):
        cross = pd.merge(cross, _col_frame('col_{}'.format(d)), on='key')
    cross = cross.drop('key', axis=1)
    for d in range(dims + 1, 5):
        cross['col_{}'.format(d)] = 'all'

    # fixed random_state ensures the same variable combinations are
    # considered across synthesizers; BUG FIX: DataFrame.sample draws from
    # numpy, so the previous `random.seed(a=123)` had no effect here
    cross = cross.sample(min(cross.shape[0], max_combinations), random_state=123)

    tgt_binned['all'] = 'all'
    syn_binned['all'] = 'all'

    result = list()
    # itertuples avoids the quadratic `list(cross.col_i)[i]` access pattern
    for combo in cross.itertuples(index=False):
        cols = [combo.col_1, combo.col_2, combo.col_3, combo.col_4]
        col_label = cols[0] + '|' + cols[1] + '|' + cols[2] + '|' + cols[3]
        tgt_values = tgt_binned[cols[0]] + '|' + \
            tgt_binned[cols[1]] + '|' + \
            tgt_binned[cols[2]] + '|' + \
            tgt_binned[cols[3]]
        syn_values = syn_binned[cols[0]] + '|' + \
            syn_binned[cols[1]] + '|' + \
            syn_binned[cols[2]] + '|' + \
            syn_binned[cols[3]]
        shares = pd.merge(calc_shares(tgt_values, 'tgt'), calc_shares(syn_values, 'syn'), how='left')
        # bins unseen in the synthetic data contribute zero share
        shares['syn'] = shares['syn'].fillna(0)
        diff = np.abs(shares['tgt'] - shares['syn'])
        hell = np.sqrt(np.sum((np.sqrt(shares['tgt']) - np.sqrt(shares['syn'])) ** 2)) / np.sqrt(2)
        out = pd.DataFrame({'label': [col_label],
                            'type': type,
                            'tvd': np.max(diff),
                            'l1d': np.sum(diff),
                            'l2d': np.sqrt(np.sum(diff ** 2)),
                            'hellinger': hell})
        result.append(out)
    return pd.concat(result)
def transform(self, data):
    """Flatten a sequence-indexed DataFrame into one wide row per id.

    Categorical columns are one-hot encoded and unstacked into columns named
    '<pos>_<dummy>' (the first '_'-segment is parsed back as the sequence
    position); for each original category column and position, an `idx_*`
    target column with the argmax category index is appended at the end
    (needed for the loss). Numeric columns are unstacked and standardized.

    Side effects: sets self._category_column_mapping, self._transformer_mean,
    self._transformer_std, self.original_column_mapping, self.column_mapping,
    self.idx_mapping, self._last_index and self._category_idx_real_data.

    :param data: DataFrame with a two-level index — presumably
        ('id', sequence position); TODO confirm against callers
    :return: wide, standardized/one-hot DataFrame with one row per id
    """
    data_copy = data.copy(deep=True)
    dfs_to_merge = []
    # --- categorical part ---
    df_category = data_copy.select_dtypes(include='category')
    if not df_category.empty:
        df_category_dummies = pd.get_dummies(df_category)
        # wide layout: one row per id, one column per (position, dummy) pair
        df_category_wide = df_category_dummies.unstack().fillna(0)
        df_category_wide.columns = df_category_wide.columns.map(lambda x: '{}_{}'.format(x[1], x[0]))
        df_category_wide = df_category_wide.sort_index(axis=1)
        possible_sequence_lengths = list(set([int(x.split("_")[0]) for x in df_category_wide.columns]))
        category_columns = df_category.columns
        target_dataframe_dict = {}  # NOTE(review): never used below
        category_column_mapping = {}
        # maps category column name -> frame of argmax targets (a dict despite the name)
        target_dataframe_list = {}
        first_pos = 0
        column_ordering = []
        column_idx_original = {}
        for category_column in category_columns:
            target_categories = []
            column_mapping = {}
            # all wide columns belonging to this category column.
            # NOTE(review): substring match — could over-match if one column
            # name contains another; confirm naming guarantees
            subset_category_columns = [x for x in df_category_wide.columns if category_column in x]
            for seq_length in possible_sequence_lengths:
                columns = [x for x in subset_category_columns if seq_length == int(x.split("_")[0])]
                column_ordering.extend(columns)
                second_pos = first_pos + len(columns)
                df_sliced = df_category_wide[columns]  # NOTE(review): unused
                mapping = {
                    col: idx for idx, col in enumerate(columns)
                }
                column_mapping[str(seq_length)] = columns
                # class index of the active dummy per id at this position
                df_map = pd.DataFrame(df_category_wide[columns].idxmax(axis="columns").map(mapping),
                                      columns=[f"idx_{first_pos}_{second_pos}"])
                first_pos = second_pos
                target_categories.append(df_map)
            category_column_mapping[category_column] = column_mapping
            category_target = pd.concat(target_categories, axis=1)
            target_dataframe_list[category_column] = category_target
            column_idx_original[category_column] = [[pos, name] for pos, name in enumerate(category_target.columns)]
        self._category_column_mapping = category_column_mapping  # needed for loss
        dfs_to_merge.append(df_category_wide.apply(lambda x: pd.Categorical(x))[column_ordering])
    # --- numeric part ---
    df_numerics = data_copy.select_dtypes(include='number')
    if not df_numerics.empty:
        df_numerics_wide = df_numerics.unstack().fillna(0)
        df_numerics_wide.columns = df_numerics_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
        # standardization parameters kept on self, presumably for the
        # inverse transform — confirm against the rest of the class
        self._transformer_mean = df_numerics_wide.mean()
        self._transformer_std = df_numerics_wide.std()
        df_numerics_wide_standard = (df_numerics_wide - self._transformer_mean) / self._transformer_std
        dfs_to_merge.append(df_numerics_wide_standard)
    # merge categorical and numeric wide frames on id
    if len(dfs_to_merge) > 1:
        transformed_data = functools.reduce(lambda left, right: pd.merge(left, right, on=['id']), dfs_to_merge)
    else:
        transformed_data = dfs_to_merge[0]
    self.original_column_mapping = _generate_column_type_dictionary(data)
    self.column_mapping = _generate_column_type_dictionary(transformed_data)
    self.idx_mapping = {idx: self.column_mapping[x] for idx, x in enumerate(transformed_data.columns)}
    self._last_index = None
    self._category_idx_real_data = None
    # data will need to have targets for loss
    if not df_category.empty:
        self._last_index = transformed_data.shape[1]
        self._category_idx_real_data = {}
        end_idx = transformed_data.shape[1]
        for cat_col_name, cat_target_dataframe in target_dataframe_list.items():
            # append the argmax target columns after the feature columns and
            # record their absolute positions per category column
            transformed_data = pd.concat([transformed_data, cat_target_dataframe], axis=1)
            pos_info = column_idx_original[cat_col_name]
            self._category_idx_real_data[cat_col_name] = {name: pos + end_idx for pos, name in pos_info}
            end_idx = end_idx + cat_target_dataframe.shape[1]
    return transformed_data