def test_k_anonymize_w_user(self):
    """Check ``Preserver.k_anonymize_w_user`` with k=3 against expected rows.

    The input frame and ``categorical`` come from ``init()``. The feature
    columns are overridden to ``column2``/``column3`` and ``column4`` is
    used as the sensitive column.

    Raises:
        AssertionError: if an expected row is missing from the result.
    """
    # The feature columns returned by init() are replaced just below, so
    # they are deliberately not bound here.
    df, _, categorical = init()
    feature_columns = ['column2', 'column3']
    sensitive_column = 'column4'
    schema = StructType([
        StructField("column1", IntegerType()),
        StructField("column2", StringType()),
        StructField("column3", StringType()),
        StructField("column4", StringType()),
        StructField("column5", IntegerType()),
    ])

    resultdf = Preserver.k_anonymize_w_user(
        df, 3, feature_columns, sensitive_column, categorical, schema)

    testdata = [
        [6, '1', 'test1,test2', 'x', 20],
        [6, '1', 'test1,test2', 'y', 30],
        [4, '1', 'test1,test2', 'y', 35],
        [8, '2', 'test2,test3', 'x', 50],
        [8, '2', 'test2,test3', 'x', 45],
        [4, '2', 'test2,test3', 'y', 20],
    ]
    testdf = spark.createDataFrame(testdata, schema=schema)

    # NOTE(review): exceptAll only yields expected rows that are absent from
    # the result, so extra rows in resultdf are not detected by this check.
    try:
        self.assertEqual(testdf.exceptAll(resultdf).count(), 0)
    except AssertionError:
        print("K-Anonymity function with user - Failed")
        # Re-raise so the test runner records the failure instead of
        # reporting a pass after the message is printed.
        raise
    else:
        print("K-Anonymity function with user - Passed")
def test2_k_anonymize(self):
    """Check ``Preserver.k_anonymize`` with k=3 and ``column5`` as sensitive.

    The input frame, feature columns and ``categorical`` all come from
    ``init()``. The expected frame carries the sensitive value as a double
    plus a ``count`` column.

    Raises:
        AssertionError: if an expected row is missing from the result.
    """
    df, feature_columns, categorical = init()
    sensitive_column = 'column5'
    schema = StructType([
        StructField("column1", StringType()),
        StructField("column2", StringType()),
        StructField("column3", StringType()),
        StructField("column5", DoubleType()),
        StructField("count", IntegerType()),
    ])

    resultdf = Preserver.k_anonymize(
        df, 3, feature_columns, sensitive_column, categorical, schema)

    testdata = [
        ["0-10", '1', 'test1,test2', 20.0, 1],
        ["0-10", '1', 'test1,test2', 30.0, 1],
        ["0-10", '1', 'test1,test2', 35.0, 1],
        ["0-10", '2', 'test3,test2', 20.0, 1],
        ["0-10", '2', 'test3,test2', 45.0, 1],
        ["0-10", '2', 'test3,test2', 50.0, 1],
    ]
    testdf = spark.createDataFrame(testdata, schema=schema)

    # NOTE(review): exceptAll only yields expected rows that are absent from
    # the result, so extra rows in resultdf are not detected by this check.
    try:
        self.assertEqual(testdf.exceptAll(resultdf).count(), 0)
    except AssertionError:
        print("K-Anonymity function 2 - Failed")
        # Re-raise so the test runner records the failure instead of
        # reporting a pass after the message is printed.
        raise
    else:
        print("K-Anonymity function 2 - Passed")
def test_user_anonymize(self):
    """Check ``Preserver.anonymize_user`` for user 4 in ``column1`` with k=2.

    The input frame and ``categorical`` come from ``init()``; ``column4`` is
    the sensitive column. Every column of the expected frame is declared as
    a string.

    Raises:
        AssertionError: if an expected row is missing from the result.
    """
    # anonymize_user takes no feature-column list, so the one returned by
    # init() is deliberately not bound here.
    df, _, categorical = init()
    sensitive_column = 'column4'
    schema = StructType([
        StructField("column1", StringType()),
        StructField("column2", StringType()),
        StructField("column3", StringType()),
        StructField("column4", StringType()),
        StructField("column5", StringType()),
    ])
    user = 4
    usercolumn_name = "column1"
    k = 2

    resultdf = Preserver.anonymize_user(
        df, k, user, usercolumn_name, sensitive_column, categorical, schema)

    testdata = [
        [6, '1', 'test1', 'x', '20'],
        [6, '1', 'test1', 'y', '30'],
        [8, '1,2', 'test2,test3', 'x', '20-55'],
        [8, '1,2', 'test2,test3', 'x', '20-55'],
        [4, '1,2', 'test2,test3', 'y', '20-55'],
        [4, '1,2', 'test2,test3', 'y', '20-55'],
    ]
    testdf = spark.createDataFrame(testdata, schema=schema)

    # NOTE(review): exceptAll only yields expected rows that are absent from
    # the result, so extra rows in resultdf are not detected by this check.
    try:
        self.assertEqual(testdf.exceptAll(resultdf).count(), 0)
    except AssertionError:
        print("User anonymize function - Failed")
        # Re-raise so the test runner records the failure instead of
        # reporting a pass after the message is printed.
        raise
    else:
        print("User anonymize function - Passed")
def mondrian_k_anonymization(k, input_dataframe, categorical_list,
                             feature_columns, sensitive_column):
    """Anonymize the selected columns and join the result back to the input.

    Args:
        k: passed through to ``Preserver.k_anonymize_w_user``.
        input_dataframe: source frame; ``fillna(0)`` is applied before the
            needed columns are selected.
        categorical_list: iterable of column names, converted to a set before
            being handed to the anonymizer.
        feature_columns: list of feature column names (copied, not mutated).
        sensitive_column: name of the sensitive column; also the join key.

    Returns:
        The selected input columns left-joined with the anonymized frame,
        whose columns carry an ``_anon`` suffix, on
        ``sensitive_column == sensitive_column + '_anon'``.
    """
    # Work on a copy so the caller's feature list is left untouched.
    output_columns = feature_columns.copy()
    output_columns.append(sensitive_column)

    selected = input_dataframe.fillna(0).select(*output_columns)
    df = selected.toDF(*output_columns)

    schema = get_anonymized_schema(df.schema, sensitive_column, output_columns)
    categorical = set(categorical_list)

    anonymized = Preserver.k_anonymize_w_user(
        df, k, feature_columns, sensitive_column, categorical, schema)

    # Suffix every anonymized column so it cannot clash with the originals
    # once both frames are joined.
    anon_names = [column_name + "_anon" for column_name in output_columns]
    df_ano = anonymized.toDF(*anon_names)

    join_condition = (
        df[sensitive_column] == df_ano[sensitive_column + '_anon'])
    return df.join(df_ano, join_condition, "left")
def test_t_closeness(self):
    """Check ``Preserver.t_closeness`` with k=3, t=0.2 against expected rows.

    The input frame, feature columns and ``categorical`` all come from
    ``init()``; ``column4`` is the sensitive column. The expected frame
    carries a ``count`` column alongside the sensitive value.

    Raises:
        AssertionError: if an expected row is missing from the result.
    """
    df, feature_columns, categorical = init()
    sensitive_column = 'column4'
    schema = StructType([
        StructField("column1", StringType()),
        StructField("column2", StringType()),
        StructField("column3", StringType()),
        StructField("column4", StringType()),
        StructField("count", IntegerType()),
    ])

    resultdf = Preserver.t_closeness(
        df, 3, 0.2, feature_columns, sensitive_column, categorical, schema)

    testdata = [
        ["0-10", '1', 'test1,test2', 'x', 1],
        ["0-10", '1', 'test1,test2', 'y', 2],
        ["0-10", '2', 'test3,test2', 'x', 2],
        ["0-10", '2', 'test3,test2', 'y', 1],
    ]
    testdf = spark.createDataFrame(testdata, schema=schema)

    # NOTE(review): exceptAll only yields expected rows that are absent from
    # the result, so extra rows in resultdf are not detected by this check.
    try:
        self.assertEqual(testdf.exceptAll(resultdf).count(), 0)
    except AssertionError:
        print("T-closeness function - Failed")
        # Re-raise so the test runner records the failure instead of
        # reporting a pass after the message is printed.
        raise
    else:
        print("T-closeness function - Passed")