def test_get_query_count_with_same_results_cnt(self):
    """The four canned queries must return the known row counts for the
    test dataset.

    IMPROVED: assert the lists directly instead of
    ``assertEqual(result_cnt == expected, True)`` — on failure unittest
    now reports the actual element-wise diff rather than ``False != True``.
    Pass/fail behavior is unchanged.
    """
    data = DataUtils(file_path=TEST_DATA_PATH)
    df = data.get_pandas_df()
    result_cnt = [
        self.user_query.get_query_count(df, query) for query in self.queries
    ]
    self.assertEqual(result_cnt, [1278, 24339, 24339, 41415])
def __init__(self, sensitive_data):
    """Set up the utility-measurement object from the original data.

    Parameters
    ----------
    sensitive_data: string
        The path to the original (sensitive) data file.
    """
    self.LOG = Base.get_logger("UserQuery")
    # Load the sensitive data once; queries run against this dataframe.
    loader = DataUtils(file_path=sensitive_data)
    self.sensitive_df = loader.get_pandas_df()
def __init__(self, sensitive_data):
    """Initialize the utility measurement object.

    Reads the original data from *sensitive_data* (a file path string)
    and caches it as a pandas dataframe for later query evaluation.
    """
    self.LOG = Base.get_logger("UserQuery")
    source = DataUtils(file_path=sensitive_data)
    self.sensitive_df = source.get_pandas_df()
def get_errors(self, synthetic_data, user_queries):
    """
    Find the errors of the given queries between sensitive data and
    synthetic data.

    Parameters
    ----------
    synthetic_data: string
        The path to the synthetic data.
    user_queries: list
        The list of user queries.

    Returns
    -------
    results: list
        One entry per query: the relative count error (float), the
        string 'inf' when the sensitive count is zero (error is
        undefined/infinite), or the exception message when the query
        itself fails.
    """
    def get_one_error(df1, df2, query):
        import time
        t0 = time.time()
        try:
            len_df1_result = self.get_query_count(df1, query)
            # BUG FIX: the second count must be taken from the synthetic
            # frame df2 — the original queried df1 twice, so the
            # reported error was always 0.
            len_df2_result = self.get_query_count(df2, query)
        except Exception as e:
            return str(e)
        if len_df1_result == 0:
            # BUG FIX: was the typo 'inif'; the sentinel for an
            # undefined (infinite) relative error is 'inf'.
            return 'inf'
        self.LOG.info("User query error measurement spends: %d seconds" %
                      (time.time() - t0))
        return np.abs(len_df1_result - len_df2_result) / len_df1_result

    # import synthetic data as dataframe
    synthetic = DataUtils(file_path=synthetic_data)
    synthetic_df = synthetic.get_pandas_df()
    results = [
        get_one_error(self.sensitive_df, synthetic_df, query)
        for query in user_queries
    ]
    return results
def get_errors(self, synthetic_data, user_queries):
    """
    Find the errors of the given queries between sensitive data and
    synthetic data.

    Parameters
    ----------
    synthetic_data: string
        The path to the synthetic data.
    user_queries: list
        The list of user queries.

    Returns
    -------
    results: list
        One result per query: the relative count error, 'inf' when the
        sensitive count is zero, or the exception text on failure.
    """
    def get_one_error(df1, df2, query):
        import time
        t0 = time.time()
        try:
            len_df1_result = self.get_query_count(df1, query)
            # BUG FIX: count from df2 (synthetic); the original counted
            # df1 twice, making every reported error 0.
            len_df2_result = self.get_query_count(df2, query)
        except Exception as e:
            return str(e)
        if len_df1_result == 0:
            # BUG FIX: corrected the 'inif' typo to 'inf'.
            return 'inf'
        self.LOG.info("User query error measurement spends: %d seconds" %
                      (time.time() - t0))
        return np.abs(len_df1_result - len_df2_result) / len_df1_result

    # import synthetic data as dataframe
    synthetic = DataUtils(file_path=synthetic_data)
    synthetic_df = synthetic.get_pandas_df()
    return [
        get_one_error(self.sensitive_df, synthetic_df, query)
        for query in user_queries
    ]
def data_generalize(self):
    """Map the simulated coarse records back through the value-bin maps
    and store the generalized dataframe on the instance."""
    utils = DataUtils(pandas_df=self.sim_df,
                      valbin_maps=self.valbin_map,
                      selected_attrs=self.selected_attrs)
    utils.data_generalize()
    self.generalized_dataframe = utils.get_pandas_df()
class Anonymization(Base):
    """Driver for one anonymization run of a task at a given privacy level.

    Pulls the preprocessing artifacts (junction tree structure, clusters,
    domain, value-bin maps, dependency graph) off a Task record, then the
    caller runs the pipeline steps in order: get_coarse_data ->
    kaggregate -> get_histograms -> do_inference -> simulate ->
    get_statistical_error -> data_generalize -> save_data.
    """

    def __init__(self, request, is_celery):
        # Parameters supplied directly by the request payload.
        self.privacy_level = request['privacy_level']
        self.epsilon = float(request['epsilon'])
        # Optional knobs: k-aggregation threshold and experiment round.
        self.min_freq = float(request['min_freq']) if 'min_freq' in request.keys() else 0.
        self.exp_round = request['exp_round'] if 'exp_round' in request.keys() else None
        self.dp_id = request['dp_id']
        # NOTE(review): in the celery path 'task_id' is a Task primary
        # key; otherwise it must already be a Task-like object, since
        # task.task_id etc. are read below — confirm with callers.
        task = get_object_or_404(Task, pk=request['task_id']) if is_celery else request['task_id']
        self.task_id = task.task_id
        self.eps1_level = task.eps1_level
        self.data_path = task.data_path
        # These fields are persisted as strings; evaluate them back into
        # Python objects.
        self.jtree_strct = ast.literal_eval(str(task.jtree_strct))
        self.opted_cluster = ast.literal_eval(str(task.opted_cluster))
        self.edges = ast.literal_eval(str(task.dep_graph))
        # This is the coarsed domain (attribute -> coarse value set).
        self.domain = task.domain if isinstance(task.domain, dict) else collections.OrderedDict(ast.literal_eval(task.domain))
        self.valbin_map = ast.literal_eval(str(task.valbin_map))
        self.selected_attrs = task.selected_attrs if isinstance(task.selected_attrs, dict) else self.convert_selected_attrs(task.selected_attrs)
        self.white_list = ast.literal_eval(str(task.white_list))
        self.nodes = self.domain.keys()
        # Populated by the pipeline steps below.
        self.histogramdds = None
        self.data = None
        self.sim_df = None
        self.statistics_err = None
        self.generalized_dataframe = None
        self.synthetic_path = None

    def get_coarse_data(self):
        """Load this task's coarsened data file into self.data."""
        # TODO: Read coarse data from memory cach.
        # TODO: To deal with failure.
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        file_path = os.path.join(folder, c.COARSE_DATA_NAME)
        self.data = DataUtils(
            file_path=file_path,
            valbin_maps=self.valbin_map,
            selected_attrs=self.selected_attrs
        )

    def kaggregate(self):
        """Aggregate low-frequency values when min_freq > 0, then refresh
        the domain and value-bin maps from the aggregated data."""
        if self.min_freq > 0:
            # cluster_num = len(self.jtree_strct)
            # thresh = self.get_freq_thresh(epsilon, cluster_num, min_freq)
            thresh = self.min_freq
            self.data.aggregation(thresh)
            self.domain = self.data.get_domain()
            self.valbin_map = self.data.get_valbin_maps()

    def get_histograms(self):
        """Build the batched multi-dimensional histograms for the
        combined junction-tree clique queries."""
        combined_queries = self.combine_cliques_for_query(self.jtree_strct, self.opted_cluster)
        stats_func = StatsFunctions()
        self.histogramdds = stats_func.histogramdd_batch(self.data, combined_queries)

    def do_inference(self):
        """Fit the model from the histograms under budget epsilon."""
        inference = Inference(
            self.data,
            self.get_jtree_file_path(self.task_id, self.eps1_level),
            self.domain,
            self.opted_cluster,
            self.histogramdds,
            self.epsilon)
        self.model = inference.execute()

    def simulate(self):
        """Sample a synthetic dataframe of the original row count."""
        simulator = Simulate(self.model, self.data.get_nrows())
        self.sim_df = simulator.run()

    def get_statistical_error(self):
        """
        Compute per-attribute mean and standard-deviation error rates
        between the coarse data and the simulated (noised) data.

        The result is stored in self.statistics_err (not returned) as:
            {
                'attrs': [...],
                'measures': ['mean', 'std'],
                'values': {'mean': [...], 'std': [...]}
            }
        where each error is a percentage string such as '0.05%'.
        """
        eps1 = self.eps1_level
        eps2 = self.epsilon
        white_list = self.white_list
        k = self.min_freq
        nodes = self.nodes
        # read the original coarse data first.
        coarsed_df = self.data.get_pandas_df()
        # make sure the order matches the coarse data's columns
        sim_coarsed_df = self.sim_df[self.nodes]
        coarsed_df_mean = np.array(coarsed_df.mean(), dtype=float)
        coarsed_df_std = np.array(coarsed_df.std(), dtype=float)
        sim_df_mean = np.array(sim_coarsed_df.mean(), dtype=float)
        sim_df_std = np.array(sim_coarsed_df.std(), dtype=float)
        # Relative errors, expressed as percentages.
        mean_error = np.abs((sim_df_mean - coarsed_df_mean) * 100 / coarsed_df_mean)
        std_error = np.abs((sim_df_std - coarsed_df_std) * 100 / coarsed_df_std)
        mean_error = [str(rate) + '%' for rate in np.round(mean_error, decimals=2)]
        std_error = [str(rate) + '%' for rate in np.round(std_error, decimals=2)]
        self.print_pretty_summary(nodes, mean_error, std_error, eps1, eps2, white_list, k)
        self.statistics_err = {
            'attrs': nodes,
            'measures': ['mean', 'std'],
            'values': {
                'mean': mean_error,
                'std': std_error
            }
        }

    def data_generalize(self):
        """Map the simulated coarse values back to the original domain."""
        data = DataUtils(pandas_df=self.sim_df, valbin_maps=self.valbin_map, selected_attrs=self.selected_attrs)
        data.data_generalize()
        self.generalized_dataframe = data.get_pandas_df()

    def save_data(self):
        """Persist the synthetic data; experiment rounds use a dedicated
        file-name pattern."""
        if self.exp_round:
            self.synthetic_path = self.save_sim_data_exp()
        else:
            self.synthetic_path = self.save_sim_data()

    def save_sim_data(self, spec_file_name=None):
        """Write the generalized dataframe to CSV and return its URI.

        When spec_file_name is given the file is written without a
        header row; otherwise the default per-privacy-level name is used
        and the header is kept.
        """
        # TODO: to deal with failure
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        if not os.path.exists(folder):
            os.makedirs(folder)
        file_name = c.SIM_DATA_NAME_PATTERN % {'privacy_level': self.privacy_level}
        if spec_file_name is not None:
            file_name = spec_file_name
            # TODO: a parameter to specify no header output
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False, header=False)
        else:
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False)
        return c.SIM_DATA_URI_PATTERN % {'task_id': self.task_id, 'file_name': file_name}

    def save_sim_data_exp(self):
        """Save under an experiment-specific file name encoding the
        epsilon levels, k value and round number."""
        spec_file_name = "sim_eps1lv_%(eps_lv)s_eps2lv_%(privacy_level)s_k_%(min_freq)s_round_%(exp_round)s.csv" % {
            'exp_round': self.exp_round,
            'privacy_level': self.privacy_level,
            'eps_lv': self.eps1_level,
            'min_freq': int(self.min_freq)
        }
        return self.save_sim_data(spec_file_name=spec_file_name)

    def print_pretty_summary(self, nodes, mean_error, std_error, eps1, eps2, white_list, k):
        """Log a tabular per-attribute summary of the error rates."""
        LOG = Base.get_logger("Statical Accuracy Summary")
        import pandas as pd
        frame = pd.DataFrame({
            'Attribures': nodes,
            'Mean': mean_error,
            'STD': std_error
        })
        LOG.info("eps1: %.2f, eps2: %.2f" % (eps1, eps2))
        LOG.info("White List: %s" % str(white_list))
        LOG.info("k-aggregate value: %d" % k)
        LOG.info('\n' + str(frame))

    def create_instance(self):
        """Create a Job row recording this anonymization run."""
        Job.objects.create(
            task_id=self.task_id,
            privacy_level=self.privacy_level,
            epsilon=self.epsilon,
            synthetic_path=self.synthetic_path,
            statistics_err=self.statistics_err
        )

    def update_instance(self, status, is_celery):
        """Write results and final status back to the Job row.

        No-op outside the celery path (nothing to persist).
        """
        if not is_celery:
            return
        instance = get_object_or_404(Job, pk=self.dp_id)
        instance.synthetic_path = self.synthetic_path
        instance.statistics_err = self.statistics_err
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
class DataUtilitiesTests(TestCase):
    """Tests for DataUtils coarsening/generalization and for
    Base.is_pre_process_skip (Python 2 — uses print statements)."""
    # TODO: The Data Coarse and Generalize step should seperate, to simulate a more real case.

    def setUp(self):
        # Attribute -> type code map; 'C' and 'D' presumably mean
        # continuous and discrete — TODO confirm against DataUtils.
        self.selected_attrs = dict({
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        })
        self.data = DataUtils(file_path=TESTING_FILE, selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()
        self.base = Base()

    def test_data_preview(self):
        """Previewing the raw file yields at least one non-empty row."""
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertEqual(len(preview.values[0]) > 0, True)

    def test_read_data_by_three_selected_column(self):
        """
        Test the read data by user specified columns
        """
        self.assertEqual(len(self.data.get_nodes_name()) == len(self.selected_attrs), True)

    def test_data_domain_keep_original_order(self):
        """
        Test the order in domain object is in same order with original raw data.
        """
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        cols = domain.keys()
        self.assertEqual(cols == list(df.columns.values), True)

    def test_data_coarsilization(self):
        # Visual spot check only — prints the first rows, asserts nothing.
        print self.data.get_pandas_df()[:5]

    def test_data_generalization(self):
        # Visual spot check of the generalized frame; no assertion.
        self.data.data_generalize()
        print self.data.get_pandas_df()[:5]

    def test_is_skip_pre_processing_with_create(self):
        """A create request must never skip pre-processing."""
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        # NOTE(review): 'instance' is built but the call below passes
        # 'request' twice — confirm whether 'instance' was intended.
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, request, create_flag)
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_data_path_change(self):
        """A changed data path must force pre-processing."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_selected_attr_change(self):
        """Changed attribute types must force pre-processing."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_without_change(self):
        """Identical request and instance may skip pre-processing."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        self.assertEqual(skip_pre_process == True, True)
def data_generalize(self, dataframe, valbin_map, selected_attrs):
    """Return a generalized copy of *dataframe*.

    Wraps the frame in a DataUtils with the given value-bin maps and
    selected attributes, runs its generalization step, and hands back
    the resulting pandas dataframe.
    """
    utils = DataUtils(pandas_df=dataframe,
                      valbin_maps=valbin_map,
                      selected_attrs=selected_attrs)
    utils.data_generalize()
    return utils.get_pandas_df()
def test_get_query_count_with_same_results_cnt(self):
    """Each canned query must count the expected number of rows.

    IMPROVED: compare the lists with a direct assertEqual so failures
    show the mismatching counts instead of ``False != True``; the
    pass/fail outcome is identical to the original assertion.
    """
    data = DataUtils(file_path=TEST_DATA_PATH)
    df = data.get_pandas_df()
    result_cnt = [
        self.user_query.get_query_count(df, query) for query in self.queries
    ]
    self.assertEqual(result_cnt, [1278, 24339, 24339, 41415])
class DataUtilitiesTests(TestCase):
    """Exercises DataUtils coarse/generalize behavior and the
    pre-processing-skip decision in Base (Python 2 code: print
    statements are used for visual spot checks)."""
    # TODO: The Data Coarse and Generalize step should seperate, to simulate a more real case.

    def setUp(self):
        # Column -> type code; 'C'/'D' presumably continuous/discrete —
        # TODO confirm against DataUtils.
        self.selected_attrs = dict({
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        })
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()
        self.base = Base()

    def test_data_preview(self):
        """A preview of the raw file has at least one non-empty row."""
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertEqual(len(preview.values[0]) > 0, True)

    def test_read_data_by_three_selected_column(self):
        """
        Test the read data by user specified columns
        """
        self.assertEqual(
            len(self.data.get_nodes_name()) == len(self.selected_attrs), True)

    def test_data_domain_keep_original_order(self):
        """
        Test the order in domain object is in same order with original raw data.
        """
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        cols = domain.keys()
        self.assertEqual(cols == list(df.columns.values), True)

    def test_data_coarsilization(self):
        # Prints the first coarsened rows; no assertion is made.
        print self.data.get_pandas_df()[:5]

    def test_data_generalization(self):
        # Prints the first generalized rows; no assertion is made.
        self.data.data_generalize()
        print self.data.get_pandas_df()[:5]

    def test_is_skip_pre_processing_with_create(self):
        """Creating always runs pre-processing (skip == False)."""
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        # NOTE(review): 'instance' is unused; the call passes 'request'
        # twice — verify that was intentional.
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, request, create_flag)
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_data_path_change(self):
        """A different data path forces pre-processing."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_selected_attr_change(self):
        """Different attribute types force pre-processing."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_without_change(self):
        """Identical request and stored instance allow skipping."""
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(
            request, instance, create_flag)
        self.assertEqual(skip_pre_process == True, True)
def data_generalize(self):
    """Generalize the simulated data back to original value ranges.

    Builds a DataUtils around self.sim_df with the current value-bin
    maps, runs its generalization pass, and caches the result in
    self.generalized_dataframe.
    """
    wrapper = DataUtils(pandas_df=self.sim_df,
                        valbin_maps=self.valbin_map,
                        selected_attrs=self.selected_attrs)
    wrapper.data_generalize()
    self.generalized_dataframe = wrapper.get_pandas_df()
class Anonymization(Base):
    """One anonymization run for a task at a given privacy level.

    Reconstructs the preprocessing artifacts stored on the Task model
    (junction tree, clusters, domain, value-bin maps, dependency graph)
    and exposes the pipeline steps the caller invokes in sequence:
    get_coarse_data, kaggregate, get_histograms, do_inference, simulate,
    get_statistical_error, data_generalize, save_data.
    """

    def __init__(self, request, is_celery):
        # Values taken directly from the request payload.
        self.privacy_level = request['privacy_level']
        self.epsilon = float(request['epsilon'])
        # Optional settings with defaults.
        self.min_freq = float(
            request['min_freq']) if 'min_freq' in request.keys() else 0.
        self.exp_round = request['exp_round'] if 'exp_round' in request.keys(
        ) else None
        self.dp_id = request['dp_id']
        # NOTE(review): celery path resolves a Task by pk; otherwise the
        # value itself must already behave like a Task (attributes are
        # read below) — confirm with callers.
        task = get_object_or_404(
            Task, pk=request['task_id']) if is_celery else request['task_id']
        self.task_id = task.task_id
        self.eps1_level = task.eps1_level
        self.data_path = task.data_path
        # Persisted as strings on the model; parse back to objects.
        self.jtree_strct = ast.literal_eval(str(task.jtree_strct))
        self.opted_cluster = ast.literal_eval(str(task.opted_cluster))
        self.edges = ast.literal_eval(str(task.dep_graph))
        self.domain = task.domain if isinstance(
            task.domain, dict) else collections.OrderedDict(
                ast.literal_eval(task.domain))  # This is the coarsed domain
        self.valbin_map = ast.literal_eval(str(task.valbin_map))
        self.selected_attrs = task.selected_attrs if isinstance(
            task.selected_attrs, dict) else self.convert_selected_attrs(
                task.selected_attrs)
        self.white_list = ast.literal_eval(str(task.white_list))
        self.nodes = self.domain.keys()
        # Filled in by the pipeline steps.
        self.histogramdds = None
        self.data = None
        self.sim_df = None
        self.statistics_err = None
        self.generalized_dataframe = None
        self.synthetic_path = None

    def get_coarse_data(self):
        """Load the task's coarsened data file into self.data."""
        # TODO: Read coarse data from memory cach.
        # TODO: To deal with failure.
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        file_path = os.path.join(folder, c.COARSE_DATA_NAME)
        self.data = DataUtils(file_path=file_path,
                              valbin_maps=self.valbin_map,
                              selected_attrs=self.selected_attrs)

    def kaggregate(self):
        """When min_freq > 0, aggregate rare values and refresh the
        domain and value-bin maps from the aggregated data."""
        if self.min_freq > 0:
            # cluster_num = len(self.jtree_strct)
            # thresh = self.get_freq_thresh(epsilon, cluster_num, min_freq)
            thresh = self.min_freq
            self.data.aggregation(thresh)
            self.domain = self.data.get_domain()
            self.valbin_map = self.data.get_valbin_maps()

    def get_histograms(self):
        """Compute batched histograms for the combined clique queries."""
        combined_queries = self.combine_cliques_for_query(
            self.jtree_strct, self.opted_cluster)
        stats_func = StatsFunctions()
        self.histogramdds = stats_func.histogramdd_batch(
            self.data, combined_queries)

    def do_inference(self):
        """Fit the model from the histograms with budget epsilon."""
        inference = Inference(
            self.data,
            self.get_jtree_file_path(self.task_id, self.eps1_level),
            self.domain,
            self.opted_cluster,
            self.histogramdds,
            self.epsilon)
        self.model = inference.execute()

    def simulate(self):
        """Sample a synthetic dataframe with the original row count."""
        simulator = Simulate(self.model, self.data.get_nrows())
        self.sim_df = simulator.run()

    def get_statistical_error(self):
        """
        Compute per-attribute mean and std error rates between the
        coarse data and the simulated (noised) data.

        The result is stored on self.statistics_err (not returned):
            {'attrs': [...], 'measures': ['mean', 'std'],
             'values': {'mean': [...], 'std': [...]}}
        with each rate formatted as a percentage string ('0.05%').
        """
        eps1 = self.eps1_level
        eps2 = self.epsilon
        white_list = self.white_list
        k = self.min_freq
        nodes = self.nodes
        # read the original coarse data first.
        coarsed_df = self.data.get_pandas_df()
        # make sure the order matches the coarse columns
        sim_coarsed_df = self.sim_df[self.nodes]
        coarsed_df_mean = np.array(coarsed_df.mean(), dtype=float)
        coarsed_df_std = np.array(coarsed_df.std(), dtype=float)
        sim_df_mean = np.array(sim_coarsed_df.mean(), dtype=float)
        sim_df_std = np.array(sim_coarsed_df.std(), dtype=float)
        # Relative errors as percentages.
        mean_error = np.abs(
            (sim_df_mean - coarsed_df_mean) * 100 / coarsed_df_mean)
        std_error = np.abs(
            (sim_df_std - coarsed_df_std) * 100 / coarsed_df_std)
        mean_error = [
            str(rate) + '%' for rate in np.round(mean_error, decimals=2)
        ]
        std_error = [
            str(rate) + '%' for rate in np.round(std_error, decimals=2)
        ]
        self.print_pretty_summary(nodes, mean_error, std_error, eps1, eps2,
                                  white_list, k)
        self.statistics_err = {
            'attrs': nodes,
            'measures': ['mean', 'std'],
            'values': {
                'mean': mean_error,
                'std': std_error
            }
        }

    def data_generalize(self):
        """Map simulated coarse values back to the original domain."""
        data = DataUtils(pandas_df=self.sim_df,
                         valbin_maps=self.valbin_map,
                         selected_attrs=self.selected_attrs)
        data.data_generalize()
        self.generalized_dataframe = data.get_pandas_df()

    def save_data(self):
        """Persist the synthetic data; experiment rounds use the
        experiment-specific naming."""
        if self.exp_round:
            self.synthetic_path = self.save_sim_data_exp()
        else:
            self.synthetic_path = self.save_sim_data()

    def save_sim_data(self, spec_file_name=None):
        """Write the generalized dataframe to CSV and return its URI.

        A caller-supplied spec_file_name also suppresses the header row;
        the default name keeps it.
        """
        # TODO: to deal with failure
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        if not os.path.exists(folder):
            os.makedirs(folder)
        file_name = c.SIM_DATA_NAME_PATTERN % {
            'privacy_level': self.privacy_level
        }
        if spec_file_name is not None:
            file_name = spec_file_name
            # TODO: a parameter to specify no header output
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path,
                                              index=False,
                                              header=False)
        else:
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False)
        return c.SIM_DATA_URI_PATTERN % {
            'task_id': self.task_id,
            'file_name': file_name
        }

    def save_sim_data_exp(self):
        """Save under a file name encoding eps levels, k and round."""
        spec_file_name = "sim_eps1lv_%(eps_lv)s_eps2lv_%(privacy_level)s_k_%(min_freq)s_round_%(exp_round)s.csv" % {
            'exp_round': self.exp_round,
            'privacy_level': self.privacy_level,
            'eps_lv': self.eps1_level,
            'min_freq': int(self.min_freq)
        }
        return self.save_sim_data(spec_file_name=spec_file_name)

    def print_pretty_summary(self, nodes, mean_error, std_error, eps1, eps2,
                             white_list, k):
        """Log a tabular summary of per-attribute error rates."""
        LOG = Base.get_logger("Statical Accuracy Summary")
        import pandas as pd
        frame = pd.DataFrame({
            'Attribures': nodes,
            'Mean': mean_error,
            'STD': std_error
        })
        LOG.info("eps1: %.2f, eps2: %.2f" % (eps1, eps2))
        LOG.info("White List: %s" % str(white_list))
        LOG.info("k-aggregate value: %d" % k)
        LOG.info('\n' + str(frame))

    def create_instance(self):
        """Create the Job row recording this run."""
        Job.objects.create(task_id=self.task_id,
                           privacy_level=self.privacy_level,
                           epsilon=self.epsilon,
                           synthetic_path=self.synthetic_path,
                           statistics_err=self.statistics_err)

    def update_instance(self, status, is_celery):
        """Write results and final status back to the Job row; no-op
        outside the celery path."""
        if not is_celery:
            return
        instance = get_object_or_404(Job, pk=self.dp_id)
        instance.synthetic_path = self.synthetic_path
        instance.statistics_err = self.statistics_err
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
def data_generalize(self, dataframe, valbin_map, selected_attrs):
    """Generalize *dataframe* and return the resulting pandas frame.

    The frame is wrapped in a DataUtils configured with the given
    value-bin maps and selected attributes before its generalization
    step runs.
    """
    holder = DataUtils(pandas_df=dataframe,
                       valbin_maps=valbin_map,
                       selected_attrs=selected_attrs)
    holder.data_generalize()
    return holder.get_pandas_df()