class TestStatsFunctions(TestCase):

    def setUp(self):
        selected_attributes = {
            "Age": "C", "workclass": "D", "fnlwgt": "C", "education": "D",
            "education_num": "D", "marital_status": "D", "occupation": "D",
            "relationship": "D", "race": "D", "sex": "D", "capital_gain": "C",
            "capital_loss": "C", "hours_per_week": "C", "native_country": "D",
            "salary_class": "D"
        }
        self.data = DataUtils(c.TEST_ORIGIN_DATA_PATH,
                              selected_attrs=selected_attributes)
        self.data.data_coarsilize()
        self.stats_funcs = StatsFunctions()

    def test_histogram(self):
        cliques = [['capital_loss', 'salary_class'],
                   ['fnlwgt'],
                   ['workclass', 'occupation'],
                   ['education', 'education_num', 'salary_class'],
                   ['sex', 'hours_per_week', 'salary_class'],
                   ['occupation', 'sex', 'capital_gain', 'salary_class'],
                   ['marital_status', 'relationship', 'sex', 'salary_class'],
                   ['race', 'native_country']]
        self.stats_funcs.histogramdd_batch(self.data, cliques)
def post(self, request, format=None):
    """
    The request contains the data path.
    1. Use the data utilities to preview the given data.
    2. Retrieve all the fields and the first 5 rows.
    3. Return the fields and sample data to the client.
    """
    result = dict()
    req = request.data
    data = DataUtils(str(req['file_path']))
    if 'col_name' in req.keys():
        cnts, edges, dtype = data.get_histogram(req['col_name'])
        result['cnts'] = cnts
        result['edges'] = edges
        result['dtype'] = dtype
    else:
        col_names, dtypes, domain_size, data_size = data.data_preview()
        result['col_names'] = col_names
        result['dtypes'] = dtypes
        result['domain_size'] = domain_size
        result['data_size'] = data_size
    #result = json.dumps(result)
    return Response(result, status=status.HTTP_200_OK)
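# A hedged usage sketch for the preview endpoint above, exercised with the
# Django REST framework test client. The route '/api/data-preview/' and the
# sample file path are assumptions for illustration only, not taken from the
# project's URL configuration.
from rest_framework.test import APIClient

client = APIClient()

# Without 'col_name' the view returns the field names, dtypes, domain sizes
# and the row count of the file.
resp = client.post('/api/data-preview/',
                   {'file_path': '/path/to/adult.csv'},
                   format='json')
print resp.data['col_names']

# With 'col_name' it returns the histogram counts, bin edges and dtype of
# that single column instead.
resp = client.post('/api/data-preview/',
                   {'file_path': '/path/to/adult.csv', 'col_name': 'Age'},
                   format='json')
print resp.data['cnts']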
def test_get_query_count_with_same_results_cnt(self):
    data = DataUtils(file_path=TEST_DATA_PATH)
    df = data.get_pandas_df()
    result_cnt = [
        self.user_query.get_query_count(df, query) for query in self.queries
    ]
    self.assertEqual(result_cnt == [1278, 24339, 24339, 41415], True)
def __init__(self, sensitive_data):
    """
    Import the original data and initialize the utility measurement object.

    Parameters
    ----------
    sensitive_data: string
        The path to the original data.
    """
    self.LOG = Base.get_logger("UserQuery")
    sensitive = DataUtils(file_path=sensitive_data)
    self.sensitive_df = sensitive.get_pandas_df()
def get_coarse_data(self, task):
    # TODO: Read coarse data from a memory cache.
    # TODO: Deal with failure.
    folder = c.MEDIATE_DATA_DIR % {'task_id': task.task_id}
    file_path = os.path.join(folder, c.COARSE_DATA_NAME)
    data = DataUtils(file_path=file_path,
                     valbin_maps=ast.literal_eval(task.valbin_map),
                     selected_attrs=self.convert_selected_attrs(
                         task.selected_attrs))
    return data
def get_errors(self, synthetic_data, user_queries):
    """
    Find the errors of the given queries between the sensitive data and
    the synthetic data.

    Parameters
    ----------
    synthetic_data: string
        The path to the synthetic data.
    user_queries: list
        The list of user queries.

    Returns
    -------
    results: list
        The list of results corresponding to each query.
    """
    def get_one_error(df1, df2, query):
        import time
        t0 = time.time()
        try:
            len_df1_result = self.get_query_count(df1, query)
            len_df2_result = self.get_query_count(df2, query)
        except Exception as e:
            return str(e)
        if len_df1_result == 0:
            return 'inf'
        self.LOG.info("User query error measurement spends: %d seconds" %
                      (time.time() - t0))
        # Relative error; cast to float to avoid integer division on counts.
        return np.abs(len_df1_result - len_df2_result) / float(len_df1_result)

    # import synthetic data as dataframe
    synthetic = DataUtils(file_path=synthetic_data)
    synthetic_df = synthetic.get_pandas_df()
    results = [
        get_one_error(self.sensitive_df, synthetic_df, query)
        for query in user_queries
    ]
    return results
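# A hedged usage sketch for UserQuery. The file paths are placeholders, and
# the query strings are assumptions: these snippets do not show what syntax
# get_query_count actually accepts.
user_query = UserQuery('/path/to/original.csv')

queries = [
    "Age > 40",
    "education == 'Bachelors'",
]

# Each result is |count(original) - count(synthetic)| / count(original),
# the string 'inf' when the original count is zero, or the exception
# message when the query itself fails.
errors = user_query.get_errors('/path/to/synthetic.csv', queries)
print errors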
class JunctionTreeTests(TestCase):

    def setUp(self):
        self.data = DataUtils(TESTING_FILE)
        self.dep_graph = DependencyGraph(self.data)
        self.edges = self.dep_graph.get_dep_edges()
        self.nodes = self.data.get_nodes_name()
        self.jtree_path = c.TEST_JTREE_FILE_PATH

    def test_jtree_without_noise(self):
        dep_graph = DependencyGraph(self.data, noise_flag=False)
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques == [['HTN', 'Height'], ['HTN', 'Weight'],
                        ['Income', 'TRV'], ['Age'], ['DGF']], True)

    def test_jtree_with_white_list(self):
        dep_graph = DependencyGraph(self.data,
                                    white_list=[['Age', 'Income', 'TRV'],
                                                ['DGF', 'HTN']])
        edges = dep_graph.get_dep_edges()
        jtree = JunctionTree(edges, self.nodes, self.jtree_path)
        cliques = jtree.get_jtree()['cliques']
        self.assertEqual(
            cliques == [['HTN', 'Height'], ['HTN', 'Weight'],
                        ['HTN', 'DGF'], ['Income', 'TRV', 'Age']], True)

    def test_build_jtree_then_check_jtree_file(self):
        self.TestA()
        self.TestB()

    def TestA(self):
        """
        The dependency graph is a complete graph, so there is only one
        clique in the junction tree.
        """
        jtree = JunctionTree(self.edges, self.nodes, self.jtree_path)
        jtreepy = jtree.get_jtree()
        #print jtreepy
        self.assertEqual(len(jtreepy) == 3, True)

    def TestB(self):
        import os, time
        from stat import *
        st = os.stat(self.jtree_path)
        now = time.time()
        # TODO: Need to check whether this file was newly modified.
        #self.assertEqual((st.st_mtime - now) < 100000, True)
def data_generalize(self, dataframe, valbin_map, selected_attrs):
    data = DataUtils(pandas_df=dataframe,
                     valbin_maps=valbin_map,
                     selected_attrs=selected_attrs)
    data.data_generalize()
    return data.get_pandas_df()
class Anonymization(Base):

    def __init__(self, request, is_celery):
        self.privacy_level = request['privacy_level']
        self.epsilon = float(request['epsilon'])
        self.min_freq = float(request['min_freq']) if 'min_freq' in request.keys() else 0.
        self.exp_round = request['exp_round'] if 'exp_round' in request.keys() else None
        self.dp_id = request['dp_id']

        task = get_object_or_404(Task, pk=request['task_id']) if is_celery else request['task_id']
        self.task_id = task.task_id
        self.eps1_level = task.eps1_level
        self.data_path = task.data_path
        self.jtree_strct = ast.literal_eval(str(task.jtree_strct))
        self.opted_cluster = ast.literal_eval(str(task.opted_cluster))
        self.edges = ast.literal_eval(str(task.dep_graph))
        # This is the coarse domain.
        self.domain = task.domain if isinstance(task.domain, dict) \
            else collections.OrderedDict(ast.literal_eval(task.domain))
        self.valbin_map = ast.literal_eval(str(task.valbin_map))
        self.selected_attrs = task.selected_attrs if isinstance(task.selected_attrs, dict) \
            else self.convert_selected_attrs(task.selected_attrs)
        self.white_list = ast.literal_eval(str(task.white_list))
        self.nodes = self.domain.keys()

        self.histogramdds = None
        self.data = None
        self.sim_df = None
        self.statistics_err = None
        self.generalized_dataframe = None
        self.synthetic_path = None

    def get_coarse_data(self):
        # TODO: Read coarse data from a memory cache.
        # TODO: Deal with failure.
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        file_path = os.path.join(folder, c.COARSE_DATA_NAME)
        self.data = DataUtils(file_path=file_path,
                              valbin_maps=self.valbin_map,
                              selected_attrs=self.selected_attrs)

    def kaggregate(self):
        if self.min_freq > 0:
            # cluster_num = len(self.jtree_strct)
            # thresh = self.get_freq_thresh(epsilon, cluster_num, min_freq)
            thresh = self.min_freq
            self.data.aggregation(thresh)
            self.domain = self.data.get_domain()
            self.valbin_map = self.data.get_valbin_maps()

    def get_histograms(self):
        combined_queries = self.combine_cliques_for_query(self.jtree_strct,
                                                          self.opted_cluster)
        stats_func = StatsFunctions()
        self.histogramdds = stats_func.histogramdd_batch(self.data, combined_queries)

    def do_inference(self):
        inference = Inference(self.data,
                              self.get_jtree_file_path(self.task_id, self.eps1_level),
                              self.domain,
                              self.opted_cluster,
                              self.histogramdds,
                              self.epsilon)
        self.model = inference.execute()

    def simulate(self):
        simulator = Simulate(self.model, self.data.get_nrows())
        self.sim_df = simulator.run()

    def get_statistical_error(self):
        """
        Compute the mean and standard deviation error rates (both on the coarse data).

        Parameters
        ----------
        task_id: The task id used to retrieve the coarse data.
        sim_coarsed_df: The noised synthetic data.

        Returns
        -------
        {
            "A": 0.05,
            "B": 0.12,
            ...
        }
        """
        eps1 = self.eps1_level
        eps2 = self.epsilon
        white_list = self.white_list
        k = self.min_freq
        nodes = self.nodes

        # Read the original coarse data first, and make sure the column order matches.
        coarsed_df = self.data.get_pandas_df()
        sim_coarsed_df = self.sim_df[self.nodes]

        coarsed_df_mean = np.array(coarsed_df.mean(), dtype=float)
        coarsed_df_std = np.array(coarsed_df.std(), dtype=float)
        sim_df_mean = np.array(sim_coarsed_df.mean(), dtype=float)
        sim_df_std = np.array(sim_coarsed_df.std(), dtype=float)

        mean_error = np.abs((sim_df_mean - coarsed_df_mean) * 100 / coarsed_df_mean)
        std_error = np.abs((sim_df_std - coarsed_df_std) * 100 / coarsed_df_std)
        mean_error = [str(rate) + '%' for rate in np.round(mean_error, decimals=2)]
        std_error = [str(rate) + '%' for rate in np.round(std_error, decimals=2)]

        self.print_pretty_summary(nodes, mean_error, std_error, eps1, eps2,
                                  white_list, k)
        self.statistics_err = {
            'attrs': nodes,
            'measures': ['mean', 'std'],
            'values': {
                'mean': mean_error,
                'std': std_error
            }
        }

    def data_generalize(self):
        data = DataUtils(pandas_df=self.sim_df,
                         valbin_maps=self.valbin_map,
                         selected_attrs=self.selected_attrs)
        data.data_generalize()
        self.generalized_dataframe = data.get_pandas_df()

    def save_data(self):
        if self.exp_round:
            self.synthetic_path = self.save_sim_data_exp()
        else:
            self.synthetic_path = self.save_sim_data()

    def save_sim_data(self, spec_file_name=None):
        # TODO: deal with failure
        folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
        if not os.path.exists(folder):
            os.makedirs(folder)
        file_name = c.SIM_DATA_NAME_PATTERN % {'privacy_level': self.privacy_level}
        if spec_file_name is not None:
            file_name = spec_file_name
            # TODO: a parameter to specify no header output
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False, header=False)
        else:
            file_path = os.path.join(folder, file_name)
            self.generalized_dataframe.to_csv(file_path, index=False)
        return c.SIM_DATA_URI_PATTERN % {'task_id': self.task_id,
                                         'file_name': file_name}

    def save_sim_data_exp(self):
        spec_file_name = "sim_eps1lv_%(eps_lv)s_eps2lv_%(privacy_level)s_k_%(min_freq)s_round_%(exp_round)s.csv" % {
            'exp_round': self.exp_round,
            'privacy_level': self.privacy_level,
            'eps_lv': self.eps1_level,
            'min_freq': int(self.min_freq)
        }
        return self.save_sim_data(spec_file_name=spec_file_name)

    def print_pretty_summary(self, nodes, mean_error, std_error, eps1, eps2,
                             white_list, k):
        LOG = Base.get_logger("Statistical Accuracy Summary")
        import pandas as pd
        frame = pd.DataFrame({
            'Attributes': nodes,
            'Mean': mean_error,
            'STD': std_error
        })
        LOG.info("eps1: %.2f, eps2: %.2f" % (eps1, eps2))
        LOG.info("White List: %s" % str(white_list))
        LOG.info("k-aggregate value: %d" % k)
        LOG.info('\n' + str(frame))

    def create_instance(self):
        Job.objects.create(task_id=self.task_id,
                           privacy_level=self.privacy_level,
                           epsilon=self.epsilon,
                           synthetic_path=self.synthetic_path,
                           statistics_err=self.statistics_err)

    def update_instance(self, status, is_celery):
        if not is_celery:
            return
        instance = get_object_or_404(Job, pk=self.dp_id)
        instance.synthetic_path = self.synthetic_path
        instance.statistics_err = self.statistics_err
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
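# A hedged sketch of the assumed call order for Anonymization, pieced together
# from the methods above; the request values are placeholders and the Celery
# orchestration and error handling are omitted. With is_celery=False the
# 'task_id' entry is the Task row itself rather than its primary key.
request = {'task_id': task, 'privacy_level': 2, 'epsilon': 0.5,
           'min_freq': 0.0, 'dp_id': 1}          # 'task': a saved Task (placeholder)

anon = Anonymization(request, is_celery=False)
anon.get_coarse_data()         # load the coarse (binned) data saved by Preprocess
anon.kaggregate()              # optional k-aggregation when min_freq > 0
anon.get_histograms()          # histograms for the combined clique queries
anon.do_inference()            # fit the junction-tree model with budget epsilon
anon.simulate()                # sample a synthetic dataframe of the same size
anon.get_statistical_error()   # mean/std error rates against the coarse data
anon.data_generalize()         # map binned values back to the original domains
anon.save_data()               # write the synthetic CSV and record its URI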
class DataUtilitiesTests(TestCase):
    # TODO: The data coarsening and generalization steps should be separated,
    # to simulate a more realistic case.

    def setUp(self):
        self.selected_attrs = dict({
            'Age': 'C', 'workclass': 'D', 'fnlwgt': 'C', 'education': 'D',
            'education_num': 'D', 'marital_status': 'D', 'occupation': 'D',
            'relationship': 'D', 'race': 'D', 'sex': 'D', 'capital_gain': 'C',
            'capital_loss': 'C', 'hours_per_week': 'C', 'native_country': 'D',
            'salary_class': 'D'
        })
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()
        self.base = Base()

    def test_data_preview(self):
        data = DataUtils(file_path=TESTING_FILE)
        preview = data.data_preview()
        self.assertEqual(len(preview.values[0]) > 0, True)

    def test_read_data_by_three_selected_column(self):
        """ Test reading data with user-specified columns. """
        self.assertEqual(
            len(self.data.get_nodes_name()) == len(self.selected_attrs), True)

    def test_data_domain_keep_original_order(self):
        """ Test that the domain object keeps the same column order as the original raw data. """
        df = self.data.get_pandas_df()
        domain = self.data.get_domain()
        cols = domain.keys()
        self.assertEqual(cols == list(df.columns.values), True)

    def test_data_coarsilization(self):
        print self.data.get_pandas_df()[:5]

    def test_data_generalization(self):
        self.data.data_generalize()
        print self.data.get_pandas_df()[:5]

    def test_is_skip_pre_processing_with_create(self):
        create_flag = True
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, request, create_flag)
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_data_path_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file22222222.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_with_selected_attr_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['D', 'D', 'D', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        print skip_pre_process
        self.assertEqual(skip_pre_process == False, True)

    def test_is_skip_pre_processing_without_change(self):
        create_flag = False
        request = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        instance = {
            'data_path': '/path/to/dummy/file.csv',
            'selected_attrs': {
                'names': ['A', 'D', 'C', 'B'],
                'types': ['C', 'C', 'C', 'D']
            }
        }
        skip_pre_process = self.base.is_pre_process_skip(request, instance, create_flag)
        self.assertEqual(skip_pre_process == True, True)
class Preprocess(Base):

    def __init__(self, request):
        self.chunk_size = request['chunk_size'] if 'chunk_size' in request.keys() else -1
        self.coarse_data_path = None
        self.data = None
        self.data_path = request['data_path']
        self.date_format = request['selected_attrs']['date_format'] \
            if 'date_format' in request['selected_attrs'].keys() else None
        self.dep_graph = None  # original edges
        self.domain = None
        self.edges = None
        self.eps1_val = request['eps1_val'] if 'eps1_val' in request.keys() else c.EPSILON_1
        self.eps1_level = request['eps1_level'] if 'eps1_level' in request.keys() else 1
        self.jtree_strct = None
        self.jtree_file_path = None
        self.names = request['names'] if 'names' in request.keys() else None
        self.nodes = None
        self.opted_cluster = None
        self.selected_attrs = self.convert_selected_attrs(request['selected_attrs'])
        self.specified_c_domain = request['selected_attrs']['specified_c_domain'] \
            if 'specified_c_domain' in request['selected_attrs'].keys() else None
        self.task_id = request['task_id']
        self.task_folder = self.create_task_folder(self.task_id)
        self.valbin_map = None
        self.white_list = self.get_white_list(request)

    def read_data(self):
        self.data = DataUtils(file_path=self.data_path,
                              selected_attrs=self.selected_attrs,
                              names=self.names,
                              specified_c_domain=self.specified_c_domain,
                              chunk_size=self.chunk_size,
                              date_format=self.date_format)

    def coarse(self):
        self.data.data_coarsilize()
        self.domain = self.data.get_domain()
        self.nodes = self.data.get_nodes_name()
        self.valbin_map = str(self.data.get_valbin_maps())

    def build_dep_graph(self):
        # dependency graph
        dep_graph_obj = DependencyGraph(self.data, eps1_val=self.eps1_val)
        self.edges = dep_graph_obj.get_dep_edges(display=True)
        self.cust_edges = dep_graph_obj.set_white_list(self.white_list) \
                                       .get_dep_edges(display=True)
        self.dep_graph = str(self.edges)

    def get_white_list(self, request):
        white_list = request['white_list'] \
            if 'white_list' in request.keys() and len(request['white_list']) > 0 else "[]"
        if not isinstance(white_list, list):
            white_list = ast.literal_eval(white_list)
        return white_list

    def build_jtree(self):
        # junction tree
        jtree = JunctionTree(
            self.cust_edges,
            self.nodes,
            # the path to save junction tree file
            self.get_jtree_file_path(self.task_id, self.eps1_level),
        )
        # optimize marginal
        var_reduce = VarianceReduce(self.domain, jtree.get_jtree()['cliques'], 0.2)
        self.opted_cluster = var_reduce.main()
        self.jtree_strct = jtree.get_jtree()['cliques']
        self.jtree_file_path = self.save_merged_jtree(self.task_id,
                                                      self.eps1_level,
                                                      self.jtree_strct)

    def save_coarse_data(self):
        # TODO: to deal with failure
        file_path = os.path.join(self.task_folder, c.COARSE_DATA_NAME)
        if self.data is not None:
            self.data.save(file_path)
        self.coarse_data_path = file_path

    def update_instance(self, status, is_celery):
        if not is_celery:
            return
        instance = get_object_or_404(Task, pk=self.task_id)
        instance.eps1_val = self.eps1_val
        instance.eps1_level = self.eps1_level
        instance.dep_graph = str(self.edges)
        instance.valbin_map = str(self.valbin_map)
        instance.domain = str(self.domain.items()) if self.domain is not None else None
        instance.white_list = self.white_list
        instance.jtree_strct = str(self.jtree_strct)
        instance.opted_cluster = str(self.opted_cluster)
        instance.status = ProcessStatus.get_code(status)
        instance.end_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        instance.save()
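# A hedged sketch of the assumed call order for Preprocess; the request dict
# below is a placeholder that shows only the required keys used above, with
# illustrative values.
request = {
    'task_id': 42,
    'data_path': '/path/to/adult.csv',
    'selected_attrs': {'names': ['Age', 'sex', 'salary_class'],
                       'types': ['C', 'D', 'D']},
}

prep = Preprocess(request)
prep.read_data()          # load the raw file with the selected attributes
prep.coarse()             # bin the data and capture the domain / valbin maps
prep.build_dep_graph()    # noisy dependency graph (eps1) plus white-listed edges
prep.build_jtree()        # junction tree and variance-reduced marginal clusters
prep.save_coarse_data()   # persist the coarse data for the Anonymization step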
def setUp(self):
    nodes = ['Age', 'Height', 'Weight', 'Income', 'TRV', 'HTN', 'DGF']
    edges = [['Height', 'HTN'], ['Weight', 'HTN'], ['Income', 'TRV']]
    jtree = JunctionTree(edges, nodes)
    cliques = jtree.get_jtree()['cliques']
    opted_cluster = [['DGF'], ['Income', 'TRV'], ['Age'],
                     ['Height', 'HTN'], ['Weight', 'HTN']]
    combined_queries = self.combine_cliques_for_query(cliques, opted_cluster)
    stats_func = StatsFunctions()

    # Each attribute domain is a list of consecutive integer values
    # (range() returns exactly those lists under Python 2); only TRV skips
    # the value 1.
    domain = collections.OrderedDict([
        ('Age', range(22, 86)),
        ('Height', range(137, 201)),
        ('Weight', range(45, 109)),
        ('Income', range(20, 141)),
        ('TRV', [0] + range(2, 61)),
        ('HTN', [0, 1]),
        ('DGF', [0, 1]),
    ])
    data1 = DataUtils(file_path=TESTING_FILE)
    histogramdds = stats_func.histogramdd_batch(data1, combined_queries)
    self.inference = Inference(data1, JTREE_TEST_FILE, domain, opted_cluster,
                               histogramdds, 0.2)

    # The parsed domains are zero-based consecutive integer values.
    domain_parsed = collections.OrderedDict([
        ('Age', range(64)),
        ('Height', range(64)),
        ('Weight', range(64)),
        ('Income', range(121)),
        ('TRV', range(60)),
        ('HTN', [0, 1]),
        ('DGF', [0, 1]),
    ])
    data2 = DataUtils(file_path=TEST_PARSED_FILE)
    histogramdds = stats_func.histogramdd_batch(data2, combined_queries)
    self.inference_parsed = Inference(data2, JTREE_TEST_FILE, domain_parsed,
                                      opted_cluster, histogramdds, 0.2)