def read_data(self):
    self.data = DataUtils(file_path=self.data_path,
                          selected_attrs=self.selected_attrs,
                          names=self.names,
                          specified_c_domain=self.specified_c_domain,
                          chunk_size=self.chunk_size,
                          date_format=self.date_format)
def get_coarse_data(self):
    # TODO: Read coarse data from a memory cache.
    # TODO: Handle failures.
    folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
    file_path = os.path.join(folder, c.COARSE_DATA_NAME)
    self.data = DataUtils(file_path=file_path,
                          valbin_maps=self.valbin_map,
                          selected_attrs=self.selected_attrs)
def test_get_query_count_with_same_results_cnt(self):
    data = DataUtils(file_path=TEST_DATA_PATH)
    df = data.get_pandas_df()
    result_cnt = [
        self.user_query.get_query_count(df, query) for query in self.queries
    ]
    self.assertEqual(result_cnt, [1278, 24339, 24339, 41415])
def get_coarse_data(self, task):
    # TODO: Read coarse data from a memory cache.
    # TODO: Handle failures.
    folder = c.MEDIATE_DATA_DIR % {'task_id': task.task_id}
    file_path = os.path.join(folder, c.COARSE_DATA_NAME)
    data = DataUtils(file_path=file_path,
                     valbin_maps=ast.literal_eval(task.valbin_map),
                     selected_attrs=self.convert_selected_attrs(
                         task.selected_attrs))
    return data
def __init__(self, sensitive_data):
    """Import the original data and initialize the utility measurement object.

    Parameters
    ----------
    sensitive_data : string
        The path to the original data.
    """
    self.LOG = Base.get_logger("UserQuery")
    sensitive = DataUtils(file_path=sensitive_data)
    self.sensitive_df = sensitive.get_pandas_df()
def get_errors(self, synthetic_data, user_queries):
    """Find the errors of the given queries between the sensitive data and the synthetic data.

    Parameters
    ----------
    synthetic_data : string
        The path to the synthetic data.
    user_queries : list
        The list of user queries.

    Returns
    -------
    results : list
        The list of results corresponding to each query.
    """
    import time

    def get_one_error(df1, df2, query):
        t0 = time.time()
        try:
            len_df1_result = self.get_query_count(df1, query)
            # Was self.get_query_count(df1, query), which compared the
            # sensitive data with itself and always yielded zero error.
            len_df2_result = self.get_query_count(df2, query)
        except Exception as e:
            return str(e)
        if len_df1_result == 0:
            # Relative error is undefined when the sensitive count is zero.
            return 'inf'
        self.LOG.info("User query error measurement spends: %d seconds" %
                      (time.time() - t0))
        return np.abs(len_df1_result - len_df2_result) / len_df1_result

    # Import the synthetic data as a dataframe.
    synthetic = DataUtils(file_path=synthetic_data)
    synthetic_df = synthetic.get_pandas_df()
    results = [
        get_one_error(self.sensitive_df, synthetic_df, query)
        for query in user_queries
    ]
    return results
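# A minimal usage sketch for the two UserQuery methods above. The file paths
# and the "user_queries" list here are hypothetical placeholders; only the
# constructor signature, get_errors, and the relative-error semantics come
# from the code itself, and the query format is assumed to be whatever
# get_query_count expects elsewhere in this suite:
#
#     uq = UserQuery("/path/to/sensitive.csv")
#     per_query_errors = uq.get_errors("/path/to/synthetic.csv", user_queries)
#     # Each entry is |count_sensitive - count_synthetic| / count_sensitive
#     # for one query, the string 'inf' when the sensitive count is zero,
#     # or the stringified exception if the query failed.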
def setUp(self):
    selected_attributes = {
        "Age": "C",
        "workclass": "D",
        "fnlwgt": "C",
        "education": "D",
        "education_num": "D",
        "marital_status": "D",
        "occupation": "D",
        "relationship": "D",
        "race": "D",
        "sex": "D",
        "capital_gain": "C",
        "capital_loss": "C",
        "hours_per_week": "C",
        "native_country": "D",
        "salary_class": "D"
    }
    self.data = DataUtils(c.TEST_ORIGIN_DATA_PATH,
                          selected_attrs=selected_attributes)
    self.data.data_coarsilize()
    self.stats_funcs = StatsFunctions()
def setUp(self):
    self.selected_attrs = {
        'Age': 'C',
        'workclass': 'D',
        'fnlwgt': 'C',
        'education': 'D',
        'education_num': 'D',
        'marital_status': 'D',
        'occupation': 'D',
        'relationship': 'D',
        'race': 'D',
        'sex': 'D',
        'capital_gain': 'C',
        'capital_loss': 'C',
        'hours_per_week': 'C',
        'native_country': 'D',
        'salary_class': 'D'
    }
    self.data = DataUtils(file_path=TESTING_FILE,
                          selected_attrs=self.selected_attrs)
    self.data.data_coarsilize()
    self.base = Base()
def setUp(self):
    nodes = ['Age', 'Height', 'Weight', 'Income', 'TRV', 'HTN', 'DGF']
    edges = [['Height', 'HTN'], ['Weight', 'HTN'], ['Income', 'TRV']]
    jtree = JunctionTree(edges, nodes)
    cliques = jtree.get_jtree()['cliques']
    opted_cluster = [['DGF'], ['Income', 'TRV'], ['Age'], ['Height', 'HTN'],
                     ['Weight', 'HTN']]
    combined_queries = self.combine_cliques_for_query(cliques, opted_cluster)
    stats_func = StatsFunctions()

    # Raw attribute domains; the long consecutive runs are written as ranges.
    domain = collections.OrderedDict([
        ('Age', list(range(22, 86))),
        ('Height', list(range(137, 201))),
        ('Weight', list(range(45, 109))),
        ('Income', list(range(20, 141))),
        ('TRV', [0] + list(range(2, 61))),  # note: 1 is absent from this domain
        ('HTN', [0, 1]),
        ('DGF', [0, 1]),
    ])
    data1 = DataUtils(file_path=TESTING_FILE)
    histogramdds = stats_func.histogramdd_batch(data1, combined_queries)
    self.inference = Inference(data1, JTREE_TEST_FILE, domain, opted_cluster,
                               histogramdds, 0.2)

    # Parsed (zero-based) domains for the pre-parsed test file.
    domain_parsed = collections.OrderedDict([
        ('Age', list(range(64))),
        ('Height', list(range(64))),
        ('Weight', list(range(64))),
        ('Income', list(range(121))),
        ('TRV', list(range(60))),
        ('HTN', [0, 1]),
        ('DGF', [0, 1]),
    ])
    data2 = DataUtils(file_path=TEST_PARSED_FILE)
    histogramdds = stats_func.histogramdd_batch(data2, combined_queries)
    self.inference_parsed = Inference(data2, JTREE_TEST_FILE, domain_parsed,
                                      opted_cluster, histogramdds, 0.2)
def setUp(self):
    self.data = DataUtils(TESTING_FILE)
    self.dep_graph = DependencyGraph(self.data)
    self.edges = self.dep_graph.get_dep_edges()
    self.nodes = self.data.get_nodes_name()
    self.jtree_path = c.TEST_JTREE_FILE_PATH
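# The two fixtures above imply a pipeline from raw data to junction-tree
# cliques. A hedged sketch stitching them together; everything here is drawn
# from those fixtures, but how the edges/nodes feed real (non-test) runs is
# an assumption:
#
#     data = DataUtils(TESTING_FILE)
#     dep_graph = DependencyGraph(data)
#     edges = dep_graph.get_dep_edges()
#     nodes = data.get_nodes_name()
#     jtree = JunctionTree(edges, nodes)
#     cliques = jtree.get_jtree()['cliques']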
def setUp(self):
    self.data = DataUtils(TESTING_FILE)
def data_generalize(self, dataframe, valbin_map, selected_attrs):
    data = DataUtils(pandas_df=dataframe,
                     valbin_maps=valbin_map,
                     selected_attrs=selected_attrs)
    data.data_generalize()
    return data.get_pandas_df()
def test_data_preview(self):
    data = DataUtils(file_path=TESTING_FILE)
    preview = data.data_preview()
    self.assertGreater(len(preview.values[0]), 0)
def data_generalize(self):
    data = DataUtils(pandas_df=self.sim_df,
                     valbin_maps=self.valbin_map,
                     selected_attrs=self.selected_attrs)
    data.data_generalize()
    self.generalized_dataframe = data.get_pandas_df()
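# A hedged sketch of the coarsen/generalize round trip that the snippets above
# suggest: data_coarsilize bins the continuous ('C') attributes, and
# data_generalize maps binned values back using a valbin map. Where the valbin
# map is read off the coarsened DataUtils is an assumption (the snippets only
# ever show it being passed in), and sim_df stands for a synthetic dataframe
# produced in between:
#
#     data = DataUtils(file_path=TESTING_FILE, selected_attrs=selected_attrs)
#     data.data_coarsilize()                  # bin continuous attributes
#     # ... generate a synthetic coarse dataframe sim_df ...
#     sim = DataUtils(pandas_df=sim_df,
#                     valbin_maps=valbin_map,  # map captured at coarsening time
#                     selected_attrs=selected_attrs)
#     sim.data_generalize()                   # map bins back to raw values
#     synthetic_df = sim.get_pandas_df()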