예제 #1
0
 def read_data(self):
     """Load the configured data file into ``self.data``.

     All parsing options previously stored on this object are forwarded
     verbatim to ``DataUtils``.
     """
     self.data = DataUtils(
         file_path=self.data_path,
         selected_attrs=self.selected_attrs,
         names=self.names,
         specified_c_domain=self.specified_c_domain,
         chunk_size=self.chunk_size,
         date_format=self.date_format,
     )
예제 #2
0
 def get_coarse_data(self):
     """Load this task's coarsened intermediate data into ``self.data``.

     TODO: read the coarse data from a memory cache instead of disk.
     TODO: handle read failures.
     """
     task_folder = c.MEDIATE_DATA_DIR % {'task_id': self.task_id}
     coarse_path = os.path.join(task_folder, c.COARSE_DATA_NAME)
     self.data = DataUtils(
         file_path=coarse_path,
         valbin_maps=self.valbin_map,
         selected_attrs=self.selected_attrs,
     )
    def test_get_query_count_with_same_results_cnt(self):
        """Query counts over the test data must match the known values."""
        data = DataUtils(file_path=TEST_DATA_PATH)
        df = data.get_pandas_df()
        result_cnt = [
            self.user_query.get_query_count(df, query)
            for query in self.queries
        ]

        # Compare the lists directly: on failure unittest then reports a
        # per-element diff instead of the opaque "False != True" produced
        # by the original assertEqual(<bool>, True) pattern.
        self.assertEqual(result_cnt, [1278, 24339, 24339, 41415])
예제 #4
0
 def get_coarse_data(self, task):
     """Return a ``DataUtils`` over the coarse data stored for *task*.

     TODO: read the coarse data from a memory cache instead of disk.
     TODO: handle read failures.
     """
     folder = c.MEDIATE_DATA_DIR % {'task_id': task.task_id}
     file_path = os.path.join(folder, c.COARSE_DATA_NAME)
     # The value-bin map is persisted as a Python-literal string.
     valbin = ast.literal_eval(task.valbin_map)
     selected = self.convert_selected_attrs(task.selected_attrs)
     return DataUtils(file_path=file_path,
                      valbin_maps=valbin,
                      selected_attrs=selected)
예제 #5
0
    def __init__(self, sensitive_data):
        """Import the original data and initialize the utility-measurement
        object.

        Parameters
        ----------
        sensitive_data : string
            Path to the original (sensitive) data file.
        """
        self.LOG = Base.get_logger("UserQuery")
        # Keep only the dataframe; the DataUtils wrapper is not needed later.
        self.sensitive_df = DataUtils(file_path=sensitive_data).get_pandas_df()
예제 #6
0
    def get_errors(self, synthetic_data, user_queries):
        """Find the errors of the given queries between sensitive data and
        synthetic data.

        Parameters
        ----------
        synthetic_data: string
            The path to the synthetic data.
        user_queries: list
            The list of user queries.

        Returns
        -------
        results: list
            One entry per query: the relative error |c1 - c2| / c1 between
            the sensitive count c1 and synthetic count c2, the string 'inf'
            when the sensitive count is zero, or the exception text when the
            query itself failed.
        """

        def get_one_error(df1, df2, query):
            import time
            t0 = time.time()
            try:
                len_df1_result = self.get_query_count(df1, query)
                # BUG FIX: the synthetic count must be taken from df2. The
                # original queried df1 twice (df2 was never used), so every
                # reported error was 0.
                len_df2_result = self.get_query_count(df2, query)
            except Exception as e:
                # Surface the failure per-query instead of aborting the batch.
                return str(e)

            if len_df1_result == 0:
                # Relative error is undefined for a zero sensitive count.
                # (Fixed the original's 'inif' typo.)
                return 'inf'
            self.LOG.info("User query error measurement spends: %d seconds" %
                          (time.time() - t0))

            return np.abs(len_df1_result - len_df2_result) / len_df1_result

        # Import the synthetic data as a dataframe once, then evaluate every
        # query against the (sensitive, synthetic) pair.
        synthetic = DataUtils(file_path=synthetic_data)
        synthetic_df = synthetic.get_pandas_df()
        results = [
            get_one_error(self.sensitive_df, synthetic_df, query)
            for query in user_queries
        ]
        return results
 def setUp(self):
     """Load the origin test data, coarsen it, and build the stats helper."""
     # "C" marks continuous attributes, "D" discrete ones; dict() over an
     # ordered pair list keeps the original insertion order intact.
     attr_kinds = dict([
         ("Age", "C"), ("workclass", "D"), ("fnlwgt", "C"),
         ("education", "D"), ("education_num", "D"),
         ("marital_status", "D"), ("occupation", "D"),
         ("relationship", "D"), ("race", "D"), ("sex", "D"),
         ("capital_gain", "C"), ("capital_loss", "C"),
         ("hours_per_week", "C"), ("native_country", "D"),
         ("salary_class", "D"),
     ])
     self.data = DataUtils(c.TEST_ORIGIN_DATA_PATH,
                           selected_attrs=attr_kinds)
     self.data.data_coarsilize()
     self.stats_funcs = StatsFunctions()
예제 #8
0
    def setUp(self):
        """Load and coarsen the testing data shared by these tests."""
        # "C" marks continuous attributes, "D" discrete ones.
        # (Dropped the redundant dict(...) call wrapping a dict literal.)
        self.selected_attrs = {
            'Age': 'C',
            'workclass': 'D',
            'fnlwgt': 'C',
            'education': 'D',
            'education_num': 'D',
            'marital_status': 'D',
            'occupation': 'D',
            'relationship': 'D',
            'race': 'D',
            'sex': 'D',
            'capital_gain': 'C',
            'capital_loss': 'C',
            'hours_per_week': 'C',
            'native_country': 'D',
            'salary_class': 'D'
        }
        self.data = DataUtils(file_path=TESTING_FILE,
                              selected_attrs=self.selected_attrs)
        self.data.data_coarsilize()

        self.base = Base()
    def setUp(self):
        """Build two Inference fixtures: one over the raw test data and one
        over the pre-parsed (re-indexed) test data."""
        nodes = ['Age', 'Height', 'Weight', 'Income', 'TRV', 'HTN', 'DGF']
        edges = [['Height', 'HTN'], ['Weight', 'HTN'], ['Income', 'TRV']]
        jtree = JunctionTree(edges, nodes)

        cliques = jtree.get_jtree()['cliques']
        opted_cluster = [['DGF'], ['Income', 'TRV'], ['Age'],
                         ['Height', 'HTN'], ['Weight', 'HTN']]
        combined_queries = self.combine_cliques_for_query(
            cliques, opted_cluster)
        stats_func = StatsFunctions()

        # Attribute domains written as ranges instead of hand-maintained
        # integer lists (identical contents, far harder to mistype).
        # NOTE: the TRV domain genuinely skips the value 1.
        domain = collections.OrderedDict([
            ('Age', list(range(22, 86))),
            ('Height', list(range(137, 201))),
            ('Weight', list(range(45, 109))),
            ('Income', list(range(20, 141))),
            ('TRV', [0] + list(range(2, 61))),
            ('HTN', [0, 1]),
            ('DGF', [0, 1]),
        ])

        data1 = DataUtils(file_path=TESTING_FILE)
        histogramdds = stats_func.histogramdd_batch(data1, combined_queries)
        self.inference = Inference(data1, JTREE_TEST_FILE, domain,
                                   opted_cluster, histogramdds, 0.2)

        # The parsed file re-indexes every attribute from 0; the domain
        # sizes match the raw domains above (64/64/64/121/60 values).
        domain_parsed = collections.OrderedDict([
            ('Age', list(range(64))),
            ('Height', list(range(64))),
            ('Weight', list(range(64))),
            ('Income', list(range(121))),
            ('TRV', list(range(60))),
            ('HTN', [0, 1]),
            ('DGF', [0, 1]),
        ])

        data2 = DataUtils(file_path=TEST_PARSED_FILE)
        histogramdds = stats_func.histogramdd_batch(data2, combined_queries)
        self.inference_parsed = Inference(data2, JTREE_TEST_FILE,
                                          domain_parsed, opted_cluster,
                                          histogramdds, 0.2)
 def setUp(self):
     """Load the test data and derive its dependency-graph fixtures."""
     self.jtree_path = c.TEST_JTREE_FILE_PATH
     self.data = DataUtils(TESTING_FILE)
     self.nodes = self.data.get_nodes_name()
     self.dep_graph = DependencyGraph(self.data)
     self.edges = self.dep_graph.get_dep_edges()
 def setUp(self):
     """Give each test a fresh DataUtils over the shared testing file."""
     self.data = DataUtils(file_path=TESTING_FILE)
예제 #12
0
 def data_generalize(self, dataframe, valbin_map, selected_attrs):
     """Generalize *dataframe* back to original values via *valbin_map*.

     Returns the generalized pandas dataframe.
     """
     generalizer = DataUtils(pandas_df=dataframe,
                             valbin_maps=valbin_map,
                             selected_attrs=selected_attrs)
     generalizer.data_generalize()
     return generalizer.get_pandas_df()
예제 #13
0
 def test_data_preview(self):
     """A preview of the testing file must yield at least one value."""
     data = DataUtils(file_path=TESTING_FILE)
     preview = data.data_preview()
     # assertGreater reports the actual length on failure, unlike the
     # original assertEqual(<bool>, True) pattern.
     self.assertGreater(len(preview.values[0]), 0)
예제 #14
0
 def data_generalize(self):
     """Map the simulated dataframe back to original values and store the
     result in ``self.generalized_dataframe``."""
     generalizer = DataUtils(pandas_df=self.sim_df,
                             valbin_maps=self.valbin_map,
                             selected_attrs=self.selected_attrs)
     generalizer.data_generalize()
     self.generalized_dataframe = generalizer.get_pandas_df()