Ejemplo n.º 1
0
 def test_empty_result_count_typed_notau_prepost(self):
     """Repeatedly run a COUNT(*) query and check the result length.

     The PrivateReader is given a deep copy of the schema in which
     ``censor_dims`` is switched off for PUMS.PUMS. The query is executed
     once up front and then three more times; each of those three results
     must have exactly two entries (presumably the header row plus one
     count row -- confirm against the reader's result format).
     """
     uncensored_schema = copy.deepcopy(schema)
     uncensored_schema['PUMS.PUMS'].censor_dims = False

     pandas_reader = PandasReader(df, schema)
     sql = "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100"
     parsed_query = QueryParser(schema).queries(sql)[0]
     private_reader = PrivateReader(
         pandas_reader,
         uncensored_schema,
         privacy=Privacy(epsilon=1.0),
     )

     # One execution before the checked runs; its result is discarded.
     private_reader._execute_ast(parsed_query, True)

     for _ in range(3):
         print(private_reader._options)
         rows = private_reader._execute_ast(parsed_query, True)
         print("empty query")
         print(rows)
         assert len(rows) == 2
Ejemplo n.º 2
0
 def test_check_thresholds_gauss(self):
     """Check the threshold ``tau`` when the Gaussian mechanism is selected.

     For every combination of epsilon, maximum contribution and delta, a
     PrivateReader is built with ``Stat.threshold`` mapped to
     ``Mechanism.gaussian``, a grouped COUNT(*) is executed, and
     ``private_reader.tau`` is compared against a closed-form value
     within a loose tolerance.
     """
     # check tau for various privacy parameters
     epsilons = [0.1, 2.0]
     max_contribs = [1, 3]
     # NOTE: these literals evaluate to 1e-4 and 1e-14 respectively.
     deltas = [10E-5, 10E-15]
     query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
     reader = PandasReader(df, schema)
     qp = QueryParser(schema)
     q = qp.query(query)
     for eps in epsilons:
         for d in max_contribs:
             for delta in deltas:
                 privacy = Privacy(epsilon=eps, delta=delta)
                 privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                 # using slightly different formulations of same formula from different papers
                 # make sure private_reader round-trips
                 gaus_scale = math.sqrt(d) * math.sqrt(
                     2 * math.log(1.25 / delta)) / eps
                 gaus_rho = 1 + gaus_scale * math.sqrt(
                     2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                 # Deep copy: a shallow copy shares the table objects, so
                 # setting max_ids would modify the module-level schema and
                 # leak into every other test that uses it.
                 schema_c = copy.deepcopy(schema)
                 schema_c["PUMS.PUMS"].max_ids = d
                 private_reader = PrivateReader(reader,
                                                metadata=schema_c,
                                                privacy=privacy)
                 assert private_reader._options.max_contrib == d
                 # Executed before tau is read; the rows themselves are
                 # not inspected.
                 private_reader._execute_ast(q)
                 assert math.isclose(private_reader.tau,
                                     gaus_rho,
                                     rel_tol=0.03,
                                     abs_tol=2)
Ejemplo n.º 3
0
    def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
        """
        Execute an aggregation query repeatedly through a PrivateReader.

        Returns a 4-tuple:
          * numpy array holding, for each of ``self.repeat_count`` private
            runs, the first value of the first row after the header row
          * the same value taken from the plain PandasReader, or 0.0 when
            ``get_exact`` is false
          * a list of low bounds and a list of high bounds, both always
            empty in this implementation

        ``confidence`` is accepted but not used by this method.
        """
        reader = PandasReader(df, metadata)
        # VAR is not supported by the Pandas reader, so callers can turn the
        # exact lookup off; the exact value then stays at 0.0.
        exact_value = reader.execute(query)[1:][0][0] if get_exact else 0.0

        private_reader = PrivateReader(reader,
                                       metadata,
                                       privacy=Privacy(epsilon=self.epsilon))
        query_ast = private_reader.parse_query_string(query)

        # Confidence intervals are not available in the report, so the
        # bound lists are returned empty.
        low_bounds, high_bounds = [], []
        noisy_values = [
            private_reader._execute_ast(query_ast, True)[1:][0][0]
            for _ in range(self.repeat_count)
        ]
        return np.array(noisy_values), exact_value, low_bounds, high_bounds
Ejemplo n.º 4
0
 def release(self, dataset: object) -> Report:
     """
     Run the SQL query stored in ``self.algorithm`` through a PrivateReader.

     ``dataset`` is indexed as a pair; per the original description it
     holds [dataset metadata, PandasReader]. The query is executed
     ``self.eval_params.repeat_count`` times and the first value of the
     first row after the header is collected from each run.

     Returns a Report with the single key "__key__", mapped to the list of
     collected values, or to the string "noisy_values_empty" as soon as a
     run returns nothing after the header row.
     """
     private_reader = PrivateReader(dataset[0], dataset[1],
                                    self.privacy_params.epsilon)
     query_ast = private_reader.parse_query_string(self.algorithm)
     # Run once on the underlying reader; the result is not used below.
     private_reader.reader._execute_ast_df(query_ast)

     noisy_values = []
     for _ in range(self.eval_params.repeat_count):
         data_rows = private_reader._execute_ast(query_ast, True)[1:]
         if not data_rows:
             return Report({"__key__": "noisy_values_empty"})
         noisy_values.append(data_rows[0][0])
     return Report({"__key__": noisy_values})
Ejemplo n.º 5
0
    def run_agg_query_df(self,
                         df,
                         metadata,
                         query,
                         confidence,
                         file_name="d1"):
        """
        Execute a query exactly and privately and return both as DataFrames.

        Output columns whose symbol type is "string" are treated as
        dimensions; all others are treated as numeric measures. The private
        query is run ``self.repeat_count`` times and the rows of every run
        are stacked into one noisy DataFrame.

        Returns ``(noisy_df, exact_df, dim_cols, num_cols)``. When the query
        has no dimension column, a constant "__dim__" column with the value
        "key" is added to both frames and listed in ``dim_cols``.

        ``confidence`` and ``file_name`` are accepted but not used here.
        """
        # Exact answer straight from the Pandas reader (header row dropped)
        reader = PandasReader(df, metadata)
        exact_rows = reader.execute(query)[1:]

        private_reader = PrivateReader(reader,
                                       metadata,
                                       privacy=Privacy(epsilon=self.epsilon))
        query_ast = private_reader.parse_query_string(query)

        # A single private run is used only to obtain the header row
        header_row = private_reader._execute_ast(query_ast, True)[0]

        # Split output columns into dimensions and measures by symbol type
        typed_symbols = [(sym[0], sym[1].type())
                         for sym in query_ast.all_symbols()]
        dim_cols = [name for name, kind in typed_symbols if kind == "string"]
        num_cols = [name for name, kind in typed_symbols
                    if not (kind == "string")]

        # Repeat the private query and accumulate the rows of every run
        noisy_rows = []
        for _ in range(self.repeat_count):
            frame = private_reader._execute_ast_df(query_ast,
                                                   cache_exact=True)
            dim_values = [frame[name].tolist() for name in dim_cols]
            # zip() over a single list wraps each numeric value in a 1-tuple
            num_values = [list(zip(frame[name].tolist()))
                          for name in num_cols]
            noisy_rows.extend(zip(*dim_values, *num_values))

        exact_df = pd.DataFrame(exact_rows, columns=header_row)
        noisy_df = pd.DataFrame(noisy_rows, columns=header_row)

        # Add a dummy dimension column for cases where no dimensions are
        # available for merging D1 and D2
        if not dim_cols:
            dim_cols.append("__dim__")

        dummy_name = dim_cols[0]
        if dummy_name == "__dim__":
            exact_df[dummy_name] = ["key"] * len(exact_df)
            noisy_df[dummy_name] = ["key"] * len(noisy_df)

        return noisy_df, exact_df, dim_cols, num_cols