Example no. 1
0
    def dp_query_test(self,
                      d1_query,
                      d2_query,
                      debug=False,
                      plot=True,
                      bound=True,
                      exact=False,
                      repeat_count=10000,
                      confidence=0.95,
                      get_exact=True):
        """Run the DP predicate, accuracy and bias tests for one query pair.

        Executes ``d1_query``/``d2_query`` ``repeat_count`` times against
        auto-generated neighboring datasets, histograms the noisy responses,
        and applies the DP predicate test, the accuracy/utility test and the
        bias test against the exact (non-private) answer.

        Returns a tuple ``(dp_res, acc_res, utility_res, bias_res)``.
        """
        aggregator = agg.Aggregation(t=1, repeat_count=repeat_count)
        d1, d2, d1_meta, d2_meta = self.generate_neighbors(load_csv=True)

        fD1, fD1_actual, fD1_low, fD1_high = aggregator.run_agg_query(
            d1, d1_meta, d1_query, confidence, get_exact)
        fD2, fD2_actual, fD2_low, fD2_high = aggregator.run_agg_query(
            d2, d2_meta, d2_query, confidence, get_exact)

        # Histogram the repeated noisy responses from both neighbors over a
        # shared set of bin edges so the DP predicate can compare them.
        d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
            fD1, fD2, binsize="auto")
        dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = \
            self.dp_test(d1hist, d2hist, bin_edges, fD1.size, fD2.size, debug)
        acc_res, utility_res, within_bounds = self.accuracy_test(
            fD1_actual, fD1_low, fD1_high, confidence)
        bias_res, msd = self.bias_test(fD1_actual, fD1)

        if plot:
            self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                          d2histupperbound, d1hist, d2hist,
                                          d1lower, d2lower, bin_edges, bound,
                                          exact)
        return dp_res, acc_res, utility_res, bias_res
Example no. 2
0
    def dp_powerset_test(self,
                         query_str,
                         debug=False,
                         plot=True,
                         bound=True,
                         exact=False,
                         repeat_count=10000,
                         confidence=0.95,
                         test_cases=5):
        """Run the DP predicate and bias tests over powersets of small datasets.

        Draws ``test_cases`` Halton-sequence samples, builds a small dataset
        from each, generates the powerset of neighboring table pairs, and runs
        ``query_str`` against every ``d1_``/``d2_`` pair. Per-pair results are
        keyed by ``"[sample] - filename"``.

        Returns ``(dp_res, acc_res, utility_res, bias_res)``; ``acc_res`` and
        ``utility_res`` are ``None`` because the accuracy test is currently
        disabled for powerset runs.
        """
        ag = agg.Aggregation(t=1, repeat_count=repeat_count)
        ex = exp.Exploration()
        res_list = {}
        halton_samples = ex.generate_halton_samples(bounds=ex.corners,
                                                    dims=ex.N,
                                                    n_sample=test_cases)
        # Iterate through each sample generated by the Halton sequence.
        for sample in halton_samples:
            df, metadata = ex.create_small_dataset(sample)
            ex.generate_powerset(df)
            print("Test case: ", list(sample))
            for filename in ex.visited:
                print("Testing: ", filename)
                d1_query = query_str + "d1_" + filename + "." + "d1_" + filename
                d2_query = query_str + "d2_" + filename + "." + "d2_" + filename
                d1, d2, d1_metadata, d2_metadata = ex.neighbor_pair[filename]
                fD1, fD1_actual, fD1_low, fD1_high = ag.run_agg_query(
                    d1, d1_metadata, d1_query, confidence)
                fD2, fD2_actual, fD2_low, fD2_high = ag.run_agg_query(
                    d2, d2_metadata, d2_query, confidence)

                # Accuracy/utility tests are intentionally skipped here; the
                # None placeholders keep the result-row layout stable.
                acc_res, utility_res, within_bounds = None, None, None
                bias_res, msd = self.bias_test(fD1_actual, fD1)
                d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
                    fD1, fD2, binsize="auto")
                d1size, d2size = fD1.size, fD2.size
                dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
                    d1hist, d2hist, bin_edges, d1size, d2size, debug)
                print("DP Predicate Test Result: ", dp_res)
                if plot:
                    self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                                  d2histupperbound, d1hist,
                                                  d2hist, d1lower, d2lower,
                                                  bin_edges, bound, exact)
                key = "[" + ','.join(str(e)
                                     for e in list(sample)) + "] - " + filename
                res_list[key] = [
                    dp_res, acc_res, utility_res, within_bounds, bias_res, msd
                ]

        print("Halton sequence based Powerset Test Result")
        for key, res in res_list.items():
            print(key, "-", res)

        # Aggregate over the values only (keys are not needed here):
        # every neighbor pair must pass for the overall result to pass.
        dp_res = np.all(np.array([res[0] for res in res_list.values()]))
        acc_res, utility_res = None, None
        bias_res = np.all(np.array([res[4] for res in res_list.values()]))
        return dp_res, acc_res, utility_res, bias_res
Example no. 3
0
    def whitenoise_core_test(self,
                             dataset_path,
                             col_names,
                             f,
                             *args,
                             numbins=0,
                             binsize="auto",
                             debug=False,
                             plot=True,
                             bound=True,
                             exact=False,
                             repeat_count=100,
                             epsilon=1.0,
                             actual=1.0,
                             **kwargs):
        """Run the DP predicate and bias tests on a whitenoise-core aggregate.

        Applies the aggregate ``f`` (with extra positional ``args`` and
        keyword ``kwargs``) to the two neighboring CSV files derived from
        ``dataset_path``, then runs the DP predicate test on the response
        histograms and the bias test against ``actual`` (the expected exact
        answer).

        Returns ``(dp_res, bias_res)``.
        """
        ag = agg.Aggregation(t=1, repeat_count=repeat_count)
        self.dataset_path = dataset_path
        d1, d2, d1_metadata, d2_metadata = self.generate_neighbors(
            load_csv=True)

        d1_path = os.path.join(self.file_dir, self.csv_path, "d1.csv")
        d2_path = os.path.join(self.file_dir, self.csv_path, "d2.csv")

        # Exactly three extra positional args selects the multi-argument
        # aggregation entry point; anything else uses the single-arg one.
        if len(args) == 3:
            run_agg = ag.whitenoise_core_dp_multi_agg
        else:
            run_agg = ag.whitenoise_core_dp_agg
        fD1 = run_agg(f, d1_path, col_names, args, epsilon, kwargs)
        fD2 = run_agg(f, d2_path, col_names, args, epsilon, kwargs)

        d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
            fD1, fD2, numbins, binsize, exact=exact)
        dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = \
            self.dp_test(d1hist, d2hist, bin_edges, fD1.size, fD2.size, debug)
        print("DP Predicate Test:", dp_res, "\n")
        bias_res, msd = self.bias_test(actual, fD1)
        print("Bias Test:", bias_res, "\n")

        if plot:
            # NOTE(review): unlike the other test methods, this plot call does
            # not forward ``exact`` — confirm whether that is intentional.
            self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                          d2histupperbound, d1hist, d2hist,
                                          d1lower, d2lower, bin_edges, bound)
        return dp_res, bias_res
Example no. 4
0
    def dp_groupby_query_test(self,
                              d1_query,
                              d2_query,
                              debug=False,
                              plot=True,
                              bound=True,
                              exact=False,
                              repeat_count=10000,
                              confidence=0.95):
        """Run DP predicate, accuracy and bias tests per GROUP BY dimension key.

        Executes the grouped queries repeatedly on neighboring datasets, then
        for every (numerical column, dimension key) combination applies the DP
        predicate test on the response histograms and the accuracy/bias tests
        against the exact answers.

        Returns ``(dp_res, acc_res, utility_res, bias_res)``, each True only
        if every combination passed the corresponding test.
        """
        ag = agg.Aggregation(t=1, repeat_count=repeat_count)
        d1, d2, d1_metadata, d2_metadata = self.generate_neighbors(
            load_csv=True)

        d1_res, d1_exact, dim_cols, num_cols = ag.run_agg_query_df(
            d1, d1_metadata, d1_query, confidence, file_name="d1")
        d2_res, d2_exact, dim_cols, num_cols = ag.run_agg_query_df(
            d2, d2_metadata, d2_query, confidence, file_name="d2")

        res_list = []
        for col in num_cols:
            # Flatten to one row per dimension key, holding the list of
            # repeated-run results for this numerical column.
            d1_gp = d1_res.groupby(dim_cols)[col].apply(list).reset_index(
                name=col)
            d2_gp = d2_res.groupby(dim_cols)[col].apply(list).reset_index(
                name=col)
            exact_gp = d1_exact.groupby(dim_cols)[col].apply(list).reset_index(
                name=col)
            # Full outer join after flattening the results above to one row per dimension key.
            # We cannot be sure if every dimension key has a response in every repeated query run because of tau thresholding.
            # That's why we do a full outer join and flatten whatever vector of results we get for the numerical column across repeat runs.
            # This is what we use for generating the histogram of results for that dimension key.
            d1_d2 = d1_gp.merge(d2_gp, on=dim_cols, how='outer')
            d1_d2 = d1_d2.merge(exact_gp, on=dim_cols, how='left')
            n_cols = len(d1_d2.columns)
            for index, row in d1_d2.iterrows():
                print(d1_d2.iloc[index, :n_cols - 3])
                print("Column: ", col)
                # fD1 and fD2 hold the K repeated noisy query results for this
                # specific dimension key, to be passed to the histogram test.
                fD1 = np.array(
                    [val[0] for val in d1_d2.iloc[index, n_cols - 3]])
                fD2 = np.array(
                    [val[0] for val in d1_d2.iloc[index, n_cols - 2]])
                exact_val = d1_d2.iloc[index, n_cols - 1][0]
                d1hist, d2hist, bin_edges = self.generate_histogram_neighbors(
                    fD1, fD2, binsize="auto")
                d1size, d2size = fD1.size, fD2.size
                dp_res, d1histupperbound, d2histupperbound, d1lower, d2lower = self.dp_test(
                    d1hist, d2hist, bin_edges, d1size, d2size, debug)
                print("DP Predicate Test Result: ", dp_res)

                # Accuracy test: each run's reported confidence interval
                # (low, high) is checked against the exact value.
                low = np.array(
                    [val[1] for val in d1_d2.iloc[index, n_cols - 2]])
                high = np.array(
                    [val[2] for val in d1_d2.iloc[index, n_cols - 2]])
                acc_res, utility_res, within_bounds = self.accuracy_test(
                    exact_val, low, high, confidence)
                bias_res, msd = self.bias_test(exact_val, fD1)
                res_list.append([
                    dp_res, acc_res, utility_res, within_bounds, bias_res, msd
                ])
                if plot:
                    self.plot_histogram_neighbors(fD1, fD2, d1histupperbound,
                                                  d2histupperbound, d1hist,
                                                  d2hist, d1lower, d2lower,
                                                  bin_edges, bound, exact)

        for res in res_list:
            print(res)

        # res_list is always a plain list here; the old
        # `res_list.values() if hasattr(res_list, "values")` guard was dead
        # code (lists never have a `values` attribute) and has been removed.
        # Every (column, dimension-key) combination must pass.
        dp_res = np.all(np.array([res[0] for res in res_list]))
        acc_res = np.all(np.array([res[1] for res in res_list]))
        utility_res = np.all(np.array([res[2] for res in res_list]))
        bias_res = np.all(np.array([res[4] for res in res_list]))
        return dp_res, acc_res, utility_res, bias_res