Example #1
    def _generate_histogram_neighbors(self, fD1, fD2, ep: EvaluatorParams):
        """
        Generate histograms given the vectors of repeated aggregation results
        applied on neighboring datasets
        """
        fD1 = np.asarray(fD1, dtype="float64")
        fD2 = np.asarray(fD2, dtype="float64")
        d = np.concatenate((fD1, fD2), axis=None)
        n = len(fD1)
        binlist = []
        minval = min(min(fD1), min(fD2))
        maxval = max(max(fD1), max(fD2))

        # Deciding bin width and bin list
        if ep.exact:
            binlist = np.linspace(minval, maxval, 2)
        elif ep.numbins > 0:
            binlist = np.linspace(minval, maxval, ep.numbins)
        elif ep.binsize == "auto":
            iqr = np.subtract(*np.percentile(d, [75, 25]))
            numerator = 2 * iqr if iqr > 0 else maxval - minval
            denominator = n**(1.0 / 3)
            binwidth = numerator / denominator  # Freedman–Diaconis' choice
            ep.numbins = int(math.ceil(
                (maxval - minval) / binwidth)) if maxval > minval else 20
            binlist = np.linspace(minval, maxval, ep.numbins)
        else:
            # Choose a unit bin size; extend the top edge so maxval falls inside the last bin
            binlist = np.arange(np.floor(minval), np.ceil(maxval) + 1)

        # Calculating histograms of fD1 and fD2
        fD1hist, bin_edges = np.histogram(fD1, bins=binlist, density=False)
        fD2hist, bin_edges = np.histogram(fD2, bins=binlist, density=False)

        return fD1hist, fD2hist, bin_edges
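The bin-width logic above follows the Freedman–Diaconis rule when binsize is "auto". Below is a minimal standalone sketch of that binning, assuming only NumPy; the helper name fd_binlist and the Laplace-distributed sample vectors are illustrative, not part of the evaluator API.

import math
import numpy as np

def fd_binlist(fD1, fD2):
    # Common bin edges for two samples via the Freedman-Diaconis rule.
    d = np.concatenate((fD1, fD2), axis=None)
    n = len(fD1)
    minval, maxval = d.min(), d.max()
    iqr = np.subtract(*np.percentile(d, [75, 25]))
    numerator = 2 * iqr if iqr > 0 else maxval - minval
    binwidth = numerator / n ** (1.0 / 3)
    numbins = int(math.ceil((maxval - minval) / binwidth)) if maxval > minval else 20
    return np.linspace(minval, maxval, numbins)

rng = np.random.default_rng(0)
fD1 = 100.0 + rng.laplace(0.0, 1.0, size=500)   # e.g. noisy counts released on D1
fD2 = 99.0 + rng.laplace(0.0, 1.0, size=500)    # e.g. noisy counts released on D2 (one row removed)
bins = fd_binlist(fD1, fD2)
fD1hist, bin_edges = np.histogram(fD1, bins=bins, density=False)
fD2hist, _ = np.histogram(fD2, bins=bins, density=False)
print(len(bin_edges), fD1hist.sum(), fD2hist.sum())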
Example #2
 def test_interface_benchmark(self):
     logging.getLogger().setLevel(logging.DEBUG)
     lib = DPSampleLibrary()
     pa = DPSample()
     epsilon_list = [0.001, 0.5, 1.0, 2.0, 4.0]
     pp = PrivacyParams(epsilon=1.0)
     ev = EvaluatorParams(repeat_count=500)
     # Creating neighboring datasets
     d1 = pd.DataFrame(random.sample(range(1, 1000), 100),
                       columns=['Usage'])
     drop_idx = np.random.choice(d1.index, 1, replace=False)
     d2 = d1.drop(drop_idx)
     benchmarking = DPBenchmarking()
     # Preparing benchmarking params
     pa_algorithms = {pa: [lib.dp_count]}
     privacy_params_list = []
     for epsilon in epsilon_list:
         pp = PrivacyParams()
         pp.epsilon = epsilon
         privacy_params_list.append(pp)
     d1_d2_list = [[d1, d2]]
     benchmark_params = BenchmarkParams(pa_algorithms, privacy_params_list,
                                        d1_d2_list, ev)
     benchmark_metrics_list = benchmarking.benchmark(benchmark_params)
     for bm in benchmark_metrics_list:
         for key, metrics in bm.key_metrics.items():
             test_logger.debug("Epsilon: " + str(bm.privacy_params.epsilon) + \
                 " MSE:" + str(metrics.mse) + \
                 " Privacy Test: " + str(metrics.dp_res))
             assert (metrics.dp_res == True)
     assert (len(benchmark_metrics_list) == 5)
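The point of sweeping epsilon_list is that the error of a Laplace count release shrinks as epsilon grows, which is what the benchmark's MSE values surface. Below is a toy sketch of that tradeoff using plain NumPy Laplace noise as a stand-in for lib.dp_count; the true_count value and the scale formula assume a sensitivity-1 count query.

import numpy as np

rng = np.random.default_rng(42)
true_count = 100
repeat_count = 500

for epsilon in [0.001, 0.5, 1.0, 2.0, 4.0]:
    scale = 1.0 / epsilon                      # Laplace scale b = sensitivity / epsilon
    releases = true_count + rng.laplace(0.0, scale, size=repeat_count)
    mse = np.mean((releases - true_count) ** 2)
    print(f"epsilon={epsilon:<6} empirical MSE={mse:14.2f}  theoretical 2/eps^2={2 / epsilon ** 2:14.2f}")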
Example #3
    def test_interface_algorithm(self):
        logging.getLogger().setLevel(logging.DEBUG)
        lib = DPSampleLibrary()
        dv = DPSample()
        pp = PrivacyParams(epsilon=1.0)
        ev = EvaluatorParams(repeat_count=500)
        df = pd.DataFrame(random.sample(range(1, 1000), 100),
                          columns=['Usage'])

        # Preparing and releasing from the Sample DP algorithm to send noisy results to the evaluator
        dv.prepare(lib.dp_count, pp, ev)
        report = dv.release(df)

        # Test DP response from interface
        assert (isinstance(report.res, dict))
        assert (len(report.res) > 0)
        firstkey = list(report.res.keys())[0]
        test_logger.debug("First key name is:" + str(firstkey))
        test_logger.debug("Repeated noisy count responses: " +
                          str(report.res[firstkey]))
        assert (isinstance(firstkey, str))
        assert (len(report.res[firstkey]) == ev.repeat_count)

        # Test that the non-DP (i.e. actual) response from the interface is a single numeric value
        report = dv.actual_release(df)
        test_logger.debug("Actual count response: " +
                          str(report.res[firstkey]))

        assert (isinstance(report.res[firstkey], (int, float)))
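For the DP release, report.res maps a key name to repeat_count noisy values, while actual_release returns a single number per key. Here is a small sanity-check sketch on simulated values; the noisy_responses below are generated with NumPy rather than by DPSample, assuming a sensitivity-1 count at epsilon=1.

import numpy as np

rng = np.random.default_rng(7)
actual = 100.0
noisy_responses = actual + rng.laplace(0.0, 1.0, size=500)   # stand-in for report.res[firstkey]

bias = float(np.mean(noisy_responses) - actual)   # near 0 for an unbiased mechanism
spread = float(np.std(noisy_responses))           # near sqrt(2) * scale for Laplace noise
print(f"bias={bias:.3f} std={spread:.3f}")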
Example #4
 def test_interface_multikey(self):
     logging.getLogger().setLevel(logging.DEBUG)
     lib = DPSampleLibrary()
     pa = DPMultiKey()
     metrics = Metrics()
     # Before running the DP test, dp_res should default to False
     # and the distance metrics should be 0
     assert (metrics.dp_res == False)
     assert (metrics.wasserstein_distance == 0.0)
     assert (metrics.jensen_shannon_divergence == 0.0)
     assert (metrics.kl_divergence == 0.0)
     assert (metrics.mse == 0.0)
     assert (metrics.std == 0.0)
     assert (metrics.msd == 0.0)
     pp = PrivacyParams(epsilon=1.0)
     ev = EvaluatorParams(repeat_count=500)
     # Creating neighboring datasets
     col1 = list(range(0, 1000))
     col2 = list(range(-1000, 0))
     d1 = pd.DataFrame(list(zip(col1, col2)), columns=['Col1', 'Col2'])
     drop_idx = np.random.choice(d1.index, 1, replace=False)
     d2 = d1.drop(drop_idx)
     # Call evaluate
     eval = DPEvaluator()
     key_metrics = eval.evaluate(d1, d2, pa, lib.dp_sum, pp, ev)
     # After evaluation, dp_res should be True and the distance metrics should be non-zero
     for key, metrics in key_metrics.items():
         assert (metrics.dp_res == True)
         test_logger.debug("Wasserstein Distance:" +
                           str(metrics.wasserstein_distance))
         test_logger.debug("Jensen Shannon Divergence:" +
                           str(metrics.jensen_shannon_divergence))
         test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
         test_logger.debug("MSE:" + str(metrics.mse))
         test_logger.debug("Standard Deviation:" + str(metrics.std))
         test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
         assert (metrics.wasserstein_distance > 0.0)
         assert (metrics.jensen_shannon_divergence > 0.0)
         assert (metrics.kl_divergence != 0.0)
         assert (metrics.mse > 0.0)
         assert (metrics.std != 0.0)
         assert (metrics.msd != 0.0)
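The distance metrics asserted above can be reproduced on any pair of noisy sample vectors. A minimal sketch assuming SciPy follows, with Laplace-perturbed sums standing in for the outputs of lib.dp_sum; the binning and the 1e-12 smoothing are illustrative choices, not the evaluator's exact implementation.

import numpy as np
from scipy.stats import wasserstein_distance, entropy
from scipy.spatial.distance import jensenshannon

rng = np.random.default_rng(1)
fD1 = 499500.0 + rng.laplace(0.0, 1000.0, size=500)   # noisy sums on d1
fD2 = 498500.0 + rng.laplace(0.0, 1000.0, size=500)   # noisy sums on d2 (one row dropped)

# Histogram both samples over a common bin list before comparing distributions.
bins = np.linspace(min(fD1.min(), fD2.min()), max(fD1.max(), fD2.max()), 50)
p, _ = np.histogram(fD1, bins=bins, density=True)
q, _ = np.histogram(fD2, bins=bins, density=True)
p, q = p + 1e-12, q + 1e-12                            # avoid zero bins in the divergences

print("Wasserstein Distance:", wasserstein_distance(fD1, fD2))
print("Jensen Shannon Divergence:", jensenshannon(p, q))
print("KL Divergence:", entropy(p, q))
print("MSE:", np.mean((fD1 - fD2) ** 2))
print("Mean Signed Deviation:", np.mean(fD1 - fD2))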
Example #5
    def test_interface_count(self):
        logging.getLogger().setLevel(logging.DEBUG)
        # Initialize params and algorithm to benchmark
        pa = DPSingletonQuery()
        pp = PrivacyParams(epsilon=1.0)
        ev = EvaluatorParams(repeat_count=100)
        dd = DatasetParams(dataset_size=500)
        query = "SELECT COUNT(UserId) AS UserCount FROM dataset.dataset"

        # Preparing neighboring datasets
        df, metadata = self.create_simulated_dataset(dd.dataset_size,
                                                     "dataset")
        d1_dataset, d2_dataset, d1_metadata, d2_metadata = self.generate_neighbors(
            df, metadata)
        d1 = PandasReader(d1_dataset, d1_metadata)
        d2 = PandasReader(d2_dataset, d2_metadata)

        # Call evaluate
        eval = DPEvaluator()
        key_metrics = eval.evaluate([d1_metadata, d1], [d2_metadata, d2], pa,
                                    query, pp, ev)
        # After evaluation, dp_res should be True and the distance metrics should be non-zero
        for key, metrics in key_metrics.items():
            assert (metrics.dp_res == True)
            test_logger.debug("Wasserstein Distance:" +
                              str(metrics.wasserstein_distance))
            test_logger.debug("Jensen Shannon Divergence:" +
                              str(metrics.jensen_shannon_divergence))
            test_logger.debug("KL Divergence:" + str(metrics.kl_divergence))
            test_logger.debug("MSE:" + str(metrics.mse))
            test_logger.debug("Standard Deviation:" + str(metrics.std))
            test_logger.debug("Mean Signed Deviation:" + str(metrics.msd))
            assert (metrics.wasserstein_distance > 0.0)
            assert (metrics.jensen_shannon_divergence > 0.0)
            assert (metrics.kl_divergence != 0.0)
            assert (metrics.mse > 0.0)
            assert (metrics.std != 0.0)
            assert (metrics.msd != 0.0)
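The COUNT query above has sensitivity 1: removing a single row from the dataset can change the result by at most one. Below is a pandas-only sketch of the neighboring relationship; the DataFrame is illustrative and not the output of create_simulated_dataset.

import numpy as np
import pandas as pd

d1 = pd.DataFrame({"UserId": np.arange(1, 501)})          # 500 users
drop_idx = np.random.choice(d1.index, 1, replace=False)
d2 = d1.drop(drop_idx)                                     # neighboring dataset: one row removed

count_d1 = int(d1["UserId"].count())
count_d2 = int(d2["UserId"].count())
print(count_d1, count_d2, abs(count_d1 - count_d2))        # 500 499 1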
Example #6
 def __init__(self, learner_params):
     self.lp = learner_params
     self.pp = PrivacyParams(epsilon=1.0)
     self.ev = EvaluatorParams(repeat_count=100)
     self.dd = DatasetParams(dataset_size=500)
Example #7
 def __init__(self):
     self.pp = PrivacyParams(epsilon=1.0)
     self.ev = EvaluatorParams(repeat_count=100)
     self.dd = DatasetParams(dataset_size=500)
     self.pa = DPSingletonQuery()
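Examples #6 and #7 are fixture-style constructors that bundle default evaluation parameters in one place. Here is a hedged sketch of the same pattern with stand-in dataclasses; the names are hypothetical, and the real PrivacyParams, EvaluatorParams and DatasetParams come from the evaluator package and may carry more fields.

from dataclasses import dataclass, field

@dataclass
class _PrivacyParams:
    epsilon: float = 1.0

@dataclass
class _EvaluatorParams:
    repeat_count: int = 100

@dataclass
class _DatasetParams:
    dataset_size: int = 500

@dataclass
class EvalConfig:
    # Bundle the defaults so every test or learner sees the same parameters.
    pp: _PrivacyParams = field(default_factory=_PrivacyParams)
    ev: _EvaluatorParams = field(default_factory=_EvaluatorParams)
    dd: _DatasetParams = field(default_factory=_DatasetParams)

cfg = EvalConfig()
print(cfg.pp.epsilon, cfg.ev.repeat_count, cfg.dd.dataset_size)   # 1.0 100 500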