def actual_release(self, dataset: object):
    """
    Returns the non-private, exact response from the algorithm
    """
    actual_res = {}
    if self.algorithm == wn.dp_mean:
        with wn.Analysis(filter_level="all") as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = wn.mean(data=data,
                          data_lower=float(min(dataset)),
                          data_upper=float(max(dataset)),
                          data_rows=len(dataset),
                          data_columns=1)
            analysis.release()
            actual_res["__key__"] = agg.value
    elif self.algorithm == wn.dp_sum:
        with wn.Analysis(filter_level="all") as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = wn.sum(data=data,
                         data_lower=float(min(dataset)),
                         data_upper=float(max(dataset)),
                         data_rows=len(dataset),
                         data_columns=1)
            analysis.release()
            actual_res["__key__"] = agg.value
    return Report(actual_res)
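# Hedged usage sketch (not from the original source): `ev` stands in for an
# instance of the enclosing evaluator class, configured with wn.dp_mean. It
# only illustrates how the exact baseline above might be obtained for later
# comparison against noisy releases.
#
#     ev.algorithm = wn.dp_mean
#     baseline = ev.actual_release([1., 2., 3., 4., 5.])  # exact mean, no noise added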
def test_histogram():
    import numpy as np

    # establish data information
    data = np.genfromtxt(TEST_CSV_PATH, delimiter=',', names=True)
    education_categories = ["1", "2", "3", "4", "5", "6", "7", "8", "9",
                            "10", "11", "12", "13", "14", "15", "16", "17"]

    income = list(data[:]['income'])
    income_edges = list(range(0, 100_000, 10_000))

    print('actual', np.histogram(income, bins=income_edges)[0])

    with wn.Analysis() as analysis:
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # cast bounds cover the full histogram range
        income = wn.to_int(data['income'], lower=0, upper=100_000)
        sex = wn.to_bool(data['sex'], true_label="1")

        income_histogram = wn.dp_histogram(income,
                                           edges=income_edges,
                                           privacy_usage={'epsilon': 1.})

        analysis.release()

    print("Income histogram Geometric DP release: " + str(income_histogram.value))
def test_properties():
    with wn.Analysis():
        # load data
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # establish data
        age_dt = wn.cast(data['age'], 'FLOAT')

        # ensure data are non-null
        non_null_age_dt = wn.impute(age_dt, distribution='Uniform', lower=0., upper=100.)
        clamped = wn.clamp(age_dt, lower=0., upper=100.)

        # create potential for null data again
        potentially_null_age_dt = non_null_age_dt / 0.

        # print('original properties:\n{0}\n\n'.format(age_dt.properties))
        print('properties after imputation:\n{0}\n\n'.format(non_null_age_dt.nullity))
        print('properties after nan mult:\n{0}\n\n'.format(potentially_null_age_dt.nullity))

        print("lower", clamped.lower)
        print("upper", clamped.upper)
        print("releasable", clamped.releasable)
        # print("props", clamped.properties)
        print("data_type", clamped.data_type)
        print("categories", clamped.categories)
def __init__(self, dataset, priv_budget=10):
    self._analysis = wn.Analysis()
    self._analysis.__enter__()
    if isinstance(dataset, dict):
        self._dataset = wn.Dataset(value=dataset, column_names=dataset.keys())
    elif isinstance(dataset, list):
        self._dataset = wn.Dataset(value=dataset, num_columns=1)
    else:
        raise ValueError("more complex types not yet handled")
    self._filterresult = wn.Dataset(value=[], num_columns=1)
    # mirror the manual __enter__() above; Analysis has no plain exit() method
    self._analysis.__exit__(None, None, None)
    self.priv_budget = priv_budget
    self.priv_used = 0
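# Usage sketch (hypothetical; the enclosing class name is not shown in this
# snippet, so `PrivateDataset` below is an assumed stand-in). It demonstrates
# the two accepted dataset shapes and the initial budget bookkeeping.
#
#     pd_list = PrivateDataset([1., 2., 3.], priv_budget=5)
#     assert pd_list.priv_used == 0
#     pd_dict = PrivateDataset({'age': [20, 30], 'income': [10., 20.]})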
def test_multilayer_analysis(run=True):
    with wn.Analysis() as analysis:
        PUMS = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age = wn.to_float(PUMS['age'])
        sex = wn.to_bool(PUMS['sex'], true_label="TRUE")

        age_clamped = wn.clamp(age, lower=0., upper=150.)
        age_resized = wn.resize(age_clamped, n=1000)

        mean_age = wn.dp_mean(data=wn.to_float(PUMS['race']),
                              privacy_usage={'epsilon': .65},
                              data_lower=0.,
                              data_upper=100.,
                              data_n=500)
        analysis.release()

        sex_plus_22 = wn.add(wn.to_float(sex),
                             22.,
                             left_n=1000,
                             left_lower=0.,
                             left_upper=1.)

        wn.dp_mean(age_resized / 2. + sex_plus_22,
                   privacy_usage={'epsilon': .1},
                   data_lower=mean_age - 5.2,
                   data_upper=102.,
                   data_n=500) + 5.

        wn.dp_variance(data=wn.to_float(PUMS['educ']),
                       privacy_usage={'epsilon': .15},
                       data_n=1000,
                       data_lower=0.,
                       data_upper=12.)

        # wn.dp_moment_raw(
        #     wn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     data_n=1000000,
        #     data_lower=0.,
        #     data_upper=12.,
        #     order=3
        # )
        #
        # wn.dp_covariance(
        #     left=wn.to_float(PUMS['age']),
        #     right=wn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     left_n=1000,
        #     right_n=1000,
        #     left_lower=0.,
        #     left_upper=1.,
        #     right_lower=0.,
        #     right_upper=1.
        # )

    if run:
        analysis.release()

    return analysis
def test_equal():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)

        equality = data[0] == data[1]

        analysis.release()
        assert np.array_equal(equality.value, np.array([True, False, False, True]))
def test_index():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)

        index_0 = data[0]

        analysis.release()
        assert all(a == b for a, b in zip(index_0.value, [True, True, False, False]))
def test_partition():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)[[0, 1]]

        partitioned = wn.partition(data, num_partitions=3)
        analysis.release()

        # print(partitioned.value)
        assert np.array_equal(partitioned.value[0], np.array([[True, True], [True, False]]))
        assert np.array_equal(partitioned.value[1], np.array([[False, True]]))
        assert np.array_equal(partitioned.value[2], np.array([[False, False]]))
def whitenoise_core_dp_multi_agg(self, f, dataset_path, col_names, args, epsilon, kwargs):
    releases = []
    with wn.Analysis() as analysis:
        for _ in range(self.repeat_count):
            df = wn.Dataset(path=dataset_path, column_names=col_names)
            release = f(data=wn.to_float(df[[args[0], args[1]]]),
                        privacy_usage={'epsilon': epsilon},
                        **kwargs)
            releases.append(release)
        analysis.release()
    noisy_values = [release.value[0][0] for release in releases]
    return np.array(noisy_values)
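# Usage sketch (hypothetical): `bench` stands in for an object exposing
# `repeat_count`; the kwargs mirror the dp_covariance arguments used elsewhere
# in these tests. Names here are assumptions for illustration only.
#
#     noisy = bench.whitenoise_core_dp_multi_agg(
#         wn.dp_covariance, TEST_CSV_PATH, test_csv_names,
#         args=['age', 'income'], epsilon=1.0,
#         kwargs={'data_lower': [0., 0.], 'data_upper': [150., 150_000.], 'data_n': 1000})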
def test_dp_count(run=True):
    with wn.Analysis() as analysis:
        dataset_pums = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        count = wn.dp_count(dataset_pums['sex'] == '1',
                            privacy_usage={'epsilon': 0.5})

    if run:
        analysis.release()
        print(count.value)

    return analysis
def test_raw_dataset(run=True):
    with wn.Analysis() as analysis:
        wn.dp_mean(data=wn.Dataset(value=[1., 2., 3., 4., 5.], num_columns=1),
                   privacy_usage={'epsilon': 1},
                   data_lower=0.,
                   data_upper=10.,
                   data_n=10)

    if run:
        analysis.release()

    return analysis
def test_divide():
    with wn.Analysis():
        data_A = wn.Dataset(**generate_synthetic(float, variants=['Random']))

        f_random = data_A['F_Random']
        imputed = wn.impute(f_random, lower=0., upper=10.)
        clamped_nonzero = wn.clamp(imputed, lower=1., upper=10.)
        clamped_zero = wn.clamp(imputed, lower=0., upper=10.)

        # test properties
        assert f_random.nullity
        assert not imputed.nullity
        assert (2. / imputed).nullity
        assert (f_random / imputed).nullity
        assert (2. / clamped_zero).nullity
def test_dp_mean():
    with wn.Analysis():
        data = wn.Dataset(**generate_synthetic(float, variants=['Random']))
        mean = wn.dp_mean(data['F_Random'],
                          # privacy_usage={'epsilon': 0.1},
                          accuracy={'value': .2, 'alpha': .05},
                          data_lower=0.,
                          data_upper=10.,
                          data_n=10)

        print("accuracy", mean.get_accuracy(0.05))
        print(mean.from_accuracy(2.3, .05))
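# A minimal companion sketch (an assumption, mirroring the commented-out
# epsilon above): the same dp_mean specified by privacy usage instead of by a
# target accuracy, then converted back to an accuracy estimate at alpha=.05.
def example_dp_mean_by_epsilon():
    with wn.Analysis():
        data = wn.Dataset(**generate_synthetic(float, variants=['Random']))
        mean = wn.dp_mean(data['F_Random'],
                          privacy_usage={'epsilon': 0.1},
                          data_lower=0.,
                          data_upper=10.,
                          data_n=10)
        # translate the chosen epsilon into an accuracy estimate at alpha=.05
        print("accuracy at alpha=.05:", mean.get_accuracy(0.05))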
def test_covariance():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    data = np.genfromtxt(TEST_CSV_PATH, delimiter=',', names=True)

    with wn.Analysis() as analysis:
        wn_data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # get full covariance matrix
        cov = wn.dp_covariance(data=wn.to_float(wn_data['age', 'sex', 'educ', 'income', 'married']),
                               privacy_usage={'epsilon': 10},
                               data_lower=[0., 0., 1., 0., 0.],
                               data_upper=[100., 1., 16., 500_000., 1.],
                               data_n=1000)
        analysis.release()

    # store DP covariance and correlation matrix
    dp_cov = cov.value
    print(dp_cov)
    dp_corr = dp_cov / np.outer(np.sqrt(np.diag(dp_cov)), np.sqrt(np.diag(dp_cov)))

    # get non-DP covariance/correlation matrices
    age = list(data[:]['age'])
    sex = list(data[:]['sex'])
    educ = list(data[:]['educ'])
    income = list(data[:]['income'])
    married = list(data[:]['married'])
    non_dp_cov = np.cov([age, sex, educ, income, married])
    non_dp_corr = non_dp_cov / np.outer(np.sqrt(np.diag(non_dp_cov)), np.sqrt(np.diag(non_dp_cov)))

    print('Non-DP Covariance Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_cov)))
    print('Non-DP Correlation Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_corr)))
    print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

    # skip plot step if IS_CI_BUILD
    if IS_CI_BUILD:
        return

    plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
    plt.colorbar()
    plt.show()
def release(self, dataset: object) -> Report:
    """
    Releases a report by applying the OpenDP Core algorithm to the dataset,
    or returns the actual (non-private) report if `actual` is set to True
    """
    noisy_res = {"__key__": []}
    # repeat the analysis multiple times to collect enough samples for evaluation
    for _ in range(self.eval_params.repeat_count):
        with wn.Analysis() as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = self.algorithm(data=data,
                                 privacy_usage={'epsilon': self.privacy_params.epsilon},
                                 data_lower=float(min(dataset)),
                                 data_upper=float(max(dataset)),
                                 data_rows=len(dataset),
                                 data_columns=1)
            analysis.release()
            noisy_res["__key__"].append(agg.value)
    return Report(noisy_res)
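# Hedged evaluation helper (not part of the original class): given the exact
# value from actual_release and the repeated samples collected by release, a
# simple utility metric is the mean absolute error of the noisy responses.
def example_mean_absolute_error(exact_value, noisy_values):
    # average distance between each noisy release and the exact answer
    return sum(abs(v - exact_value) for v in noisy_values) / len(noisy_values)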
def test_insertion_simple():
    """
    Conduct a differentially private analysis with values inserted from other systems
    :return:
    """
    with wn.Analysis() as analysis:
        # construct a fake dataset that describes your actual data (will never be run)
        data = wn.Dataset(path="", column_names=["A", "B", "C", "D"])

        # pull a column out
        col_a = wn.to_float(data['A'])

        # describe the preprocessing you actually perform on the data
        col_a_clamped = wn.impute(wn.clamp(col_a, lower=0., upper=10.))
        col_a_resized = wn.resize(col_a_clamped, n=1000000)

        # run a fake aggregation
        actual_mean = wn.mean(col_a_resized)

        # insert aggregated data from an external system
        actual_mean.set(10)

        # describe the differentially private operation
        gaussian_mean = wn.gaussian_mechanism(actual_mean,
                                              privacy_usage={"epsilon": .4, "delta": 1e-6})

        # check if the analysis is permissible
        analysis.validate()

        # compute the missing releasable nodes; in this case, only the gaussian mean
        analysis.release()

        # retrieve the noised mean
        print("gaussian mean", gaussian_mean.value)

        # release a couple other statistics using other mechanisms in the same batch
        actual_sum = wn.sum(col_a_clamped)
        actual_sum.set(123456)
        laplace_sum = wn.laplace_mechanism(actual_sum, privacy_usage={"epsilon": .1})

        actual_count = wn.count(col_a)
        actual_count.set(9876)
        geo_count = wn.simple_geometric_mechanism(actual_count, 0, 10000,
                                                  privacy_usage={"epsilon": .1})

        analysis.release()

        print("laplace sum", laplace_sum.value)
        print("geometric count", geo_count.value)

        actual_histogram_b = wn.histogram(wn.clamp(data['B'],
                                                   categories=['X', 'Y', 'Z'],
                                                   null_value="W"))
        actual_histogram_b.set([12, 1280, 2345, 12])
        geo_histogram_b = wn.simple_geometric_mechanism(actual_histogram_b, 0, 10000,
                                                        privacy_usage={"epsilon": .1})

        col_c = wn.to_bool(data['C'], true_label="T")
        actual_histogram_c = wn.histogram(col_c)
        actual_histogram_c.set([5000, 5000])
        lap_histogram_c = wn.laplace_mechanism(actual_histogram_c,
                                               privacy_usage={"epsilon": .1})

        analysis.release()

        print("noised histogram b", geo_histogram_b.value)
        print("noised histogram c", lap_histogram_c.value)

        print("C dimensionality", col_c.dimensionality)
        print("C categories", col_c.categories)

        # multicolumnar insertion
        # pull a column out
        col_rest = wn.to_float(data[['C', 'D']])

        # describe the preprocessing you actually perform on the data
        col_rest_resized = wn.resize(wn.impute(wn.clamp(col_rest,
                                                        lower=[0., 5.],
                                                        upper=1000.)),
                                     n=10000)

        # run a fake aggregation
        actual_mean = wn.mean(col_rest_resized)

        # insert aggregated data from an external system
        actual_mean.set([[10., 12.]])

        # describe the differentially private operation
        gaussian_mean = wn.gaussian_mechanism(actual_mean,
                                              privacy_usage={"epsilon": .4, "delta": 1e-6})

        # check if the analysis is permissible
        analysis.validate()

        # compute the missing releasable nodes; in this case, only the gaussian mean
        analysis.release()

        # retrieve the noised mean
        print("rest gaussian mean", gaussian_mean.value)
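# Condensed insertion sketch (an illustration assuming the same API as the
# test above): the three essential steps are (1) describe a fake aggregate,
# (2) set() it to the externally computed value, (3) noise and release it.
def example_insert_external_count():
    with wn.Analysis() as analysis:
        data = wn.Dataset(path="", column_names=["A"])
        actual_count = wn.count(wn.to_float(data['A']))
        actual_count.set(12345)  # value computed by an external system
        noised = wn.simple_geometric_mechanism(actual_count, 0, 100000,
                                               privacy_usage={"epsilon": .5})
        analysis.release()
        return noised.value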
def create_dicts(data, non_income_data, plausible_variable_combinations):
    count_dict = dict()
    priv_count_dict = dict()
    mean_income_dict = dict()
    priv_mean_income_dict = dict()
    median_income_dict = dict()
    priv_median_income_dict = dict()
    min_income_dict = dict()
    priv_min_income_dict = dict()
    max_income_dict = dict()
    priv_max_income_dict = dict()

    # get number of data elements with each set of variable values
    for i, combination in enumerate(plausible_variable_combinations):
        # print('run {0} of {1}'.format(i+1, len(plausible_variable_combinations)))

        # combine the per-variable conditions, whatever the combination length
        # (replaces explicit `&` chains for lengths 1 through 5)
        mask = np.logical_and.reduce([non_income_data[var] == 1 for var in combination])
        dt = data[mask]

        key = '__'.join(combination)
        count_dict[key] = dt.shape[0]
        mean_income_dict[key] = np.mean(dt['income'])
        median_income_dict[key] = np.median(dt['income'])
        min_income_dict[key] = np.min(dt['income'])
        max_income_dict[key] = np.max(dt['income'])

        with wn.Analysis() as analysis:
            # load data
            priv_data = wn.Dataset(value=list(dt['income']), num_columns=1)

            # estimate sample size
            count = wn.dp_count(data=wn.cast(priv_data, 'FLOAT'),
                                privacy_usage={'epsilon': .05},
                                lower=0,
                                upper=1000)
            analysis.release()

        priv_count_dict[key] = max(0, count.value)

        with wn.Analysis() as analysis:
            # load data
            priv_data = wn.Dataset(value=list(dt['income']), num_columns=1)

            # get mean
            mean = wn.dp_mean(data=wn.cast(priv_data, 'FLOAT'),
                              privacy_usage={'epsilon': 0.1},
                              data_lower=0.,
                              data_upper=100_000.,
                              data_n=max(1, count.value))

            # get median
            median = wn.dp_median(data=wn.cast(priv_data, 'FLOAT'),
                                  privacy_usage={'epsilon': 0.1},
                                  data_lower=0.,
                                  data_upper=100_000.,
                                  data_n=max(1, count.value))

            # get min
            _min = wn.dp_minimum(data=wn.cast(priv_data, 'FLOAT'),
                                 privacy_usage={'epsilon': 0.1},
                                 data_lower=0.,
                                 data_upper=100_000.,
                                 data_n=max(1, count.value))

            # get max
            _max = wn.dp_maximum(data=wn.cast(priv_data, 'FLOAT'),
                                 privacy_usage={'epsilon': 0.1},
                                 data_lower=0.,
                                 data_upper=100_000.,
                                 data_n=max(1, count.value))
            analysis.release()

        # clamp released values to the plausible [0, 100_000] income range
        priv_mean_income_dict[key] = min(max(0, mean.value), 100_000)
        priv_median_income_dict[key] = min(max(0, median.value), 100_000)
        priv_min_income_dict[key] = min(max(0, _min.value), 100_000)
        priv_max_income_dict[key] = min(max(0, _max.value), 100_000)

    return (count_dict, priv_count_dict,
            mean_income_dict, priv_mean_income_dict,
            median_income_dict, priv_median_income_dict,
            min_income_dict, priv_min_income_dict,
            max_income_dict, priv_max_income_dict)
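# Hedged note (illustration, not original code): the min(max(...)) clamping in
# create_dicts is post-processing of already-released values, so it consumes no
# additional privacy budget. A small helper makes that intent explicit.
def clip_release(value, lower=0, upper=100_000):
    # clamp a released value into its plausible range (pure post-processing)
    return min(max(lower, value), upper)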
def __get_dp_noise_dataset(self):
    df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()
    df.to_csv('tmp.csv', index=False)
    return wn.Dataset(path='tmp.csv', column_names=self.get_columns()), len(df.index)
def test_everything(run=True):
    with wn.Analysis(dynamic=True) as analysis:
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age_int = wn.to_int(data['age'], 0, 150)
        sex = wn.to_bool(data['sex'], "1")
        educ = wn.to_float(data['educ'])
        race = data['race']
        income = wn.to_float(data['income'])
        married = wn.to_bool(data['married'], "1")

        numerics = wn.to_float(data[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (sex + "a").component_id)

        # broadcast scalar over 2d, broadcast scalar over 1d, columnar broadcasting, left and right mul
        numerics * 2. + 2. * educ

        # add different values for each column
        numerics + [[1., 2.]]

        # index into first column
        age = numerics[0]
        income = numerics[[False, True]]

        # boolean ops and broadcasting
        mask = sex & married | (~married ^ False) | (age > 50.) | (age_int == 25)

        # numerical clamping
        wn.clamp(numerics, 0., [150., 150_000.])
        wn.clamp(data['educ'], categories=[str(i) for i in range(8, 10)], null_value="-1")

        wn.count(mask)
        wn.covariance(age, income)
        wn.digitize(educ, edges=[1., 3., 10.], null_value=-1)

        # checks for safety against division by zero
        income / 2.
        income / wn.clamp(educ, 5., 20.)

        wn.dp_count(data, privacy_usage={"epsilon": 0.5})
        wn.dp_count(mask, privacy_usage={"epsilon": 0.5})

        wn.dp_histogram(mask, privacy_usage={"epsilon": 0.5})
        age = wn.impute(wn.clamp(age, 0., 150.))
        wn.dp_maximum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_minimum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_median(age, privacy_usage={"epsilon": 0.5})

        age_n = wn.resize(age, n=800)
        wn.dp_mean(age_n, privacy_usage={"epsilon": 0.5})
        wn.dp_moment_raw(age_n, order=3, privacy_usage={"epsilon": 0.5})

        wn.dp_sum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_variance(age_n, privacy_usage={"epsilon": 0.5})

        wn.filter(income, mask)
        race_histogram = wn.histogram(race, categories=["1", "2", "3"], null_value="3")
        wn.histogram(income, edges=[0., 10000., 50000.], null_value=-1)

        wn.dp_histogram(married, privacy_usage={"epsilon": 0.5})

        wn.gaussian_mechanism(race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})
        wn.laplace_mechanism(race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})

        wn.kth_raw_sample_moment(educ, k=3)

        wn.log(wn.clamp(educ, 0.001, 50.))
        wn.maximum(educ)
        wn.mean(educ)
        wn.minimum(educ)

        educ % 2.
        educ ** 2.

        wn.quantile(educ, .32)

        wn.resize(educ, 1200, 0., 50.)
        wn.resize(race, 1200, categories=["1", "2"], weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b"]],
                  weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b", "c"]],
                  weights=[[1, 2], [3, 7, 2]])

        wn.sum(educ)
        wn.variance(educ)

    if run:
        analysis.release()

    return analysis
def test_dp_linear_stats(run=True):
    with wn.Analysis() as analysis:
        dataset_pums = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age = dataset_pums['age']
        analysis.release()

        num_records = wn.dp_count(age,
                                  privacy_usage={'epsilon': .5},
                                  lower=0,
                                  upper=10000)
        analysis.release()

        print("number of records:", num_records.value)

        vars = wn.to_float(dataset_pums[["age", "income"]])

        covariance = wn.dp_covariance(data=vars,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=[0., 0.],
                                      data_upper=[150., 150000.],
                                      data_n=num_records)
        print("covariance released")

        num_means = wn.dp_mean(data=vars,
                               privacy_usage={'epsilon': .5},
                               data_lower=[0., 0.],
                               data_upper=[150., 150000.],
                               data_n=num_records)

        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = wn.to_float(age)

        age_variance = wn.dp_variance(age,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=0.,
                                      data_upper=150.,
                                      data_n=num_records)

        analysis.release()
        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties for multiple statistics
        clamped_age = wn.clamp(age, lower=0., upper=100.)
        imputed_age = wn.impute(clamped_age)
        preprocessed_age = wn.resize(imputed_age, n=num_records)

        # properties necessary for mean are statically known
        mean = wn.dp_mean(preprocessed_age, privacy_usage={'epsilon': .5})

        # properties necessary for variance are statically known
        variance = wn.dp_variance(preprocessed_age, privacy_usage={'epsilon': .5})

        # sum doesn't need n, so I pass the data in before resizing
        age_sum = wn.dp_sum(imputed_age, privacy_usage={'epsilon': .5})

        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = wn.dp_mean(-(preprocessed_age + 2.),
                                      privacy_usage={'epsilon': .5})
        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = wn.laplace_mechanism(wn.mean(preprocessed_age),
                                           privacy_usage={'epsilon': .5})
        custom_maximum = wn.laplace_mechanism(wn.maximum(preprocessed_age),
                                              privacy_usage={'epsilon': .5})
        custom_quantile = wn.laplace_mechanism(wn.quantile(preprocessed_age, alpha=.5),
                                               privacy_usage={'epsilon': 500})

        income = wn.to_float(dataset_pums['income'])
        income_max = wn.laplace_mechanism(wn.maximum(income,
                                                     data_lower=0.,
                                                     data_upper=1000000.),
                                          privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments to more components
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        age_histogram = wn.dp_histogram(wn.to_int(age, lower=0, upper=100),
                                        edges=list(range(0, 100, 25)),
                                        null_value=150,
                                        privacy_usage={'epsilon': 2.})
        sex_histogram = wn.dp_histogram(wn.to_bool(dataset_pums['sex'], true_label="1"),
                                        privacy_usage={'epsilon': 2.})
        education_histogram = wn.dp_histogram(dataset_pums['educ'],
                                              categories=["5", "7", "10"],
                                              null_value="-1",
                                              privacy_usage={'epsilon': 2.})

        analysis.release()

        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()

    # get the mean computed when release() was called
    print(mean.value)
    print(variance.value)

    return analysis