def actual_release(self, dataset: object):
    """
    Returns the non-private, exact response from the algorithm
    """
    actual_res = {}
    if self.algorithm == wn.dp_mean:
        with wn.Analysis(filter_level="all") as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = wn.mean(data=data,
                          data_lower=float(min(dataset)),
                          data_upper=float(max(dataset)),
                          data_rows=len(dataset),
                          data_columns=1)
            analysis.release()
            actual_res["__key__"] = agg.value
    elif self.algorithm == wn.dp_sum:
        with wn.Analysis(filter_level="all") as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = wn.sum(data=data,
                         data_lower=float(min(dataset)),
                         data_upper=float(max(dataset)),
                         data_rows=len(dataset),
                         data_columns=1)
            analysis.release()
            actual_res["__key__"] = agg.value
    return Report(actual_res)
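# Hedged usage sketch (not from the original source): `ev` stands in for an
# instance of the enclosing evaluator class, configured with wn.dp_mean. It
# only illustrates how the exact baseline above might be obtained for later
# comparison against noisy releases.
#
#     ev.algorithm = wn.dp_mean
#     baseline = ev.actual_release([1., 2., 3., 4., 5.])  # exact mean, no noise added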
def test_histogram():
    import numpy as np

    # establish data information
    data = np.genfromtxt(TEST_CSV_PATH, delimiter=',', names=True)
    education_categories = ["1", "2", "3", "4", "5", "6", "7", "8", "9",
                            "10", "11", "12", "13", "14", "15", "16", "17"]

    income = list(data[:]['income'])
    income_edges = list(range(0, 100_000, 10_000))

    print('actual', np.histogram(income, bins=income_edges)[0])

    with wn.Analysis() as analysis:
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # cast bounds cover the full histogram range
        income = wn.to_int(data['income'], lower=0, upper=100_000)
        sex = wn.to_bool(data['sex'], true_label="1")

        income_histogram = wn.dp_histogram(income,
                                           edges=income_edges,
                                           privacy_usage={'epsilon': 1.})

        analysis.release()

    print("Income histogram Geometric DP release: " + str(income_histogram.value))
def test_properties():
    with wn.Analysis():
        # load data
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # establish data
        age_dt = wn.cast(data['age'], 'FLOAT')

        # ensure data are non-null
        non_null_age_dt = wn.impute(age_dt, distribution='Uniform', lower=0., upper=100.)
        clamped = wn.clamp(age_dt, lower=0., upper=100.)

        # create potential for null data again
        potentially_null_age_dt = non_null_age_dt / 0.

        # print('original properties:\n{0}\n\n'.format(age_dt.properties))
        print('properties after imputation:\n{0}\n\n'.format(non_null_age_dt.nullity))
        print('properties after nan mult:\n{0}\n\n'.format(potentially_null_age_dt.nullity))

        print("lower", clamped.lower)
        print("upper", clamped.upper)
        print("releasable", clamped.releasable)
        # print("props", clamped.properties)
        print("data_type", clamped.data_type)
        print("categories", clamped.categories)
def __init__(self, dataset, priv_budget=10):
    self._analysis = wn.Analysis()
    self._analysis.__enter__()
    if isinstance(dataset, dict):
        self._dataset = wn.Dataset(value=dataset, column_names=dataset.keys())
    elif isinstance(dataset, list):
        self._dataset = wn.Dataset(value=dataset, num_columns=1)
    else:
        raise ValueError("more complex types not yet handled")
    self._filterresult = wn.Dataset(value=[], num_columns=1)
    # mirror the manual __enter__() above; Analysis has no plain exit() method
    self._analysis.__exit__(None, None, None)
    self.priv_budget = priv_budget
    self.priv_used = 0
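# Usage sketch (hypothetical; the enclosing class name is not shown in this
# snippet, so `PrivateDataset` below is an assumed stand-in). It demonstrates
# the two accepted dataset shapes and the initial budget bookkeeping.
#
#     pd_list = PrivateDataset([1., 2., 3.], priv_budget=5)
#     assert pd_list.priv_used == 0
#     pd_dict = PrivateDataset({'age': [20, 30], 'income': [10., 20.]})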
def test_multilayer_analysis(run=True):
    with wn.Analysis() as analysis:
        PUMS = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age = wn.to_float(PUMS['age'])
        sex = wn.to_bool(PUMS['sex'], true_label="TRUE")

        age_clamped = wn.clamp(age, lower=0., upper=150.)
        age_resized = wn.resize(age_clamped, n=1000)

        mean_age = wn.dp_mean(data=wn.to_float(PUMS['race']),
                              privacy_usage={'epsilon': .65},
                              data_lower=0.,
                              data_upper=100.,
                              data_n=500)
        analysis.release()

        sex_plus_22 = wn.add(wn.to_float(sex),
                             22.,
                             left_n=1000,
                             left_lower=0.,
                             left_upper=1.)

        wn.dp_mean(age_resized / 2. + sex_plus_22,
                   privacy_usage={'epsilon': .1},
                   data_lower=mean_age - 5.2,
                   data_upper=102.,
                   data_n=500) + 5.

        wn.dp_variance(data=wn.to_float(PUMS['educ']),
                       privacy_usage={'epsilon': .15},
                       data_n=1000,
                       data_lower=0.,
                       data_upper=12.)

        # wn.dp_moment_raw(
        #     wn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     data_n=1000000,
        #     data_lower=0.,
        #     data_upper=12.,
        #     order=3
        # )
        #
        # wn.dp_covariance(
        #     left=wn.to_float(PUMS['age']),
        #     right=wn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     left_n=1000,
        #     right_n=1000,
        #     left_lower=0.,
        #     left_upper=1.,
        #     right_lower=0.,
        #     right_upper=1.
        # )

    if run:
        analysis.release()

    return analysis
def test_equal():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)

        equality = data[0] == data[1]

        analysis.release()
        assert np.array_equal(equality.value, np.array([True, False, False, True]))
def test_index():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)

        index_0 = data[0]

        analysis.release()
        assert all(a == b for a, b in zip(index_0.value, [True, True, False, False]))
def test_partition():
    with wn.Analysis(filter_level='all') as analysis:
        data = wn.Dataset(**dataset_bools)[[0, 1]]

        partitioned = wn.partition(data, num_partitions=3)
        analysis.release()

        # print(partitioned.value)
        assert np.array_equal(partitioned.value[0], np.array([[True, True], [True, False]]))
        assert np.array_equal(partitioned.value[1], np.array([[False, True]]))
        assert np.array_equal(partitioned.value[2], np.array([[False, False]]))
def whitenoise_core_dp_multi_agg(self, f, dataset_path, col_names, args, epsilon, kwargs):
    releases = []
    with wn.Analysis() as analysis:
        for _ in range(self.repeat_count):
            df = wn.Dataset(path=dataset_path, column_names=col_names)
            release = f(data=wn.to_float(df[[args[0], args[1]]]),
                        privacy_usage={'epsilon': epsilon},
                        **kwargs)
            releases.append(release)
        analysis.release()
    noisy_values = [release.value[0][0] for release in releases]
    return np.array(noisy_values)
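# Usage sketch (hypothetical): `bench` stands in for an object exposing
# `repeat_count`; the kwargs mirror the dp_covariance arguments used elsewhere
# in these tests. Names here are assumptions for illustration only.
#
#     noisy = bench.whitenoise_core_dp_multi_agg(
#         wn.dp_covariance, TEST_CSV_PATH, test_csv_names,
#         args=['age', 'income'], epsilon=1.0,
#         kwargs={'data_lower': [0., 0.], 'data_upper': [150., 150_000.], 'data_n': 1000})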
def test_dp_count(run=True):
    with wn.Analysis() as analysis:
        dataset_pums = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        count = wn.dp_count(dataset_pums['sex'] == '1',
                            privacy_usage={'epsilon': 0.5})

    if run:
        analysis.release()
        print(count.value)

    return analysis
def test_raw_dataset(run=True):
    with wn.Analysis() as analysis:
        wn.dp_mean(data=wn.Dataset(value=[1., 2., 3., 4., 5.], num_columns=1),
                   privacy_usage={'epsilon': 1},
                   data_lower=0.,
                   data_upper=10.,
                   data_n=10)

    if run:
        analysis.release()

    return analysis
def test_divide():
    with wn.Analysis():
        data_A = wn.Dataset(**generate_synthetic(float, variants=['Random']))

        f_random = data_A['F_Random']
        imputed = wn.impute(f_random, lower=0., upper=10.)
        clamped_nonzero = wn.clamp(imputed, lower=1., upper=10.)
        clamped_zero = wn.clamp(imputed, lower=0., upper=10.)

        # test properties
        assert f_random.nullity
        assert not imputed.nullity
        assert (2. / imputed).nullity
        assert (f_random / imputed).nullity
        assert (2. / clamped_zero).nullity
def test_dp_mean():
    with wn.Analysis():
        data = wn.Dataset(**generate_synthetic(float, variants=['Random']))
        mean = wn.dp_mean(data['F_Random'],
                          # privacy_usage={'epsilon': 0.1},
                          accuracy={'value': .2, 'alpha': .05},
                          data_lower=0.,
                          data_upper=10.,
                          data_n=10)

        print("accuracy", mean.get_accuracy(0.05))
        print(mean.from_accuracy(2.3, .05))
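# A minimal companion sketch (an assumption, mirroring the commented-out
# epsilon above): the same dp_mean specified by privacy usage instead of by a
# target accuracy, then converted back to an accuracy estimate at alpha=.05.
def example_dp_mean_by_epsilon():
    with wn.Analysis():
        data = wn.Dataset(**generate_synthetic(float, variants=['Random']))
        mean = wn.dp_mean(data['F_Random'],
                          privacy_usage={'epsilon': 0.1},
                          data_lower=0.,
                          data_upper=10.,
                          data_n=10)
        # translate the chosen epsilon into an accuracy estimate at alpha=.05
        print("accuracy at alpha=.05:", mean.get_accuracy(0.05))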
def test_covariance():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    data = np.genfromtxt(TEST_CSV_PATH, delimiter=',', names=True)

    with wn.Analysis() as analysis:
        wn_data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # get full covariance matrix
        cov = wn.dp_covariance(data=wn.to_float(wn_data['age', 'sex', 'educ', 'income', 'married']),
                               privacy_usage={'epsilon': 10},
                               data_lower=[0., 0., 1., 0., 0.],
                               data_upper=[100., 1., 16., 500_000., 1.],
                               data_n=1000)
        analysis.release()

    # store DP covariance and correlation matrix
    dp_cov = cov.value
    print(dp_cov)
    dp_corr = dp_cov / np.outer(np.sqrt(np.diag(dp_cov)), np.sqrt(np.diag(dp_cov)))

    # get non-DP covariance/correlation matrices
    age = list(data[:]['age'])
    sex = list(data[:]['sex'])
    educ = list(data[:]['educ'])
    income = list(data[:]['income'])
    married = list(data[:]['married'])
    non_dp_cov = np.cov([age, sex, educ, income, married])
    non_dp_corr = non_dp_cov / np.outer(np.sqrt(np.diag(non_dp_cov)), np.sqrt(np.diag(non_dp_cov)))

    print('Non-DP Covariance Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_cov)))
    print('Non-DP Correlation Matrix:\n{0}\n\n'.format(pd.DataFrame(non_dp_corr)))
    print('DP Correlation Matrix:\n{0}'.format(pd.DataFrame(dp_corr)))

    # skip plot step if IS_CI_BUILD
    if IS_CI_BUILD:
        return

    plt.imshow(non_dp_corr - dp_corr, interpolation='nearest')
    plt.colorbar()
    plt.show()
def release(self, dataset: object) -> Report:
    """
    Releases a report by applying the OpenDP Core algorithm to the dataset,
    or returns the actual (non-private) report if `actual` is set to True
    """
    noisy_res = {"__key__": []}
    # repeat the analysis multiple times to collect enough samples for evaluation
    for _ in range(self.eval_params.repeat_count):
        with wn.Analysis() as analysis:
            data = wn.to_float(wn.Dataset(value=dataset))
            agg = self.algorithm(data=data,
                                 privacy_usage={'epsilon': self.privacy_params.epsilon},
                                 data_lower=float(min(dataset)),
                                 data_upper=float(max(dataset)),
                                 data_rows=len(dataset),
                                 data_columns=1)
            analysis.release()
            noisy_res["__key__"].append(agg.value)
    return Report(noisy_res)
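# Hedged evaluation helper (not part of the original class): given the exact
# value from actual_release and the repeated samples collected by release, a
# simple utility metric is the mean absolute error of the noisy responses.
def example_mean_absolute_error(exact_value, noisy_values):
    # average distance between each noisy release and the exact answer
    return sum(abs(v - exact_value) for v in noisy_values) / len(noisy_values)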
def test_insertion_simple():
    """
    Conduct a differentially private analysis with values inserted from other systems
    :return:
    """
    with wn.Analysis() as analysis:
        # construct a fake dataset that describes your actual data (will never be run)
        data = wn.Dataset(path="", column_names=["A", "B", "C", "D"])

        # pull a column out
        col_a = wn.to_float(data['A'])

        # describe the preprocessing you actually perform on the data
        col_a_clamped = wn.impute(wn.clamp(col_a, lower=0., upper=10.))
        col_a_resized = wn.resize(col_a_clamped, n=1000000)

        # run a fake aggregation
        actual_mean = wn.mean(col_a_resized)

        # insert aggregated data from an external system
        actual_mean.set(10)

        # describe the differentially private operation
        gaussian_mean = wn.gaussian_mechanism(actual_mean,
                                              privacy_usage={"epsilon": .4, "delta": 1e-6})

        # check if the analysis is permissible
        analysis.validate()

        # compute the missing releasable nodes; in this case, only the gaussian mean
        analysis.release()

        # retrieve the noised mean
        print("gaussian mean", gaussian_mean.value)

        # release a couple other statistics using other mechanisms in the same batch
        actual_sum = wn.sum(col_a_clamped)
        actual_sum.set(123456)
        laplace_sum = wn.laplace_mechanism(actual_sum, privacy_usage={"epsilon": .1})

        actual_count = wn.count(col_a)
        actual_count.set(9876)
        geo_count = wn.simple_geometric_mechanism(actual_count, 0, 10000,
                                                  privacy_usage={"epsilon": .1})

        analysis.release()

        print("laplace sum", laplace_sum.value)
        print("geometric count", geo_count.value)

        actual_histogram_b = wn.histogram(wn.clamp(data['B'],
                                                   categories=['X', 'Y', 'Z'],
                                                   null_value="W"))
        actual_histogram_b.set([12, 1280, 2345, 12])
        geo_histogram_b = wn.simple_geometric_mechanism(actual_histogram_b, 0, 10000,
                                                        privacy_usage={"epsilon": .1})

        col_c = wn.to_bool(data['C'], true_label="T")
        actual_histogram_c = wn.histogram(col_c)
        actual_histogram_c.set([5000, 5000])
        lap_histogram_c = wn.laplace_mechanism(actual_histogram_c,
                                               privacy_usage={"epsilon": .1})

        analysis.release()

        print("noised histogram b", geo_histogram_b.value)
        print("noised histogram c", lap_histogram_c.value)

        print("C dimensionality", col_c.dimensionality)
        print("C categories", col_c.categories)

        # multicolumnar insertion
        # pull a column out
        col_rest = wn.to_float(data[['C', 'D']])

        # describe the preprocessing you actually perform on the data
        col_rest_resized = wn.resize(wn.impute(wn.clamp(col_rest,
                                                        lower=[0., 5.],
                                                        upper=1000.)),
                                     n=10000)

        # run a fake aggregation
        actual_mean = wn.mean(col_rest_resized)

        # insert aggregated data from an external system
        actual_mean.set([[10., 12.]])

        # describe the differentially private operation
        gaussian_mean = wn.gaussian_mechanism(actual_mean,
                                              privacy_usage={"epsilon": .4, "delta": 1e-6})

        # check if the analysis is permissible
        analysis.validate()

        # compute the missing releasable nodes; in this case, only the gaussian mean
        analysis.release()

        # retrieve the noised mean
        print("rest gaussian mean", gaussian_mean.value)
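# Condensed insertion sketch (an illustration assuming the same API as the
# test above): the three essential steps are (1) describe a fake aggregate,
# (2) set() it to the externally computed value, (3) noise and release it.
def example_insert_external_count():
    with wn.Analysis() as analysis:
        data = wn.Dataset(path="", column_names=["A"])
        actual_count = wn.count(wn.to_float(data['A']))
        actual_count.set(12345)  # value computed by an external system
        noised = wn.simple_geometric_mechanism(actual_count, 0, 100000,
                                               privacy_usage={"epsilon": .5})
        analysis.release()
        return noised.value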
def create_dicts(data, non_income_data, plausible_variable_combinations):
    count_dict = dict()
    priv_count_dict = dict()
    mean_income_dict = dict()
    priv_mean_income_dict = dict()
    median_income_dict = dict()
    priv_median_income_dict = dict()
    min_income_dict = dict()
    priv_min_income_dict = dict()
    max_income_dict = dict()
    priv_max_income_dict = dict()

    # get number of data elements with each set of variable values
    for i, combination in enumerate(plausible_variable_combinations):
        # print('run {0} of {1}'.format(i+1, len(plausible_variable_combinations)))

        # combine the per-variable conditions, whatever the combination length
        # (replaces explicit `&` chains for lengths 1 through 5)
        mask = np.logical_and.reduce([non_income_data[var] == 1 for var in combination])
        dt = data[mask]

        key = '__'.join(combination)
        count_dict[key] = dt.shape[0]
        mean_income_dict[key] = np.mean(dt['income'])
        median_income_dict[key] = np.median(dt['income'])
        min_income_dict[key] = np.min(dt['income'])
        max_income_dict[key] = np.max(dt['income'])

        with wn.Analysis() as analysis:
            # load data
            priv_data = wn.Dataset(value=list(dt['income']), num_columns=1)

            # estimate sample size
            count = wn.dp_count(data=wn.cast(priv_data, 'FLOAT'),
                                privacy_usage={'epsilon': .05},
                                lower=0,
                                upper=1000)
            analysis.release()

        priv_count_dict[key] = max(0, count.value)

        with wn.Analysis() as analysis:
            # load data
            priv_data = wn.Dataset(value=list(dt['income']), num_columns=1)

            # get mean
            mean = wn.dp_mean(data=wn.cast(priv_data, 'FLOAT'),
                              privacy_usage={'epsilon': 0.1},
                              data_lower=0.,
                              data_upper=100_000.,
                              data_n=max(1, count.value))

            # get median
            median = wn.dp_median(data=wn.cast(priv_data, 'FLOAT'),
                                  privacy_usage={'epsilon': 0.1},
                                  data_lower=0.,
                                  data_upper=100_000.,
                                  data_n=max(1, count.value))

            # get min
            _min = wn.dp_minimum(data=wn.cast(priv_data, 'FLOAT'),
                                 privacy_usage={'epsilon': 0.1},
                                 data_lower=0.,
                                 data_upper=100_000.,
                                 data_n=max(1, count.value))

            # get max
            _max = wn.dp_maximum(data=wn.cast(priv_data, 'FLOAT'),
                                 privacy_usage={'epsilon': 0.1},
                                 data_lower=0.,
                                 data_upper=100_000.,
                                 data_n=max(1, count.value))
            analysis.release()

        # clamp released values to the plausible [0, 100_000] income range
        priv_mean_income_dict[key] = min(max(0, mean.value), 100_000)
        priv_median_income_dict[key] = min(max(0, median.value), 100_000)
        priv_min_income_dict[key] = min(max(0, _min.value), 100_000)
        priv_max_income_dict[key] = min(max(0, _max.value), 100_000)

    return (count_dict, priv_count_dict,
            mean_income_dict, priv_mean_income_dict,
            median_income_dict, priv_median_income_dict,
            min_income_dict, priv_min_income_dict,
            max_income_dict, priv_max_income_dict)
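# Hedged note (illustration, not original code): the min(max(...)) clamping in
# create_dicts is post-processing of already-released values, so it consumes no
# additional privacy budget. A small helper makes that intent explicit.
def clip_release(value, lower=0, upper=100_000):
    # clamp a released value into its plausible range (pure post-processing)
    return min(max(lower, value), upper)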
def __get_dp_noise_dataset(self):
    df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()
    df.to_csv('tmp.csv', index=False)
    return wn.Dataset(path='tmp.csv', column_names=self.get_columns()), len(df.index)
def test_everything(run=True):
    with wn.Analysis(dynamic=True) as analysis:
        data = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age_int = wn.to_int(data['age'], 0, 150)
        sex = wn.to_bool(data['sex'], "1")
        educ = wn.to_float(data['educ'])
        race = data['race']
        income = wn.to_float(data['income'])
        married = wn.to_bool(data['married'], "1")

        numerics = wn.to_float(data[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (sex + "a").component_id)

        # broadcast scalar over 2d, broadcast scalar over 1d, columnar broadcasting, left and right mul
        numerics * 2. + 2. * educ

        # add different values for each column
        numerics + [[1., 2.]]

        # index into first column
        age = numerics[0]
        income = numerics[[False, True]]

        # boolean ops and broadcasting
        mask = sex & married | (~married ^ False) | (age > 50.) | (age_int == 25)

        # numerical clamping
        wn.clamp(numerics, 0., [150., 150_000.])
        wn.clamp(data['educ'], categories=[str(i) for i in range(8, 10)], null_value="-1")

        wn.count(mask)
        wn.covariance(age, income)
        wn.digitize(educ, edges=[1., 3., 10.], null_value=-1)

        # checks for safety against division by zero
        income / 2.
        income / wn.clamp(educ, 5., 20.)

        wn.dp_count(data, privacy_usage={"epsilon": 0.5})
        wn.dp_count(mask, privacy_usage={"epsilon": 0.5})

        wn.dp_histogram(mask, privacy_usage={"epsilon": 0.5})
        age = wn.impute(wn.clamp(age, 0., 150.))
        wn.dp_maximum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_minimum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_median(age, privacy_usage={"epsilon": 0.5})

        age_n = wn.resize(age, n=800)
        wn.dp_mean(age_n, privacy_usage={"epsilon": 0.5})
        wn.dp_moment_raw(age_n, order=3, privacy_usage={"epsilon": 0.5})

        wn.dp_sum(age, privacy_usage={"epsilon": 0.5})
        wn.dp_variance(age_n, privacy_usage={"epsilon": 0.5})

        wn.filter(income, mask)
        race_histogram = wn.histogram(race, categories=["1", "2", "3"], null_value="3")
        wn.histogram(income, edges=[0., 10000., 50000.], null_value=-1)

        wn.dp_histogram(married, privacy_usage={"epsilon": 0.5})

        wn.gaussian_mechanism(race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})
        wn.laplace_mechanism(race_histogram, privacy_usage={"epsilon": 0.5, "delta": .000001})

        wn.kth_raw_sample_moment(educ, k=3)

        wn.log(wn.clamp(educ, 0.001, 50.))
        wn.maximum(educ)
        wn.mean(educ)
        wn.minimum(educ)

        educ % 2.
        educ ** 2.

        wn.quantile(educ, .32)

        wn.resize(educ, 1200, 0., 50.)
        wn.resize(race, 1200, categories=["1", "2"], weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b"]],
                  weights=[1, 2])
        wn.resize(data[["age", "sex"]], 1200,
                  categories=[["1", "2"], ["a", "b", "c"]],
                  weights=[[1, 2], [3, 7, 2]])

        wn.sum(educ)
        wn.variance(educ)

    if run:
        analysis.release()

    return analysis
def test_dp_linear_stats(run=True):
    with wn.Analysis() as analysis:
        dataset_pums = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        age = dataset_pums['age']
        analysis.release()

        num_records = wn.dp_count(age,
                                  privacy_usage={'epsilon': .5},
                                  lower=0,
                                  upper=10000)
        analysis.release()

        print("number of records:", num_records.value)

        vars = wn.to_float(dataset_pums[["age", "income"]])

        covariance = wn.dp_covariance(data=vars,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=[0., 0.],
                                      data_upper=[150., 150000.],
                                      data_n=num_records)
        print("covariance released")

        num_means = wn.dp_mean(data=vars,
                               privacy_usage={'epsilon': .5},
                               data_lower=[0., 0.],
                               data_upper=[150., 150000.],
                               data_n=num_records)

        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = wn.to_float(age)

        age_variance = wn.dp_variance(age,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=0.,
                                      data_upper=150.,
                                      data_n=num_records)

        analysis.release()
        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties for multiple statistics
        clamped_age = wn.clamp(age, lower=0., upper=100.)
        imputed_age = wn.impute(clamped_age)
        preprocessed_age = wn.resize(imputed_age, n=num_records)

        # properties necessary for mean are statically known
        mean = wn.dp_mean(preprocessed_age, privacy_usage={'epsilon': .5})

        # properties necessary for variance are statically known
        variance = wn.dp_variance(preprocessed_age, privacy_usage={'epsilon': .5})

        # sum doesn't need n, so I pass the data in before resizing
        age_sum = wn.dp_sum(imputed_age, privacy_usage={'epsilon': .5})

        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = wn.dp_mean(-(preprocessed_age + 2.),
                                      privacy_usage={'epsilon': .5})
        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = wn.laplace_mechanism(wn.mean(preprocessed_age),
                                           privacy_usage={'epsilon': .5})
        custom_maximum = wn.laplace_mechanism(wn.maximum(preprocessed_age),
                                              privacy_usage={'epsilon': .5})
        custom_quantile = wn.laplace_mechanism(wn.quantile(preprocessed_age, alpha=.5),
                                               privacy_usage={'epsilon': 500})

        income = wn.to_float(dataset_pums['income'])
        income_max = wn.laplace_mechanism(wn.maximum(income,
                                                     data_lower=0.,
                                                     data_upper=1000000.),
                                          privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments to more components
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        age_histogram = wn.dp_histogram(wn.to_int(age, lower=0, upper=100),
                                        edges=list(range(0, 100, 25)),
                                        null_value=150,
                                        privacy_usage={'epsilon': 2.})
        sex_histogram = wn.dp_histogram(wn.to_bool(dataset_pums['sex'], true_label="1"),
                                        privacy_usage={'epsilon': 2.})
        education_histogram = wn.dp_histogram(dataset_pums['educ'],
                                              categories=["5", "7", "10"],
                                              null_value="-1",
                                              privacy_usage={'epsilon': 2.})

        analysis.release()

        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()

    # get the mean computed when release() was called
    print(mean.value)
    print(variance.value)

    return analysis