def __create_sex_histograms(self, data):
        """Describe two histogram releases over the ``sex`` column.

        The column is converted with ``wn.to_bool`` (true label ``"0"``)
        separately for each release: once for ``wn.dp_histogram`` and once
        for ``wn.histogram`` followed by ``wn.laplace_mechanism``.

        :param data: object indexable by column name (``data['sex']``)
        :return: tuple ``(dp_histogram result, laplace_mechanism result)``
        """
        # release 1: dp_histogram, with ``upper`` taken from self.__nsize
        sex_flag = wn.to_bool(data['sex'], true_label="0")
        geometric_release = wn.dp_histogram(
            sex_flag,
            upper=self.__nsize,
            privacy_usage={'epsilon': .5, 'delta': 0.00001},
        )

        # release 2: plain histogram first, laplace mechanism applied on top
        sex_flag_again = wn.to_bool(data['sex'], true_label="0")
        plain_counts = wn.histogram(sex_flag_again, null_value=True)
        laplace_release = wn.laplace_mechanism(
            plain_counts,
            privacy_usage={"epsilon": 0.4, "delta": .000001},
        )

        return geometric_release, laplace_release
    def __create_state_histograms(self, data):
        """Describe two histogram releases over the ``state`` column.

        Categories come from ``self.get_states()``; the first category is
        used as the null value for both releases.

        :param data: object indexable by column name (``data['state']``)
        :return: tuple ``(dp_histogram result, laplace_mechanism result)``
        """
        state_categories = self.get_states()

        # release 1: dp_histogram over the categorical column
        geometric_release = wn.dp_histogram(
            data['state'],
            categories=state_categories,
            null_value=state_categories[0],
            privacy_usage={'epsilon': 0.2},
        )

        # release 2: plain histogram first, laplace mechanism applied on top
        plain_counts = wn.histogram(
            data['state'],
            categories=state_categories,
            null_value=state_categories[0],
        )
        laplace_release = wn.laplace_mechanism(
            plain_counts,
            privacy_usage={"epsilon": 0.5, "delta": .000001},
        )

        return geometric_release, laplace_release
    def __create_age_histograms(self, data):
        """Describe two histogram releases over the ``age`` column.

        The column is converted with ``wn.to_int(lower=20, upper=80)``
        separately for each release and binned on the edges
        20, 30, ..., 70, with 20 as the null value.

        :param data: object indexable by column name (``data['age']``)
        :return: tuple ``(dp_histogram result, laplace_mechanism result)``
        """
        bin_edges = [*range(20, 80, 10)]

        # release 1: dp_histogram, with ``upper`` taken from self.__nsize
        age_as_int = wn.to_int(data['age'], lower=20, upper=80)
        geometric_release = wn.dp_histogram(
            age_as_int,
            edges=bin_edges,
            upper=self.__nsize,
            null_value=20,
            privacy_usage={'epsilon': 0.5},
        )

        # release 2: plain histogram first, laplace mechanism applied on top
        age_as_int_again = wn.to_int(data['age'], lower=20, upper=80)
        plain_counts = wn.histogram(
            age_as_int_again, edges=bin_edges, null_value=20)
        laplace_release = wn.laplace_mechanism(
            plain_counts,
            privacy_usage={"epsilon": 0.5, "delta": .000001},
        )

        return geometric_release, laplace_release
# Example no. 4
# 0
def test_insertion_simple():
    """
    Walk through a differentially private analysis in which the exact
    aggregates are inserted with ``.set(...)``, as if they had been computed
    by another system, and the noising mechanisms are released afterwards.

    :return: None; the noised values are printed
    """
    with wn.Analysis() as analysis:

        # schema-only dataset (empty path) standing in for the actual data
        dataset = wn.Dataset(path="", column_names=["A", "B", "C", "D"])

        # ---- column "A": mean ----
        a_float = wn.to_float(dataset['A'])

        # mirror the preprocessing that is performed on the actual data
        a_clamped = wn.clamp(a_float, lower=0., upper=10.)
        a_imputed = wn.impute(a_clamped)
        a_resized = wn.resize(a_imputed, n=1000000)

        # placeholder aggregation; its value is supplied from outside
        exact_mean_a = wn.mean(a_resized)
        exact_mean_a.set(10)

        # the differentially private operation on top of the inserted value
        noisy_mean_a = wn.gaussian_mechanism(
            exact_mean_a, privacy_usage={"epsilon": .4, "delta": 1e-6})

        # confirm the analysis is permissible
        analysis.validate()

        # compute whatever is still unreleased, here the gaussian mean
        analysis.release()

        print("gaussian mean", noisy_mean_a.value)

        # ---- column "A": sum and count, released in one batch ----
        exact_sum_a = wn.sum(a_imputed)
        exact_sum_a.set(123456)
        noisy_sum_a = wn.laplace_mechanism(
            exact_sum_a, privacy_usage={"epsilon": .1})

        exact_count_a = wn.count(a_float)
        exact_count_a.set(9876)
        noisy_count_a = wn.simple_geometric_mechanism(
            exact_count_a, 0, 10000, privacy_usage={"epsilon": .1})

        analysis.release()
        print("laplace sum", noisy_sum_a.value)
        print("geometric count", noisy_count_a.value)

        # ---- histograms over "B" (categorical) and "C" (boolean) ----
        b_categorical = wn.clamp(
            dataset['B'], categories=['X', 'Y', 'Z'], null_value="W")
        exact_hist_b = wn.histogram(b_categorical)
        exact_hist_b.set([12, 1280, 2345, 12])
        noisy_hist_b = wn.simple_geometric_mechanism(
            exact_hist_b, 0, 10000, privacy_usage={"epsilon": .1})

        c_bool = wn.to_bool(dataset['C'], true_label="T")
        exact_hist_c = wn.histogram(c_bool)
        exact_hist_c.set([5000, 5000])
        noisy_hist_c = wn.laplace_mechanism(
            exact_hist_c, privacy_usage={"epsilon": .1})

        analysis.release()
        print("noised histogram b", noisy_hist_b.value)
        print("noised histogram c", noisy_hist_c.value)
        print("C dimensionality", c_bool.dimensionality)
        print("C categories", c_bool.categories)

        # ---- multi-column insertion over "C" and "D" ----
        cd_float = wn.to_float(dataset[['C', 'D']])

        # per-column lower bounds, one shared upper bound
        cd_clamped = wn.clamp(cd_float, lower=[0., 5.], upper=1000.)
        cd_resized = wn.resize(wn.impute(cd_clamped), n=10000)

        # placeholder aggregation; one inserted value per column
        exact_mean_cd = wn.mean(cd_resized)
        exact_mean_cd.set([[10., 12.]])

        noisy_mean_cd = wn.gaussian_mechanism(
            exact_mean_cd, privacy_usage={"epsilon": .4, "delta": 1e-6})

        # confirm the analysis is permissible
        analysis.validate()

        # compute whatever is still unreleased, here the gaussian mean
        analysis.release()

        print("rest gaussian mean", noisy_mean_cd.value)
def test_everything(run=True):
    """
    Build one dynamic analysis that touches a broad sweep of components.

    :param run: when truthy, release the analysis once it has been built
    :return: the constructed analysis
    """

    def half_epsilon():
        # builds a new usage dict on every call
        return {"epsilon": 0.5}

    with wn.Analysis(dynamic=True) as analysis:
        dataset = wn.Dataset(path=TEST_CSV_PATH, column_names=test_csv_names)

        # typed views of individual columns
        age_as_int = wn.to_int(dataset['age'], 0, 150)
        is_sex_one = wn.to_bool(dataset['sex'], "1")
        education = wn.to_float(dataset['educ'])
        race_col = dataset['race']
        income_col = wn.to_float(dataset['income'])
        is_married = wn.to_bool(dataset['married'], "1")

        age_income = wn.to_float(dataset[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (is_sex_one + "a").component_id)

        # broadcasting: scalar over 2d, scalar over 1d, columnar, and the
        # scalar on either side of the multiplication
        doubled_pair = age_income * 2.
        doubled_education = 2. * education
        doubled_pair + doubled_education

        # a different offset for each column
        age_income + [[1., 2.]]

        # column selection by position, then by boolean mask
        age_col = age_income[0]
        income_col = age_income[[False, True]]

        # boolean operators and broadcasting
        both_flags = is_sex_one & is_married
        married_flipped = ~is_married ^ False
        row_mask = (both_flags | married_flipped | (age_col > 50.)
                    | (age_as_int == 25))

        # clamping: numeric bounds, then categorical
        wn.clamp(age_income, 0., [150., 150_000.])
        wn.clamp(dataset['educ'],
                 categories=list(map(str, range(8, 10))),
                 null_value="-1")

        wn.count(row_mask)
        wn.covariance(age_col, income_col)
        wn.digitize(education, edges=[1., 3., 10.], null_value=-1)

        # checks for safety against division by zero
        income_col / 2.
        income_col / wn.clamp(education, 5., 20.)

        wn.dp_count(dataset, privacy_usage=half_epsilon())
        wn.dp_count(row_mask, privacy_usage=half_epsilon())

        wn.dp_histogram(row_mask, privacy_usage=half_epsilon())
        age_bounded = wn.impute(wn.clamp(age_col, 0., 150.))
        for dp_statistic in (wn.dp_maximum, wn.dp_minimum, wn.dp_median):
            dp_statistic(age_bounded, privacy_usage=half_epsilon())

        age_fixed_n = wn.resize(age_bounded, n=800)
        wn.dp_mean(age_fixed_n, privacy_usage=half_epsilon())
        wn.dp_moment_raw(age_fixed_n, order=3, privacy_usage=half_epsilon())

        wn.dp_sum(age_bounded, privacy_usage=half_epsilon())
        wn.dp_variance(age_fixed_n, privacy_usage=half_epsilon())

        wn.filter(income_col, row_mask)
        race_counts = wn.histogram(
            race_col, categories=["1", "2", "3"], null_value="3")
        wn.histogram(income_col, edges=[0., 10000., 50000.], null_value=-1)

        wn.dp_histogram(is_married, privacy_usage=half_epsilon())

        # the same histogram passed through both mechanisms
        wn.gaussian_mechanism(
            race_counts, privacy_usage={"epsilon": 0.5, "delta": .000001})
        wn.laplace_mechanism(
            race_counts, privacy_usage={"epsilon": 0.5, "delta": .000001})

        wn.kth_raw_sample_moment(education, k=3)

        education_positive = wn.clamp(education, 0.001, 50.)
        wn.log(education_positive)
        wn.maximum(education)
        wn.mean(education)
        wn.minimum(education)

        education % 2.
        education**2.

        wn.quantile(education, .32)

        # resizing: numeric bounds, categorical, and multi-column variants
        wn.resize(education, 1200, 0., 50.)
        wn.resize(race_col, 1200, categories=["1", "2"], weights=[1, 2])
        wn.resize(
            dataset[["age", "sex"]],
            1200,
            categories=[["1", "2"], ["a", "b"]],
            weights=[1, 2],
        )
        wn.resize(
            dataset[["age", "sex"]],
            1200,
            categories=[["1", "2"], ["a", "b", "c"]],
            weights=[[1, 2], [3, 7, 2]],
        )

        wn.sum(education)
        wn.variance(education)

    if run:
        analysis.release()

    return analysis