Exemple #1
0
def test_divide():
    """Check nullity propagation through impute, clamp, and division."""
    with sn.Analysis():
        synthetic = generate_synthetic(float, variants=['Random'])

        raw = synthetic['F_Random']
        no_nulls = sn.impute(raw, lower=0., upper=10.)
        bounded_nonzero = sn.clamp(no_nulls, lower=1., upper=10.)
        bounded_zero = sn.clamp(no_nulls, lower=0., upper=10.)

        # raw data may contain nulls; imputation removes them
        assert raw.nullity
        assert not no_nulls.nullity
        # dividing by a value whose range includes zero re-introduces nullity
        assert (2. / no_nulls).nullity
        assert (raw / no_nulls).nullity
        assert (2. / bounded_zero).nullity
Exemple #2
0
def test_fail_groupby():
    """Union partition means where only one branch spends privacy, then noise."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        floated = sn.to_float(data[['educ', 'income']])
        educ_inc = sn.impute(
            sn.clamp(floated, lower=[0., 0.], upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        # static properties shared by both means
        shared_args = {
            "data_lower": [0., 0.],
            "data_upper": [15., 200_000.],
            "data_rows": 500
        }

        union = sn.union({
            True: sn.mean(partitioned[True],
                          privacy_usage={"epsilon": 0.1},
                          **shared_args),
            False: sn.mean(partitioned[False], **shared_args),
        })

        sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

        print(analysis.privacy_usage)
Exemple #3
0
def test_groupby_c_stab():
    """Reuse one partition twice in a union to inflate c-stability."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        def summarize(part):
            # resize to a fixed row count so the mean's properties are known
            return sn.mean(sn.resize(part, number_rows=500))

        union = sn.union({
            True: summarize(partitioned[True]),
            False: summarize(partitioned[False]),
            # same partition used a second time: bumps the stability multiplier
            "duplicate_that_inflates_c_stab": summarize(partitioned[True]),
        })
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
Exemple #4
0
def test_groupby_4():
    """Union private partition means, then apply the mechanism afterwards."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        # one (resized) mean per grouping category
        means = {
            category: sn.mean(sn.resize(partitioned[category],
                                        number_rows=500))
            for category in is_male.categories
        }

        union = sn.union(means)
        noised = sn.laplace_mechanism(union, privacy_usage={"epsilon": 1.0})

    analysis.release()
    print(analysis.privacy_usage)
    print(noised.value)
Exemple #5
0
def test_groupby_3():
    """Release a dp_mean inside each partition, then union the releases."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        # each branch is privatized before the union
        means = {
            category: sn.dp_mean(
                sn.resize(partitioned[category], number_rows=500),
                privacy_usage={"epsilon": 1.0})
            for category in is_male.categories
        }

        union = sn.union(means)

    analysis.release()
    print(analysis.privacy_usage)
    print(union.value)
Exemple #6
0
def test_properties():
    """Inspect statically-derived properties: nullity, bounds, type, categories."""
    with sn.Analysis():
        # load data
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        # cast the age column to floats
        age = sn.cast(data['age'], 'FLOAT')

        # imputation guarantees non-null data
        age_non_null = sn.impute(age,
                                 distribution='Uniform',
                                 lower=0.,
                                 upper=100.)
        age_clamped = sn.clamp(age, lower=0., upper=100.)

        # dividing by zero re-introduces the possibility of nulls
        age_maybe_null = age_non_null / 0.

        print('properties after imputation:\n{0}\n\n'.format(
            age_non_null.nullity))
        print('properties after nan mult:\n{0}\n\n'.format(
            age_maybe_null.nullity))

        print("lower", age_clamped.lower)
        print("upper", age_clamped.upper)
        print("releasable", age_clamped.releasable)
        print("data_type", age_clamped.data_type)
        print("categories", age_clamped.categories)
Exemple #7
0
def test_dataframe_partitioning_2():
    """Partition a dataframe by a two-column (multi-index) grouper."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        educ_levels = [str(level) for level in range(14)]
        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[['0', '1'], educ_levels],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        # per-partition DP counts, kept nested (flatten=False)
        dp_counts = {}
        for key in partitioned.partition_keys:
            dp_counts[key] = sn.dp_count(partitioned[key],
                                         privacy_usage={"epsilon": 0.5})
        sn.union(dp_counts, flatten=False)

        # per-partition plug-in DP means over income
        dp_means = {
            key: sn.dp_mean(sn.to_float(partitioned[key]['income']),
                            implementation="plug-in",
                            data_lower=0.,
                            data_upper=200_000.,
                            privacy_usage={"epsilon": 0.5})
            for key in partitioned.partition_keys
        }
        print(sn.union(dp_means))
Exemple #8
0
def test_multilayer_partition_1():
    """Two-layer partitioning: split by sex, then within each sex partition
    repartition income by education level, applying dp_count/dp_mean inside
    the innermost layer and unioning results back up both layers.
    """
    # multilayer partition with mechanisms applied inside partitions
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        is_male = sn.to_bool(data['sex'], true_label="1")
        educ_inc = sn.impute(
            sn.clamp(sn.to_float(data[['educ', 'income']]),
                     lower=[0., 0.],
                     upper=[15., 200_000.]))

        partitioned = sn.partition(educ_inc, by=is_male)

        def analyze(data):
            # column 0 of educ_inc is educ; clamp it into the known category set
            educ = sn.clamp(sn.to_int(sn.index(data, indices=0),
                                      lower=0,
                                      upper=15),
                            categories=list(range(15)),
                            null_value=-1)
            income = sn.index(data, indices=1)
            repartitioned = sn.partition(income, by=educ)

            inner_count = {}
            inner_means = {}
            for key in [5, 8, 12]:
                educ_level_part = repartitioned[key]

                inner_count[key] = sn.dp_count(educ_level_part,
                                               privacy_usage={"epsilon": 0.4})
                # the noisy count (floored at 1) supplies the row count that
                # dp_mean needs as a static property
                inner_means[key] = sn.dp_mean(educ_level_part,
                                              privacy_usage={"epsilon": 0.6},
                                              data_rows=sn.row_max(
                                                  1, inner_count[key]))

            return sn.union(inner_means,
                            flatten=False), sn.union(inner_count,
                                                     flatten=False)

        means = {}
        counts = {}
        for key in partitioned.partition_keys:
            part_means, part_counts = analyze(partitioned[key])
            means[key] = part_means
            counts[key] = part_counts

        # outer-layer unions over the per-sex results (kept nested)
        means = sn.union(means, flatten=False)
        counts = sn.union(counts, flatten=False)

        # analysis.plot()
    print("releasing")
    print(len(analysis.components.items()))
    analysis.release()
    print(analysis.privacy_usage)
    print("Counts:")
    print(counts.value)

    print("Means:")
    print(means.value)
Exemple #9
0
def test_private_clamped_sum_helpers():
    """Check the smartnoise dp_sum CI against the eeprivacy helper."""
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        ages = sn.to_float(data["age"])
        clamped = sn.clamp(data=ages, lower=0.0, upper=100.0)
        resized = sn.resize(clamped, number_rows=1000)
        release = sn.dp_sum(data=sn.impute(resized),
                            privacy_usage={"epsilon": 1.0})
    smartnoise_ci = release.get_accuracy(0.05)

    # Compute the CI with eeprivacy
    op = PrivateClampedSum(lower_bound=0, upper_bound=100)
    eeprivacy_ci = op.confidence_interval(epsilon=1, confidence=0.95)

    # the two libraries should agree to within a milliunit
    assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci
Exemple #10
0
def try_sn():
    """Run a small smartnoise simulation over PUMS ages: preprocess
    (clamp + resize), release `n_sims` DP means, then report how often the
    accuracy interval covers the true mean of the clamped data.
    """
    # establish data information
    #data_path = 'https://raw.githubusercontent.com/opendp/smartnoise-samples/86-requirements-fix/analysis/data/PUMS_california_demographics_1000/data.csv'
    data_path = os.path.join('.', 'data', 'PUMS_california_demographics_1000',
                             'data.csv')
    data_path = os.path.abspath(data_path)
    print('data_path', data_path)
    var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]
    D = pd.read_csv(data_path)['age']
    D_mean_age = np.mean(D)
    print('D_mean_age', D_mean_age)

    # establish extra information for this simulation
    age_lower_bound = 0.
    age_upper_bound = 100.
    # D_tilde is the clamped dataset: the quantity the DP mean should track
    D_tilde = np.clip(D, age_lower_bound, age_upper_bound)
    D_tilde_mean_age = np.mean(D_tilde)
    data_size = 1000

    df = pd.read_csv(data_path)
    df_as_array = [list(row) for row in df.itertuples()]
    #df.values.tolist()
    print('D.values', df_as_array)

    n_sims = 2
    releases = []
    with sn.Analysis(dynamic=True) as analysis:
        data = sn.Dataset(path=data_path, column_names=var_names)
        #data = sn.Dataset(value=df_as_array, column_names=var_names)
        D = sn.to_float(data['age'])
        # preprocess data (resize is a no-op because we have the correct data size)
        D_tilde = sn.resize(sn.clamp(data=D, lower=0., upper=100.),
                            number_rows=data_size)

        for index in range(n_sims):
            # get DP mean of age
            releases.append(
                sn.dp_mean(data=sn.impute(D_tilde),
                           privacy_usage={'epsilon': 1}))

    # accuracy at 95% confidence; identical for every release (same epsilon)
    accuracy = releases[0].get_accuracy(0.05)

    analysis.release()
    dp_values = [release.value for release in releases]
    # fraction of simulated releases whose accuracy interval covers the truth
    print(
        'Accuracy interval (with accuracy value {0}) contains the true mean on D_tilde with probability {1}'
        .format(
            round(accuracy, 4),
            np.mean([(D_tilde_mean_age >= val - accuracy) &
                     (D_tilde_mean_age <= val + accuracy)
                     for val in dp_values])))
Exemple #11
0
def test_map_2():
    """Map dp_count over a large number of tuple-keyed dataframe partitions."""
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        educ_levels = [str(level) for level in range(14)]
        grouper = sn.clamp(data[['sex', 'educ']],
                           categories=[['0', '1'], educ_levels],
                           null_value='-1')
        partitioned = sn.partition(data, by=grouper)

        # dp_count maps over every partition at once
        counts = sn.dp_count(partitioned, privacy_usage={"epsilon": 0.5})

        print(counts.value)
        print(analysis.privacy_usage)
def test_mechanism(args, constructor):
    """Build statistics with `constructor` over prepared PUMS data and assert
    every released statistic has a value.

    :param args: extra arguments forwarded to `constructor`
    :param constructor: callable(numeric, categorical, args) returning a dict
        of named statistic components
    """
    with sn.Analysis() as analysis:
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)
        categorical = sn.resize(sn.clamp(PUMS['sex'],
                                         categories=["0", "1"],
                                         null_value="0"),
                                number_rows=1000)

        numeric = sn.impute(sn.to_float(PUMS['age']),
                            data_lower=0.,
                            data_upper=100.,
                            data_rows=1000)

        # renamed from `all`, which shadowed the builtin of the same name
        components = constructor(numeric, categorical, args)

        analysis.release()
        released = {stat: components[stat].value for stat in components}
        print()
        pprint(released)

        # every statistic must have produced a release
        for value in released.values():
            assert value is not None
Exemple #13
0
        def analyze(data):
            """Within one partition: repartition income by education level,
            then for selected levels release a DP count and a mean over data
            resized to a count-derived number of rows.

            NOTE(review): assumes column 0 is educ and column 1 is income —
            matches the [educ, income] ordering used elsewhere in this file;
            confirm against the caller.
            """
            educ = sn.clamp(sn.to_int(sn.index(data, indices=0),
                                      lower=0,
                                      upper=15),
                            categories=list(range(15)),
                            null_value=-1)
            income = sn.index(data, indices=1)
            repartitioned = sn.partition(income, by=educ)

            inner_count = {}
            inner_means = {}
            for key in [5, 8, 12]:
                educ_level_part = repartitioned[key]

                inner_count[key] = sn.dp_count(educ_level_part,
                                               privacy_usage={"epsilon": 0.4})
                # resize to ~4/5 of the noisy count (at least 1 row) before
                # taking the mean, so the mean's row count is statically known
                inner_means[key] = sn.mean(
                    sn.resize(educ_level_part,
                              number_rows=sn.row_min(1, inner_count[key] * 4 //
                                                     5)))

            return sn.union(inner_means), sn.union(inner_count)
Exemple #14
0
def test_private_clamped_mean_helpers():
    """Compare smartnoise and eeprivacy on CI width and the epsilon needed
    to hit a target CI."""
    # Compute the CI with smartnoise
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_DATA_PATH, column_names=TEST_DATA_COLUMNS)
        ages = sn.to_float(data["age"])
        prepared = sn.resize(sn.clamp(data=ages, lower=0.0, upper=100.0),
                             number_rows=1000)
        release = sn.dp_mean(data=sn.impute(prepared),
                             privacy_usage={"epsilon": 1.0})
    smartnoise_ci = release.get_accuracy(0.05)

    # Compute the CI with eeprivacy
    op = PrivateClampedMean(lower_bound=0, upper_bound=100)
    eeprivacy_ci = op.confidence_interval(epsilon=1, N=1000, confidence=0.95)

    # Compare computed confidence intervals
    assert pytest.approx(smartnoise_ci, abs=0.001) == eeprivacy_ci

    smartnoise_epsilon = release.from_accuracy(value=1, alpha=0.05)[0]["epsilon"]
    eeprivacy_epsilon = op.epsilon_for_confidence_interval(
        target_ci=1, N=1000, confidence=0.95)

    # Compare computed epsilons for confidence interval
    assert pytest.approx(smartnoise_epsilon, abs=0.001) == eeprivacy_epsilon
def create_dicts(data, non_income_data, plausible_variable_combinations):
    """For each plausible combination of indicator variables, compute exact
    and differentially-private income summary statistics.

    :param data: dataframe containing an 'income' column
    :param non_income_data: dataframe of 0/1 indicator columns, row-aligned
        with `data`
    :param plausible_variable_combinations: iterable of sequences of indicator
        column names; each combination selects rows where every indicator is 1
    :return: 10-tuple of dicts keyed by the '__'-joined combination:
        (count, priv_count, mean, priv_mean, median, priv_median,
         min, priv_min, max, priv_max)
    """
    count_dict = dict()
    priv_count_dict = dict()

    mean_income_dict = dict()
    priv_mean_income_dict = dict()

    median_income_dict = dict()
    priv_median_income_dict = dict()

    min_income_dict = dict()
    priv_min_income_dict = dict()

    max_income_dict = dict()
    priv_max_income_dict = dict()

    # get number of data elements with each set of variable values
    for combination in plausible_variable_combinations:
        # Build the row mask generically for any combination length.
        # (The previous if/elif chain only handled lengths 1-5 and silently
        # reused the prior iteration's `dt` for longer combinations.)
        mask = np.logical_and.reduce(
            [(non_income_data[var] == 1) for var in combination])
        dt = data[mask]

        key = '__'.join(combination)
        count_dict[key] = dt.shape[0]
        mean_income_dict[key] = np.mean(dt['income'])
        median_income_dict[key] = np.median(dt['income'])
        min_income_dict[key] = np.min(dt['income'])
        max_income_dict[key] = np.max(dt['income'])

        with sn.Analysis() as analysis:
            # load data
            priv_data = sn.Dataset(value=dt['income'])
            # estimate sample size
            count = sn.dp_count(priv_data, privacy_usage={'epsilon': .05})
            # preprocess data: resize to the (floored) noisy count
            priv_data = sn.resize(sn.to_float(priv_data),
                                  number_columns=1,
                                  number_rows=sn.row_max(1, count),
                                  lower=0.,
                                  upper=100_000.)
            priv_data = sn.impute(sn.clamp(priv_data, lower=0.,
                                           upper=100_000.))
            # private statistics, each with its own epsilon budget
            mean = sn.dp_mean(priv_data, privacy_usage={'epsilon': 0.1})
            median = sn.dp_median(priv_data, privacy_usage={'epsilon': 0.1})
            _min = sn.dp_minimum(priv_data, privacy_usage={'epsilon': 0.1})
            _max = sn.dp_maximum(priv_data, privacy_usage={'epsilon': 0.1})
            analysis.release()

            # post-process: clip released values into the plausible range
            priv_count_dict[key] = max(0, count.value)
            priv_mean_income_dict[key] = min(max(0, mean.value), 100_000)
            priv_median_income_dict[key] = min(max(0, median.value), 100_000)
            priv_min_income_dict[key] = min(max(0, _min.value), 100_000)
            priv_max_income_dict[key] = min(max(0, _max.value), 100_000)

    return (count_dict, priv_count_dict, mean_income_dict,
            priv_mean_income_dict, median_income_dict, priv_median_income_dict,
            min_income_dict, priv_min_income_dict, max_income_dict,
            priv_max_income_dict)
Exemple #16
0
def test_dp_linear_stats(run=True):
    """Exercise the linear DP statistics: count, covariance, mean, variance,
    sum, quantile, and histograms, with incremental releases along the way.

    :param run: when True, perform a final release and print the mean/variance
    :return: the constructed Analysis
    """
    with sn.Analysis() as analysis:
        dataset_pums = sn.Dataset(path=TEST_PUMS_PATH,
                                  column_names=TEST_PUMS_NAMES)

        age = dataset_pums['age']
        analysis.release()

        num_records = sn.dp_count(age,
                                  privacy_usage={'epsilon': .5},
                                  lower=0,
                                  upper=10000)
        analysis.release()

        print("number of records:", num_records.value)

        # renamed from `vars`, which shadowed the builtin of the same name
        age_income = sn.to_float(dataset_pums[["age", "income"]])

        covariance = sn.dp_covariance(data=age_income,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=[0., 0.],
                                      data_upper=[150., 150000.],
                                      data_rows=num_records)
        print("covariance released")

        num_means = sn.dp_mean(data=age_income,
                               privacy_usage={'epsilon': .5},
                               data_lower=[0., 0.],
                               data_upper=[150., 150000.],
                               data_rows=num_records)

        analysis.release()
        print("covariance:\n", covariance.value)
        print("means:\n", num_means.value)

        age = sn.to_float(age)

        age_variance = sn.dp_variance(age,
                                      privacy_usage={'epsilon': .5},
                                      data_lower=0.,
                                      data_upper=150.,
                                      data_rows=num_records)

        analysis.release()

        print("age variance:", age_variance.value)

        # If I clamp, impute, resize, then I can reuse their properties for multiple statistics
        clamped_age = sn.clamp(age, lower=0., upper=100.)
        imputed_age = sn.impute(clamped_age)
        preprocessed_age = sn.resize(imputed_age, number_rows=num_records)

        # properties necessary for mean are statically known
        mean = sn.dp_mean(preprocessed_age, privacy_usage={'epsilon': .5})

        # properties necessary for variance are statically known
        variance = sn.dp_variance(preprocessed_age,
                                  privacy_usage={'epsilon': .5})

        # sum doesn't need n, so I pass the data in before resizing
        age_sum = sn.dp_sum(imputed_age, privacy_usage={'epsilon': .5})

        # mean with lower, upper properties propagated up from prior bounds
        transformed_mean = sn.dp_mean(-(preprocessed_age + 2.),
                                      privacy_usage={'epsilon': .5})

        analysis.release()
        print("age transformed mean:", transformed_mean.value)

        # releases may be pieced together from combinations of smaller components
        custom_mean = sn.laplace_mechanism(sn.mean(preprocessed_age),
                                           privacy_usage={'epsilon': .5})

        # (a verbatim duplicate of this statement was removed here; it spent
        # an extra epsilon=.5 on an identical, immediately-overwritten release)
        custom_maximum = sn.laplace_mechanism(sn.maximum(preprocessed_age),
                                              privacy_usage={'epsilon': .5})

        custom_quantile = sn.laplace_mechanism(sn.quantile(preprocessed_age,
                                                           alpha=.5),
                                               privacy_usage={'epsilon': 500})

        income = sn.to_float(dataset_pums['income'])
        income_max = sn.laplace_mechanism(sn.maximum(income,
                                                     data_lower=0.,
                                                     data_upper=1000000.),
                                          privacy_usage={'epsilon': 10})

        # releases may also be postprocessed and reused as arguments to more components
        age_sum + custom_maximum * 23.

        analysis.release()
        print("laplace quantile:", custom_quantile.value)

        age_histogram = sn.dp_histogram(sn.to_int(age, lower=0, upper=100),
                                        edges=list(range(0, 100, 25)),
                                        null_value=150,
                                        privacy_usage={'epsilon': 2.})

        sex_histogram = sn.dp_histogram(sn.to_bool(dataset_pums['sex'],
                                                   true_label="1"),
                                        privacy_usage={'epsilon': 2.})

        education_histogram = sn.dp_histogram(dataset_pums['educ'],
                                              categories=["5", "7", "10"],
                                              null_value="-1",
                                              privacy_usage={'epsilon': 2.})

        analysis.release()

        print("age histogram: ", age_histogram.value)
        print("sex histogram: ", sex_histogram.value)
        print("education histogram: ", education_histogram.value)

    if run:
        analysis.release()

        # get the mean computed when release() was called
        print(mean.value)
        print(variance.value)

    return analysis
Exemple #17
0
def test_everything(run=True):
    """Smoke-test a broad sweep of the sn API surface: casts, arithmetic and
    broadcasting, boolean ops, clamping, linear statistics, mechanisms,
    histograms, and resizes.

    :param run: when True, release the analysis before returning it
    :return: the constructed Analysis
    """
    with sn.Analysis() as analysis:
        data = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age_int = sn.to_int(data['age'], 0, 150)
        sex = sn.to_bool(data['sex'], "1")
        educ = sn.to_float(data['educ'])
        race = data['race']
        income = sn.to_float(data['income'])
        married = sn.to_bool(data['married'], "1")

        numerics = sn.to_float(data[['age', 'income']])

        # intentionally busted component
        # print("invalid component id ", (sex + "a").component_id)

        # broadcast scalar over 2d, broadcast scalar over 1d, columnar broadcasting, left and right mul
        numerics * 2. + 2. * educ

        # add different values for each column
        numerics + [[1., 2.]]

        # index into first column
        age = sn.index(numerics, indices=0)
        income = sn.index(numerics, mask=[False, True])

        # boolean ops and broadcasting
        mask = sex & married | (~married ^ False) | (age > 50.) | (age_int
                                                                   == 25)

        # numerical clamping
        sn.clamp(numerics, 0., [150., 150_000.])
        sn.clamp(data['educ'],
                 categories=[str(i) for i in range(8, 10)],
                 null_value="-1")

        sn.count(mask)
        sn.covariance(age, income)
        sn.digitize(educ, edges=[1., 3., 10.], null_value=-1)

        # checks for safety against division by zero
        income / 2.
        income / sn.clamp(educ, 5., 20.)

        sn.dp_count(data, privacy_usage={"epsilon": 0.5})
        sn.dp_count(mask, privacy_usage={"epsilon": 0.5})

        sn.dp_histogram(mask, privacy_usage={"epsilon": 0.5})
        # impute after clamping so subsequent stats see non-null, bounded data
        age = sn.impute(sn.clamp(age, 0., 150.))
        sn.dp_maximum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_minimum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_median(age, privacy_usage={"epsilon": 0.5})

        # mean/raw_moment/variance need a known row count, hence the resize
        age_n = sn.resize(age, number_rows=800)
        sn.dp_mean(age_n, privacy_usage={"epsilon": 0.5})
        sn.dp_raw_moment(age_n, order=3, privacy_usage={"epsilon": 0.5})

        sn.dp_sum(age, privacy_usage={"epsilon": 0.5})
        sn.dp_variance(age_n, privacy_usage={"epsilon": 0.5})

        sn.filter(income, mask)
        race_histogram = sn.histogram(race,
                                      categories=["1", "2", "3"],
                                      null_value="3")
        sn.histogram(income, edges=[0., 10000., 50000.], null_value=-1)

        sn.dp_histogram(married, privacy_usage={"epsilon": 0.5})

        sn.gaussian_mechanism(race_histogram,
                              privacy_usage={
                                  "epsilon": 0.5,
                                  "delta": .000001
                              })
        sn.laplace_mechanism(race_histogram,
                             privacy_usage={
                                 "epsilon": 0.5,
                                 "delta": .000001
                             })

        sn.raw_moment(educ, order=3)

        # clamp away zero before log to keep the domain valid
        sn.log(sn.clamp(educ, 0.001, 50.))
        sn.maximum(educ)
        sn.mean(educ)
        sn.minimum(educ)

        educ % 2.
        educ**2.

        sn.quantile(educ, .32)

        # resize variants: numeric bounds, categorical weights, multi-column
        sn.resize(educ, number_rows=1200, lower=0., upper=50.)
        sn.resize(race,
                  number_rows=1200,
                  categories=["1", "2"],
                  weights=[1, 2])
        sn.resize(data[["age", "sex"]],
                  1200,
                  categories=[["1", "2"], ["a", "b"]],
                  weights=[1, 2])
        sn.resize(data[["age", "sex"]],
                  1200,
                  categories=[["1", "2"], ["a", "b", "c"]],
                  weights=[[1, 2], [3, 7, 2]])

        sn.sum(educ)
        sn.variance(educ)

    if run:
        analysis.release()

    return analysis
Exemple #18
0
def test_multilayer_analysis(run=True):
    """Chain multiple preprocessing and DP steps, feeding one release
    (mean_age) into the bounds of a later dp_mean.

    :param run: when True, perform a final release before returning
    :return: the constructed Analysis
    """
    with sn.Analysis() as analysis:
        PUMS = sn.Dataset(path=TEST_PUMS_PATH, column_names=TEST_PUMS_NAMES)

        age = sn.to_float(PUMS['age'])
        sex = sn.to_bool(PUMS['sex'], true_label="TRUE")

        age_clamped = sn.clamp(age, lower=0., upper=150.)
        age_resized = sn.resize(age_clamped, number_rows=1000)

        race = sn.to_float(PUMS['race'])
        mean_age = sn.dp_mean(data=race,
                              privacy_usage={'epsilon': .65},
                              data_lower=0.,
                              data_upper=100.,
                              data_rows=500)
        analysis.release()

        # explicit static properties supplied for the addition's left operand
        sex_plus_22 = sn.add(sn.to_float(sex),
                             22.,
                             left_rows=1000,
                             left_lower=0.,
                             left_upper=1.)

        # the released mean_age feeds the lower bound of this dp_mean
        sn.dp_mean(age_resized / 2. + sex_plus_22,
                   privacy_usage={'epsilon': .1},
                   data_lower=mean_age - 5.2,
                   data_upper=102.,
                   data_rows=500) + 5.

        sn.dp_variance(data=sn.to_float(PUMS['educ']),
                       privacy_usage={'epsilon': .15},
                       data_rows=1000,
                       data_lower=0.,
                       data_upper=12.)

        # sn.dp_raw_moment(
        #     sn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     data_rows=1000000,
        #     data_lower=0.,
        #     data_upper=12.,
        #     order=3
        # )
        #
        # sn.dp_covariance(
        #     left=sn.to_float(PUMS['age']),
        #     right=sn.to_float(PUMS['married']),
        #     privacy_usage={'epsilon': .15},
        #     left_rows=1000,
        #     right_rows=1000,
        #     left_lower=0.,
        #     left_upper=1.,
        #     right_lower=0.,
        #     right_upper=1.
        # )

    if run:
        analysis.release()

    return analysis