def __init__(self, total_privacy_budget: PrivacyBudget):
    """Set up budget tracking with nothing consumed yet.

    :param total_privacy_budget: The total privacy budget that can be consumed
        by the private table. When there is no privacy budget left, stop
        answering queries.
    """
    # Consumption starts from a zero (epsilon, delta) budget.
    nothing_spent = PrivacyBudget(0., 0.)
    self.total_privacy_budget = total_privacy_budget
    self.consumed_privacy_budget = nothing_spent
def example_private_table():
    """Build a PrivateTable over the Age and Education columns of the adult data.

    Reads ``dataset/adult_data.txt`` as a header-less CSV, keeps only the
    'Age' and 'Education' columns, and wraps them in a PrivateTable with a
    total budget of ``PrivacyBudget(100000.0, 1.)``.
    """
    csv_columns = [
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
        "Martial Status", "Occupation", "Relationship", "Race", "Sex",
        "Capital Gain", "Capital Loss", "Hours per week", "Country", "Target"
    ]
    csv_path = os.path.join("dataset", "adult_data.txt")
    adult_data = pd.read_csv(csv_path, names=csv_columns)

    # Only these two columns are exposed through the private table.
    data = {
        column: adult_data[column].tolist()
        for column in ('Age', 'Education')
    }
    df = pd.DataFrame(data)

    # Category labels keep the leading space found in the raw file.
    education_levels = [
        ' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'
    ]
    domains = {
        'Age': RealDataDomain(17., 90.),
        'Education': CategoricalDataDomain(education_levels)
    }
    total_budget = PrivacyBudget(100000.0, 1.)
    return PrivateTable(df, domains, total_budget)
def test_private_gaussian_mean(example_private_table: PrivateTable):
    """Check the private Gaussian mean of 'Age', averaged over 100 queries."""
    samples = []
    for _ in range(100):
        # Each query gets its own budget object.
        budget = PrivacyBudget(0.99, 0.5)
        samples.append(example_private_table.gaussian_mean('Age', budget))
    noisy_mean = np.mean(samples)
    check_absolute_error(noisy_mean, 33.2, 10.)
def update_privacy_loss(self,
                        sampling_ratio: float,
                        sigma: float,
                        steps: int,
                        moment_order: int = 32,
                        target_eps: Union[float, None] = None,
                        target_delta: Union[float, None] = None):
    r"""Calculate and update privacy loss.

    Must specify exactly either one of `target_eps` or `target_delta`.

    The log moments of orders ``1 .. moment_order`` are computed with
    ``compute_log_moment`` and converted by ``get_privacy_spent``; the
    resulting budget is added to ``self.consumed_privacy_budget``.

    :param sampling_ratio: Ratio of data used to total data in one step
    :param sigma: Noise scale
    :param steps: Number of update performed
    :param moment_order: Maximum order of moment to calculate privacy budget, defaults to 32
    :param target_eps: Target value of :math:`\epsilon`, defaults to None
    :param target_delta: Target value of :math:`\delta`, defaults to None
    :raises AssertionError: If a supplied target is not positive, or if the
        accumulated consumption would exceed ``self.total_privacy_budget``.
    """
    assert (target_eps is None) or (
        target_eps > 0), "Value of epsilon should be positive"
    assert (target_delta is None) or (
        target_delta > 0), "Value of delta should be positive"

    # One (order, log-moment) pair per moment order from 1 to moment_order.
    log_moments = [(i, compute_log_moment(sampling_ratio, sigma, steps, i))
                   for i in range(1, moment_order + 1)]
    privacy = get_privacy_spent(log_moments, target_eps, target_delta)
    # NOTE(review): privacy[0] / privacy[1] are presumably (epsilon, delta) —
    # confirm against get_privacy_spent.
    privacy_budget = PrivacyBudget(privacy[0], privacy[1])

    # Commit the new total only after the budget check passes.
    e = self.consumed_privacy_budget + privacy_budget
    assert e <= self.total_privacy_budget, "there is not enough privacy budget."
    self.consumed_privacy_budget = e
def test_private_categorical_hist(example_private_table: PrivateTable):
    """Check the private histogram of the categorical 'Name' column."""
    budget = PrivacyBudget(10000.)
    noisy_hist = example_private_table.cat_hist('Name', budget)
    # Compare sorted counts so the check does not depend on category order.
    noisy_hist.sort()
    expected_counts = [1, 1, 2]
    tolerance = [1, 1, 1]
    deviation = np.abs(noisy_hist - expected_counts)
    assert all(deviation < tolerance)
    del noisy_hist
def test_private_SGD(data):
    """Fit a line with private_SGD and check the privacy budget it consumed.

    The first 800 items of ``data`` are used for training and the rest for
    evaluation. The model is ``y = param[0] * x + param[1]``.
    """
    train_data, test_data = data[:800], data[800:]
    # Shared by the closures below; updated in place during training.
    param = np.random.rand(2)

    def gradient_function(batch_data):
        x, y = batch_data
        residual = y - (param[0] * x + param[1])
        # Gradient of the squared error w.r.t. param[0] and param[1].
        return [-2.0 * x * residual, -2.0 * residual]

    def get_weights_function():
        return np.copy(param)

    def learning_rate_function(step):
        # Step-wise decaying schedule.
        if step < 10:
            return 0.1
        if step < 50:
            return 0.01
        return 0.005

    def update_weights_function(new_weight):
        # Write in place so every closure sees the new weights.
        param[:] = new_weight

    def test_function():
        n = len(test_data)
        x = np.array([point[0] for point in test_data])
        y = np.array([point[1] for point in test_data])
        squared_error = (param[0] * x + param[1] - y)**2
        loss = 1.0 / n * np.sum(squared_error)
        check_absolute_error(loss, 0., 20.)

    moment_accountant = MomentPrivacyBudgetTracker(PrivacyBudget(10, 0.001))
    private_SGD(
        train_data=train_data,
        gradient_function=gradient_function,
        get_weights_function=get_weights_function,
        update_weights_function=update_weights_function,
        learning_rate_function=learning_rate_function,
        test_function=test_function,
        test_interval=100,
        group_size=100,
        gradient_norm_bound=10,
        number_of_steps=100,
        sigma=1,
        moment_privacy_budget_tracker=moment_accountant,
    )

    consumed = moment_accountant.consumed_privacy_budget
    check_absolute_error(consumed.epsilon, 8.805554, 1e-6)
    check_absolute_error(
        moment_accountant.consumed_privacy_budget.delta, 0.000625, 1e-6)
def test_private_categorical_hist(example_private_table: PrivateTable):
    """Check the private histogram of the categorical 'Class' column (iris).

    bins: Iris-setosa, Iris-versicolor, Iris-virginica
    """
    budget = PrivacyBudget(10000.)
    noisy_hist = example_private_table.cat_hist('Class', budget)
    # Compare sorted counts so the check does not depend on category order.
    noisy_hist.sort()
    expected_counts = [50, 50, 50]
    tolerance = [1, 1, 1]
    deviation = np.abs(noisy_hist - expected_counts)
    assert all(deviation < tolerance)
    del noisy_hist
def example_private_table():
    """Build a four-row PrivateTable with 'Name' and 'Age' columns."""
    names = ['Tom', 'Jack', 'Steve', 'Jack']
    ages = [28, 34, 29, 42]
    df = pd.DataFrame({'Name': names, 'Age': ages})

    # The categorical domain lists more names than appear in the data.
    allowed_names = ['Tom', 'Jack', 'Steve', 'Eve', 'Adam', 'Lucifer']
    domains = {
        'Name': CategoricalDataDomain(allowed_names),
        'Age': RealDataDomain(0., 130.)
    }
    total_budget = PrivacyBudget(100000.0, 1000.)
    return PrivateTable(df, domains, total_budget)
def test_column_names(example_table: DataFrame):
    """Check that the 'Age' and 'Name' columns are present in the table."""
    allowed_names = ['Tom', 'Jack', 'Steve', 'Eve', 'Adam', 'Lucifer']
    domains = {
        'Name': CategoricalDataDomain(allowed_names),
        'Age': RealDataDomain(0., 130.)
    }
    t = PrivateTable(example_table, domains, PrivacyBudget(1.0, 0.))
    for column in ('Age', 'Name'):
        assert column in t._columns
def test_column_names(example_table: DataFrame):
    """Check that every iris column named in the domains is in the table."""
    species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    domains = {
        'Sepal Length': RealDataDomain(0., 10.),
        'Sepal Width': RealDataDomain(0., 10.),
        'Petal Length': RealDataDomain(0., 10.),
        'Petal Width': RealDataDomain(0., 10.),
        'Class': CategoricalDataDomain(species)
    }
    t = PrivateTable(example_table, domains, PrivacyBudget(1.0, 0.))
    expected_columns = ('Sepal Length', 'Sepal Width', 'Petal Length',
                        'Petal Width', 'Class')
    for column in expected_columns:
        assert column in t._columns
def test_private_numerical_hist(example_private_table: PrivateTable):
    """Check the private histogram of the numerical 'Age' column.

    bins:        |.......|.......|.......|
    boundaries:  a0      a1      a2      a3
    """
    bins: List[float] = [20, 30, 40, 50]  # [a0, a1, a2, a3]
    budget = PrivacyBudget(10000.)
    noisy_hist = example_private_table.num_hist('Age', bins, budget)
    # Compare sorted counts rather than per-bin positions.
    noisy_hist.sort()
    expected_counts = [1, 1, 2]
    tolerance = [1, 1, 1]
    deviation = np.abs(noisy_hist - expected_counts)
    assert all(deviation < tolerance)
    del noisy_hist, bins
def test_column_names(example_table: DataFrame):
    """Check that the 'Age' and 'Education' columns are present in the table."""
    education_levels = [
        ' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
        ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
        ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
        ' Preschool', ' 12th'
    ]
    domains = {
        'Age': RealDataDomain(0., 130.),
        'Education': CategoricalDataDomain(education_levels)
    }
    t = PrivateTable(example_table, domains, PrivacyBudget(1.0, 0.))
    for column in ('Age', 'Education'):
        assert column in t._columns
def example_private_table():
    """Build a PrivateTable over all five columns of the iris data.

    Reads ``dataset/iris_data.txt`` as a header-less CSV. The four
    measurement columns get the real domain [0, 10] and 'Class' gets a
    categorical domain of the three species labels.
    """
    columns = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width",
               "Class"]
    csv_path = os.path.join("dataset", "iris_data.txt")
    iris_data = pd.read_csv(csv_path, names=columns)

    data = {column: iris_data[column].tolist() for column in columns}
    df = pd.DataFrame(data)

    # Every column except the trailing 'Class' is a measurement.
    domains = {column: RealDataDomain(0., 10.) for column in columns[:-1]}
    domains['Class'] = CategoricalDataDomain(
        ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

    total_budget = PrivacyBudget(100000.0, 1.)
    return PrivateTable(df, domains, total_budget)
def test_private_categorical_hist(example_private_table: PrivateTable):
    """Check the private histogram of the categorical 'Education' column (adult).

    bins: HS-grad, Bachelors etc
    """
    budget = PrivacyBudget(10000.)
    noisy_hist = example_private_table.cat_hist('Education', budget)
    # Compare sorted counts so the check does not depend on category order.
    noisy_hist.sort()
    expected_counts = [
        51, 168, 333, 413, 433, 514, 576, 646, 933, 1067, 1175, 1382, 1723,
        5355, 7291, 10501
    ]
    tolerance = [1] * 16  # one entry per expected count
    deviation = np.abs(noisy_hist - expected_counts)
    assert all(deviation < tolerance)
    del noisy_hist
def test_private_numerical_hist(example_private_table: PrivateTable):
    """Check the private histogram of the numerical 'Age' column (adult).

    bins: 17, 18, 19 ... 90
    """
    bins: List[float] = list(range(17, 91))
    budget = PrivacyBudget(10000.)
    noisy_hist = example_private_table.num_hist('Age', bins, budget)
    # Compare sorted counts rather than per-bin positions.
    noisy_hist.sort()
    expected_counts = [
        1, 1, 3, 3, 6, 10, 12, 20, 22, 22, 23, 29, 43, 45, 46, 51, 64, 67,
        72, 89, 108, 120, 150, 151, 178, 208, 230, 258, 300, 312, 355, 358,
        366, 366, 395, 415, 419, 464, 478, 543, 550, 577, 595, 602, 708, 712,
        720, 724, 734, 737, 753, 765, 770, 780, 785, 794, 798, 808, 813, 816,
        827, 828, 835, 841, 858, 861, 867, 875, 876, 877, 886, 888, 898
    ]
    tolerance = [1] * 73  # 74 bin boundaries give 73 bins
    deviation = np.abs(noisy_hist - expected_counts)
    assert all(deviation < tolerance)
    del noisy_hist, bins
def update_privacy_loss(self,
                        privacy_budget: PrivacyBudget,
                        delta_prime: float,
                        k: int = 1):
    r"""Calculate and update privacy loss of multiple query with same privacy_budget.

    With :math:`\epsilon` and :math:`\delta` taken from ``privacy_budget``,
    the budget charged for the ``k`` queries is

    .. math::

        \epsilon_{total} = \sqrt{2 k \ln(1 / \delta')} \, \epsilon
                           + k \epsilon (e^{\epsilon} - 1), \qquad
        \delta_{total} = k \delta + \delta'

    and it is added to ``self.consumed_privacy_budget``.

    :param privacy_budget: Privacy budget of query
    :param delta_prime: Value of :math:`\delta'`
    :param k: Number of query, defaults to 1
    :raises AssertionError: If ``delta_prime`` is not positive, or if the
        accumulated consumption would exceed ``self.total_privacy_budget``.
    """
    assert delta_prime > 0, "Value of delta should be positive"

    epsilon = privacy_budget.epsilon
    kfold_epsilon = (np.sqrt(2 * k * np.log(1 / delta_prime)) * epsilon +
                     k * epsilon * (np.exp(epsilon) - 1))
    kfold_delta = k * privacy_budget.delta + delta_prime
    kfold_privacy_budget = PrivacyBudget(kfold_epsilon, kfold_delta)

    # Commit the new total only after the budget check passes.
    e = self.consumed_privacy_budget + kfold_privacy_budget
    assert e <= self.total_privacy_budget, "there is not enough privacy budget."
    self.consumed_privacy_budget = e
def test_private_std(example_private_table: PrivateTable):
    """Check the private standard deviation of the 'Age' column."""
    budget = PrivacyBudget(10000.)
    expected_std, tolerance = 5.54, 1.
    noisy_std = example_private_table.std('Age', budget)
    check_absolute_error(noisy_std, expected_std, tolerance)
    del noisy_std
def test_private_var(example_private_table: PrivateTable):
    """Check the private variance of 'Age' in the adult dataset."""
    budget = PrivacyBudget(10000.)
    expected_var, tolerance = 186.06140024879625, 2.
    noisy_var = example_private_table.var('Age', budget)
    check_absolute_error(noisy_var, expected_var, tolerance)
    del noisy_var
def test_private_std(example_private_table: PrivateTable):
    """Check the private standard deviation of 'Age' in the adult dataset."""
    budget = PrivacyBudget(10000.)
    expected_std, tolerance = 13.640432553581146, 1.
    noisy_std = example_private_table.std('Age', budget)
    check_absolute_error(noisy_std, expected_std, tolerance)
    del noisy_std
def test_private_mean(example_private_table: PrivateTable):
    """Check the private mean of the 'Age' column."""
    budget = PrivacyBudget(10000.)
    expected_mean, tolerance = 33.2, 1.
    noisy_mean = example_private_table.mean('Age', budget)
    check_absolute_error(noisy_mean, expected_mean, tolerance)
def test_privacy_budget_class():
    """Check that adding two PrivacyBudget objects sums both components."""
    first = PrivacyBudget(1., 0.01)
    second = PrivacyBudget(0.2, 0.004)
    expected_sum = PrivacyBudget(1 + 0.2, 0.01 + 0.004)
    assert expected_sum == first + second
def test_private_mode(example_private_table: PrivateTable):
    """Check the private mode of 'Education' in the adult dataset."""
    budget = PrivacyBudget(10000.)
    noisy_mode = example_private_table.mode('Education', budget)
    # The label keeps the leading space used by the raw data.
    expected_mode = " HS-grad"
    assert noisy_mode == expected_mode
    del noisy_mode
def test_private_min(example_private_table: PrivateTable):
    """Check the private minimum of 'Age' in the adult dataset."""
    budget = PrivacyBudget(10000.)
    expected_min, tolerance = 17., 1.
    noisy_min = example_private_table.min('Age', budget)
    check_absolute_error(noisy_min, expected_min, tolerance)
    del noisy_min
def test_private_mode(example_private_table: PrivateTable):
    """Check the private mode of the 'Name' column."""
    budget = PrivacyBudget(10000.)
    noisy_mode = example_private_table.mode('Name', budget)
    expected_mode = "Jack"
    assert noisy_mode == expected_mode
    del noisy_mode
def test_private_min(example_private_table: PrivateTable):
    """Check the private minimum of the 'Age' column."""
    budget = PrivacyBudget(10000.)
    expected_min, tolerance = 28., 1.
    noisy_min = example_private_table.min('Age', budget)
    check_absolute_error(noisy_min, expected_min, tolerance)
    del noisy_min
def test_combine_privacy_losses():
    """Check that combine_privacy_losses sums both components of two budgets."""
    losses = [PrivacyBudget(1., 0.01), PrivacyBudget(0.2, 0.004)]
    combined = combine_privacy_losses(losses)
    expected_combined = PrivacyBudget(1. + 0.2, 0.01 + 0.004)
    assert combined == expected_combined
def test_private_var(example_private_table: PrivateTable):
    """Check the private variance of the 'Age' column."""
    budget = PrivacyBudget(10000.)
    expected_var, tolerance = 30.69, 2.
    noisy_var = example_private_table.var('Age', budget)
    check_absolute_error(noisy_var, expected_var, tolerance)
    del noisy_var
def test_private_gaussian_mean(example_private_table: PrivateTable):
    """Check the private Gaussian mean of 'Age' in the adult dataset."""
    budget = PrivacyBudget(0.99, 0.5)
    expected_mean, tolerance = 38.58164675532078, 1.
    noisy_mean = example_private_table.gaussian_mean('Age', budget)
    check_absolute_error(noisy_mean, expected_mean, tolerance)
def test_private_mean(example_private_table: PrivateTable):
    """Check the private mean of 'Age' in the adult dataset."""
    budget = PrivacyBudget(10000.)
    expected_mean, tolerance = 38.58164675532078, 1.
    noisy_mean = example_private_table.mean('Age', budget)
    check_absolute_error(noisy_mean, expected_mean, tolerance)
def test_private_max(example_private_table: PrivateTable):
    """Check the private maximum of the 'Age' column."""
    budget = PrivacyBudget(10000.)
    expected_max, tolerance = 42., 1.
    noisy_max = example_private_table.max('Age', budget)
    check_absolute_error(noisy_max, expected_max, tolerance)
    del noisy_max