def test_check_params(art_warning, tabular_dl_estimator_for_attack):
    """
    Check that the attack constructor rejects invalid parameters.

    Every keyword-argument set listed below is expected to make
    ``AttributeInferenceBlackBox`` raise ``ValueError``.
    """
    rejected_kwargs = (
        {"attack_feature": "a"},
        {"attack_feature": -1},
        {"prediction_normal_factor": -1},
    )
    try:
        classifier = tabular_dl_estimator_for_attack(AttributeInferenceBlackBox)
        for kwargs in rejected_kwargs:
            with pytest.raises(ValueError):
                AttributeInferenceBlackBox(classifier, **kwargs)
    except ARTTestException as e:
        art_warning(e)
def test_black_box_no_values(art_warning, decision_tree_estimator, get_iris_dataset, model_type):
    """
    Black-box attribute inference on a bucketed iris feature, calling ``infer`` without ``values``.

    The attacked column is bucketed into three categories, the attack is fitted on training
    data that contains the bucketed column, and the inferred column is compared with the
    bucketed ground truth on the train and test splits.
    """
    try:
        target_column = 2  # petal length

        def bucketize(column):
            # In-place mapping of the attacked feature onto the categories 0.0 / 1.0 / 2.0.
            column[column > 0.5] = 2.0
            column[(column > 0.2) & (column <= 0.5)] = 1.0
            column[column <= 0.2] = 0.0

        def predicted_labels(estimator, samples):
            # Arg-max of every prediction row, returned as an (n, 1) column.
            scores = estimator.predict(samples)
            return np.array([np.argmax(row) for row in scores]).reshape(-1, 1)

        def match_rate(inferred, truth):
            # Number of matching entries divided by the number of inferred samples.
            return np.sum(inferred == truth.reshape(1, -1)) / len(inferred)

        (x_train_iris, _), (x_test_iris, _) = get_iris_dataset

        # Training split: everything but the attacked column, plus the bucketed attacked column.
        train_without_target = np.delete(x_train_iris, target_column, 1)
        train_target = x_train_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(train_target)

        # Re-insert the bucketed column at its original position.
        x_train = np.concatenate(
            (
                train_without_target[:, :target_column],
                train_target,
                train_without_target[:, target_column:],
            ),
            axis=1,
        )

        # Test split: same preparation, but the full matrix is not rebuilt.
        test_without_target = np.delete(x_test_iris, target_column, 1)
        test_target = x_test_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(test_target)

        classifier = decision_tree_estimator()
        attack = AttributeInferenceBlackBox(
            classifier, attack_feature=target_column, attack_model_type=model_type
        )

        # Predictions of the attacked model on the untouched iris data.
        train_predictions = predicted_labels(classifier, x_train_iris)
        test_predictions = predicted_labels(classifier, x_test_iris)

        # Train the attack model, then infer the attacked feature.
        attack.fit(x_train)
        inferred_train = attack.infer(train_without_target, pred=train_predictions)
        inferred_test = attack.infer(test_without_target, pred=test_predictions)

        train_acc = match_rate(inferred_train, train_target)
        test_acc = match_rate(inferred_test, test_target)
        assert pytest.approx(0.8285, abs=0.12) == train_acc
        assert pytest.approx(0.8888, abs=0.18) == test_acc
    except ARTTestException as e:
        art_warning(e)
def test_black_box_one_hot(art_warning, get_iris_dataset):
    """
    Black-box attribute inference where the attacked feature is one-hot encoded.

    The attacked iris column is bucketed into categories, one-hot encoded, and re-inserted
    at its original position. A decision tree is trained on the encoded data and attacked;
    an inferred row counts as correct only when all one-hot columns match.

    The width of the one-hot encoding is derived once from the training data and reused for
    the test encoding and for the attacked slice, so the three always agree even if the test
    split happens not to contain the highest category.
    """
    try:
        attack_feature = 2  # petal length

        # need to transform attacked feature into categorical
        def transform_feature(x):
            x[x > 0.5] = 2
            x[(x > 0.2) & (x <= 0.5)] = 1
            x[x <= 0.2] = 0

        def to_one_hot(feature, width):
            # One row per sample, a single 1 in the column given by the category index.
            encoded = np.zeros((feature.size, width))
            encoded[np.arange(feature.size), feature.ravel().astype(int)] = 1
            return encoded

        (x_train_iris, y_train_iris), (x_test_iris, _) = get_iris_dataset

        # training data without attacked feature
        x_train_for_attack = np.delete(x_train_iris, attack_feature, 1)
        # only attacked feature
        x_train_feature = x_train_iris[:, attack_feature].copy().reshape(-1, 1)
        transform_feature(x_train_feature)

        # transform to one-hot encoding; this width is shared by train, test and the slice
        num_columns = int(x_train_feature.max()) + 1
        train_one_hot = to_one_hot(x_train_feature, num_columns)

        # training data with attacked feature (after transformation)
        x_train = np.concatenate(
            (x_train_for_attack[:, :attack_feature], train_one_hot, x_train_for_attack[:, attack_feature:]),
            axis=1,
        )
        y_train = np.array([np.argmax(y) for y in y_train_iris]).reshape(-1, 1)

        # test data without attacked feature
        x_test_for_attack = np.delete(x_test_iris, attack_feature, 1)
        # only attacked feature
        x_test_feature = x_test_iris[:, attack_feature].copy().reshape(-1, 1)
        transform_feature(x_test_feature)

        # same width as the training encoding (not the test split's own maximum)
        test_one_hot = to_one_hot(x_test_feature, num_columns)

        # test data with attacked feature (after transformation)
        x_test = np.concatenate(
            (x_test_for_attack[:, :attack_feature], test_one_hot, x_test_for_attack[:, attack_feature:]),
            axis=1,
        )

        tree = DecisionTreeClassifier()
        tree.fit(x_train, y_train)
        classifier = ScikitlearnDecisionTreeClassifier(tree)

        attack = AttributeInferenceBlackBox(
            classifier, attack_feature=slice(attack_feature, attack_feature + num_columns)
        )

        # get original model's predictions
        x_train_predictions = np.array([np.argmax(arr) for arr in classifier.predict(x_train)]).reshape(-1, 1)
        x_test_predictions = np.array([np.argmax(arr) for arr in classifier.predict(x_test)]).reshape(-1, 1)

        # train attack model
        attack.fit(x_train)

        # infer attacked feature
        inferred_train = attack.infer(x_train_for_attack, x_train_predictions)
        inferred_test = attack.infer(x_test_for_attack, x_test_predictions)

        # check accuracy: a row is correct only if every one-hot column matches
        train_acc = np.sum(np.all(inferred_train == train_one_hot, axis=1)) / len(inferred_train)
        test_acc = np.sum(np.all(inferred_test == test_one_hot, axis=1)) / len(inferred_test)
        assert pytest.approx(0.9145, abs=0.03) == train_acc
        assert pytest.approx(0.9333, abs=0.03) == test_acc
    except ARTTestException as e:
        art_warning(e)
def test_black_box_with_model(art_warning, decision_tree_estimator, get_iris_dataset):
    """
    Black-box attribute inference using a caller-supplied attack model.

    A ``PyTorchClassifier`` wrapping a single linear layer is passed as ``attack_model``.
    The attack is fitted and run on both splits; the resulting accuracies are computed but
    not asserted (the original assertions are disabled).

    NOTE(review): a second function named ``test_black_box_with_model`` is defined later in
    this file; if both live in the same module, the later definition replaces this one.
    """
    try:
        target_column = 2  # petal length
        values = [0.0, 1.0, 2.0]

        def bucketize(column):
            # In-place mapping of the attacked feature onto the categories 0.0 / 1.0 / 2.0.
            column[column > 0.5] = 2.0
            column[(column > 0.2) & (column <= 0.5)] = 1.0
            column[column <= 0.2] = 0.0

        def predicted_labels(estimator, samples):
            # Arg-max of every prediction row, returned as an (n, 1) column.
            scores = estimator.predict(samples)
            return np.array([np.argmax(row) for row in scores]).reshape(-1, 1)

        (x_train_iris, _), (x_test_iris, _) = get_iris_dataset

        # Training split: drop the attacked column, bucketize a copy of it, then re-insert it.
        train_without_target = np.delete(x_train_iris, target_column, 1)
        train_target = x_train_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(train_target)
        x_train = np.concatenate(
            (
                train_without_target[:, :target_column],
                train_target,
                train_without_target[:, target_column:],
            ),
            axis=1,
        )

        # Test split: attacked column removed, bucketed ground truth kept separately.
        test_without_target = np.delete(x_test_iris, target_column, 1)
        test_target = x_test_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(test_target)

        # Attack model: one linear layer with cross-entropy loss and Adam.
        linear_net = nn.Linear(4, 3)
        criterion = nn.CrossEntropyLoss()
        adam = optim.Adam(linear_net.parameters(), lr=0.01)
        attack_model = PyTorchClassifier(
            model=linear_net,
            clip_values=(0, 1),
            loss=criterion,
            optimizer=adam,
            input_shape=(4,),
            nb_classes=3,
        )

        classifier = decision_tree_estimator()
        attack = AttributeInferenceBlackBox(classifier, attack_model=attack_model, attack_feature=target_column)

        # Predictions of the attacked model on the untouched iris data.
        train_predictions = predicted_labels(classifier, x_train_iris)
        test_predictions = predicted_labels(classifier, x_test_iris)

        # Train the attack model, then infer the attacked feature.
        attack.fit(x_train)
        inferred_train = attack.infer(train_without_target, train_predictions, values=values)
        inferred_test = attack.infer(test_without_target, test_predictions, values=values)

        # Accuracies are evaluated and discarded. The disabled assertions expected
        # train ~0.5523 and test ~0.5777, each with abs=0.03.
        _ = np.sum(inferred_train == train_target.reshape(1, -1)) / len(inferred_train)
        _ = np.sum(inferred_test == test_target.reshape(1, -1)) / len(inferred_test)
    except ARTTestException as e:
        art_warning(e)
def test_black_box_with_model(get_tabular_classifier_list, get_iris_dataset):
    """
    Black-box attribute inference with a caller-supplied attack model, per listed classifier.

    Only classifiers whose type name is ``ScikitlearnDecisionTreeClassifier`` are attacked.
    When the fixture yields no classifiers, a warning is logged and the test returns early.
    Accuracies are computed but never asserted.

    NOTE(review): another function named ``test_black_box_with_model`` is defined earlier in
    this file; if both live in the same module, this definition replaces the earlier one.
    """
    classifier_list = get_tabular_classifier_list(AttributeInferenceBlackBox)
    if not classifier_list:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    target_column = 2  # petal length
    values = [0.0, 1.0, 2.0]

    def bucketize(column):
        # In-place mapping of the attacked feature onto the categories 0.0 / 1.0 / 2.0.
        column[column > 0.5] = 2.0
        column[(column > 0.2) & (column <= 0.5)] = 1.0
        column[column <= 0.2] = 0.0

    def predicted_labels(estimator, samples):
        # Arg-max of every prediction row, returned as an (n, 1) column.
        scores = estimator.predict(samples)
        return np.array([np.argmax(row) for row in scores]).reshape(-1, 1)

    (x_train_iris, _), (x_test_iris, _) = get_iris_dataset

    # Training split: drop the attacked column, bucketize a copy of it, then re-insert it.
    train_without_target = np.delete(x_train_iris, target_column, 1)
    train_target = x_train_iris[:, target_column].copy().reshape(-1, 1)
    bucketize(train_target)
    x_train = np.concatenate(
        (
            train_without_target[:, :target_column],
            train_target,
            train_without_target[:, target_column:],
        ),
        axis=1,
    )

    # Test split: attacked column removed, bucketed ground truth kept separately.
    test_without_target = np.delete(x_test_iris, target_column, 1)
    test_target = x_test_iris[:, target_column].copy().reshape(-1, 1)
    bucketize(test_target)

    # Attack model: one linear layer with cross-entropy loss and Adam. It is built once and
    # the same instance is handed to every attack created in the loop below.
    linear_net = nn.Linear(4, 3)
    criterion = nn.CrossEntropyLoss()
    adam = optim.Adam(linear_net.parameters(), lr=0.01)
    attack_model = PyTorchClassifier(
        model=linear_net,
        clip_values=(0, 1),
        loss=criterion,
        optimizer=adam,
        input_shape=(4,),
        nb_classes=3,
    )

    for classifier in classifier_list:
        if type(classifier).__name__ != "ScikitlearnDecisionTreeClassifier":
            continue

        attack = AttributeInferenceBlackBox(classifier, attack_model=attack_model, attack_feature=target_column)

        # Predictions of the attacked model on the untouched iris data.
        train_predictions = predicted_labels(classifier, x_train_iris)
        test_predictions = predicted_labels(classifier, x_test_iris)

        # Train the attack model, then infer the attacked feature.
        attack.fit(x_train)
        inferred_train = attack.infer(train_without_target, train_predictions, values=values)
        inferred_test = attack.infer(test_without_target, test_predictions, values=values)

        # Accuracies are evaluated but nothing is asserted on them.
        train_acc = np.sum(inferred_train == train_target.reshape(1, -1)) / len(inferred_train)
        test_acc = np.sum(inferred_test == test_target.reshape(1, -1)) / len(inferred_test)
def test_black_box(get_tabular_classifier_list, get_iris_dataset):
    """
    Black-box attribute inference with the default attack model, per listed classifier.

    Only classifiers whose type name is ``ScikitlearnDecisionTreeClassifier`` are attacked.
    When the fixture yields no classifiers, a warning is logged and the test returns early.
    Train and test accuracy of the inferred feature are checked against reference values.
    """
    classifier_list = get_tabular_classifier_list(AttributeInferenceBlackBox)
    if not classifier_list:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    target_column = 2  # petal length
    values = [0.0, 1.0, 2.0]

    def bucketize(column):
        # In-place mapping of the attacked feature onto the categories 0.0 / 1.0 / 2.0.
        column[column > 0.5] = 2.0
        column[(column > 0.2) & (column <= 0.5)] = 1.0
        column[column <= 0.2] = 0.0

    def predicted_labels(estimator, samples):
        # Arg-max of every prediction row, returned as an (n, 1) column.
        scores = estimator.predict(samples)
        return np.array([np.argmax(row) for row in scores]).reshape(-1, 1)

    (x_train_iris, _), (x_test_iris, _) = get_iris_dataset

    # Training split: drop the attacked column, bucketize a copy of it, then re-insert it.
    train_without_target = np.delete(x_train_iris, target_column, 1)
    train_target = x_train_iris[:, target_column].copy().reshape(-1, 1)
    bucketize(train_target)
    x_train = np.concatenate(
        (
            train_without_target[:, :target_column],
            train_target,
            train_without_target[:, target_column:],
        ),
        axis=1,
    )

    # Test split: attacked column removed, bucketed ground truth kept separately.
    test_without_target = np.delete(x_test_iris, target_column, 1)
    test_target = x_test_iris[:, target_column].copy().reshape(-1, 1)
    bucketize(test_target)

    for classifier in classifier_list:
        if type(classifier).__name__ != "ScikitlearnDecisionTreeClassifier":
            continue

        attack = AttributeInferenceBlackBox(classifier, attack_feature=target_column)

        # Predictions of the attacked model on the untouched iris data.
        train_predictions = predicted_labels(classifier, x_train_iris)
        test_predictions = predicted_labels(classifier, x_test_iris)

        # Train the attack model, then infer the attacked feature.
        attack.fit(x_train)
        inferred_train = attack.infer(train_without_target, train_predictions, values=values)
        inferred_test = attack.infer(test_without_target, test_predictions, values=values)

        # Number of matching entries divided by the number of inferred samples.
        train_acc = np.sum(inferred_train == train_target.reshape(1, -1)) / len(inferred_train)
        test_acc = np.sum(inferred_test == test_target.reshape(1, -1)) / len(inferred_test)
        assert train_acc == pytest.approx(0.8285, abs=0.03)
        assert test_acc == pytest.approx(0.8888, abs=0.03)
def test_black_box_baseline(art_warning, decision_tree_estimator, get_iris_dataset):
    """
    Compare the black-box attack with the baseline attack on the test split.

    Both attacks are fitted on the same training data with the bucketed attacked column.
    The test asserts that the black-box test accuracy is at least the baseline test accuracy.
    Inference on the training split is executed for both attacks but its result is discarded.
    """
    try:
        target_column = 2  # petal length
        values = [0.0, 1.0, 2.0]

        def bucketize(column):
            # In-place mapping of the attacked feature onto the categories 0.0 / 1.0 / 2.0.
            column[column > 0.5] = 2.0
            column[(column > 0.2) & (column <= 0.5)] = 1.0
            column[column <= 0.2] = 0.0

        def predicted_labels(estimator, samples):
            # Arg-max of every prediction row, returned as an (n, 1) column.
            scores = estimator.predict(samples)
            return np.array([np.argmax(row) for row in scores]).reshape(-1, 1)

        def match_rate(inferred, truth):
            # Number of matching entries divided by the number of inferred samples.
            return np.sum(inferred == truth.reshape(1, -1)) / len(inferred)

        (x_train_iris, _), (x_test_iris, _) = get_iris_dataset

        # Training split: drop the attacked column, bucketize a copy of it, then re-insert it.
        train_without_target = np.delete(x_train_iris, target_column, 1)
        train_target = x_train_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(train_target)
        x_train = np.concatenate(
            (
                train_without_target[:, :target_column],
                train_target,
                train_without_target[:, target_column:],
            ),
            axis=1,
        )

        # Test split: attacked column removed, bucketed ground truth kept separately.
        test_without_target = np.delete(x_test_iris, target_column, 1)
        test_target = x_test_iris[:, target_column].copy().reshape(-1, 1)
        bucketize(test_target)

        classifier = decision_tree_estimator()
        attack = AttributeInferenceBlackBox(classifier, attack_feature=target_column)

        # Predictions of the attacked model on the untouched iris data.
        train_predictions = predicted_labels(classifier, x_train_iris)
        test_predictions = predicted_labels(classifier, x_test_iris)

        # Black-box attack: fit, run on train (result unused), run on test.
        attack.fit(x_train)
        _ = attack.infer(train_without_target, train_predictions, values=values)
        inferred_test = attack.infer(test_without_target, test_predictions, values=values)
        test_acc = match_rate(inferred_test, test_target)

        # Baseline attack: same procedure, but it receives no model predictions.
        baseline_attack = AttributeInferenceBaseline(attack_feature=target_column)
        baseline_attack.fit(x_train)
        _ = baseline_attack.infer(train_without_target, values=values)
        baseline_inferred_test = baseline_attack.infer(test_without_target, values=values)
        baseline_test_acc = match_rate(baseline_inferred_test, test_target)

        assert test_acc >= baseline_test_acc
    except ARTTestException as e:
        art_warning(e)
def test_errors(art_warning, tabular_dl_estimator_for_attack, get_iris_dataset):
    """
    Check the ``ValueError`` paths of the attack's constructor, ``fit`` and ``infer``.

    Covers invalid ``attack_feature`` values at construction, ``fit`` with ``attack_feature=8``
    on the iris training data, ``fit`` on data with one column removed, and ``infer`` called
    with label arrays in place of predictions.
    """
    try:
        classifier = tabular_dl_estimator_for_attack(AttributeInferenceBlackBox)
        (x_train, y_train), (_, y_test) = get_iris_dataset

        # Rejected directly by the constructor.
        for bad_feature in ("a", -3):
            with pytest.raises(ValueError):
                AttributeInferenceBlackBox(classifier, attack_feature=bad_feature)

        # Index 8 passes construction; the error is expected when fitting.
        out_of_range_attack = AttributeInferenceBlackBox(classifier, attack_feature=8)
        with pytest.raises(ValueError):
            out_of_range_attack.fit(x_train)

        # Default attack: fitting on data with a column removed, and both infer calls, must fail.
        default_attack = AttributeInferenceBlackBox(classifier)
        with pytest.raises(ValueError):
            default_attack.fit(np.delete(x_train, 1, 1))
        with pytest.raises(ValueError):
            default_attack.infer(x_train, y_test)
        with pytest.raises(ValueError):
            default_attack.infer(x_train, y_train)
    except ARTTestException as e:
        art_warning(e)
def test_black_box_one_hot_float(art_warning, get_iris_dataset):
    """
    Black-box attribute inference on a one-hot feature whose columns are scaled floats.

    The attacked iris column is bucketed, one-hot encoded and re-inserted, then the whole
    dataset is standardized with a ``StandardScaler`` fitted on the training data. The attack
    therefore has to recover scaled (non 0/1) one-hot columns; comparisons are made after
    rounding to three decimals.

    The one-hot width is derived once from the training data (``num_columns``) and reused for
    the test encoding and the attacked slice, so all three agree even if the test split
    happens not to contain the highest category.
    """
    try:
        attack_feature = 2  # petal length

        # need to transform attacked feature into categorical
        def transform_feature(x):
            x[x > 0.5] = 2
            x[(x > 0.2) & (x <= 0.5)] = 1
            x[x <= 0.2] = 0

        def to_one_hot(feature, width):
            # One row per sample, a single 1 in the column given by the category index.
            encoded = np.zeros((feature.size, width))
            encoded[np.arange(feature.size), feature.ravel().astype(int)] = 1
            return encoded

        (x_train_iris, y_train_iris), (x_test_iris, _) = get_iris_dataset

        # training data without attacked feature
        x_train_for_attack = np.delete(x_train_iris, attack_feature, 1)
        # only attacked feature
        x_train_feature = x_train_iris[:, attack_feature].copy().reshape(-1, 1)
        transform_feature(x_train_feature)

        # transform to one-hot encoding; this width is shared by train, test and the slice
        num_columns = int(x_train_feature.max()) + 1
        train_one_hot = to_one_hot(x_train_feature, num_columns)

        # training data with attacked feature (after transformation)
        x_train = np.concatenate(
            (x_train_for_attack[:, :attack_feature], train_one_hot, x_train_for_attack[:, attack_feature:]),
            axis=1,
        )
        y_train = np.array([np.argmax(y) for y in y_train_iris]).reshape(-1, 1)

        # test data without attacked feature
        x_test_for_attack = np.delete(x_test_iris, attack_feature, 1)
        # only attacked feature
        x_test_feature = x_test_iris[:, attack_feature].copy().reshape(-1, 1)
        transform_feature(x_test_feature)

        # same width as the training encoding (not the test split's own maximum)
        test_one_hot = to_one_hot(x_test_feature, num_columns)

        # test data with attacked feature (after transformation)
        x_test = np.concatenate(
            (x_test_for_attack[:, :attack_feature], test_one_hot, x_test_for_attack[:, attack_feature:]),
            axis=1,
        )

        # scale before training; the scaler is fitted on the training data only
        scaler = StandardScaler().fit(x_train)
        x_test = scaler.transform(x_test).astype(np.float32)
        x_train = scaler.transform(x_train).astype(np.float32)

        # derive dataset for attack (after scaling); from here on attack_feature is a slice
        attack_feature = slice(attack_feature, attack_feature + num_columns)
        x_train_for_attack = np.delete(x_train, attack_feature, 1)
        x_test_for_attack = np.delete(x_test, attack_feature, 1)
        train_one_hot = x_train[:, attack_feature]
        test_one_hot = x_test[:, attack_feature]

        tree = DecisionTreeClassifier()
        tree.fit(x_train, y_train)
        classifier = ScikitlearnDecisionTreeClassifier(tree)

        attack = AttributeInferenceBlackBox(classifier, attack_feature=attack_feature)

        # get original model's predictions
        x_train_predictions = np.array([np.argmax(arr) for arr in classifier.predict(x_train)]).reshape(-1, 1)
        x_test_predictions = np.array([np.argmax(arr) for arr in classifier.predict(x_test)]).reshape(-1, 1)

        # train attack model
        attack.fit(x_train)

        # infer attacked feature
        # One pair per one-hot column; presumably the standardized encodings of 0 and 1 for
        # that column under the fitted scaler - TODO confirm against the dataset fixture.
        values = [[-0.6324555, 1.5811388], [-0.4395245, 2.2751858], [-1.1108746, 0.9001915]]
        inferred_train = attack.infer(x_train_for_attack, x_train_predictions, values=values)
        inferred_test = attack.infer(x_test_for_attack, x_test_predictions, values=values)

        # check accuracy: rows must match on every column after rounding to 3 decimals
        train_acc = np.sum(
            np.all(np.around(inferred_train, decimals=3) == np.around(train_one_hot, decimals=3), axis=1)
        ) / len(inferred_train)
        test_acc = np.sum(
            np.all(np.around(inferred_test, decimals=3) == np.around(test_one_hot, decimals=3), axis=1)
        ) / len(inferred_test)
        assert pytest.approx(0.9145, abs=0.03) == train_acc
        assert pytest.approx(0.9333, abs=0.03) == test_acc
    except ARTTestException as e:
        art_warning(e)
def test_black_box_regressor(art_warning, get_diabetes_dataset, model_type):
    """
    Black-box attribute inference against a regression model.

    Feature 0 of the diabetes data is binned into integer labels using fixed bin edges, a
    linear regression fitted on the original data is wrapped as the attacked model, and the
    attack is fitted with ``prediction_normal_factor=1 / 250``. Train and test accuracy of the
    inferred feature are checked against reference values.
    """
    try:
        target_column = 0  # age
        bins = [
            -0.96838121,
            -0.77154309,
            -0.57470497,
            -0.37786684,
            -0.18102872,
            0.0158094,
            0.21264752,
            0.40948564,
            0.60632376,
            0.80316188,
            1.0,
        ]
        values = list(range(len(bins) - 1))

        def bin_labels(column):
            # Replace, in place, every value inside bin ``label`` by ``label``.
            # NOTE(review): the masks are evaluated on the partially relabelled array, so the
            # labels 0 and 1 written by the first two bins fall inside the ranges of bins 4
            # and 9 and get overwritten by those later iterations. Confirm whether this is
            # intended; the asserted accuracies below may depend on it.
            for label, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
                column[(column >= lower) & (column <= upper)] = label

        def match_rate(inferred, truth):
            # Number of matching entries divided by the number of inferred samples.
            return np.sum(inferred == truth.reshape(1, -1)) / len(inferred)

        (x_train_diabetes, y_train_diabetes), (x_test_diabetes, _) = get_diabetes_dataset

        # Training split: drop the attacked column, bin a copy of it, then re-insert it.
        train_without_target = np.delete(x_train_diabetes, target_column, 1)
        train_target = x_train_diabetes[:, target_column].copy().reshape(-1, 1)
        bin_labels(train_target)
        x_train = np.concatenate(
            (
                train_without_target[:, :target_column],
                train_target,
                train_without_target[:, target_column:],
            ),
            axis=1,
        )

        # Test split: attacked column removed, binned ground truth kept separately.
        test_without_target = np.delete(x_test_diabetes, target_column, 1)
        test_target = x_test_diabetes[:, target_column].copy().reshape(-1, 1)
        bin_labels(test_target)

        from sklearn import linear_model

        # The attacked model is trained on the original (unbinned) data.
        regr_model = linear_model.LinearRegression()
        regr_model.fit(x_train_diabetes, y_train_diabetes)
        regressor = ScikitlearnRegressor(regr_model)

        attack = AttributeInferenceBlackBox(
            regressor,
            attack_feature=target_column,
            prediction_normal_factor=1 / 250,
            attack_model_type=model_type,
        )

        # Predictions of the attacked model as (n, 1) columns.
        train_predictions = regressor.predict(x_train_diabetes).reshape(-1, 1)
        test_predictions = regressor.predict(x_test_diabetes).reshape(-1, 1)

        # Train the attack model, then infer the attacked feature.
        attack.fit(x_train)
        inferred_train = attack.infer(train_without_target, pred=train_predictions, values=values)
        inferred_test = attack.infer(test_without_target, pred=test_predictions, values=values)

        train_acc = match_rate(inferred_train, train_target)
        test_acc = match_rate(inferred_test, test_target)
        assert pytest.approx(0.0258, abs=0.12) == train_acc
        assert pytest.approx(0.0375, abs=0.12) == test_acc
    except ARTTestException as e:
        art_warning(e)