def validate(X, y, model, k=5, run_all=True):
    """Run k-fold cross-validation and collect per-fold log loss.

    input:
        X: feature matrix, indexable by integer row indices
        y: target variable, indexable by integer row indices
        model: estimator exposing fit / predict_proba
        k: number of folds for k-fold (default: 5)
        run_all (bool): True for running all folds, False for running one fold.
    output:
        list_logloss: list of log loss values (validation) for each fold
        list_logloss_train: list of log loss values (train set) for each fold
    """
    # NOTE(review): original used the legacy pre-0.18 sklearn signature
    # KFold(n, k) and a Python 2 print statement; both modernized here to
    # match the Python 3 prints used elsewhere in this file. Confirm the
    # installed scikit-learn is >= 0.18.
    kf = KFold(n_splits=k).split(X)
    list_logloss = []
    list_logloss_train = []
    if run_all:
        for i, (train_index, val_index) in enumerate(kf):
            print(" validation set", i + 1, "started.")
            X_train = X[train_index]
            y_train = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
            model.fit(X_train, y_train)
            # probability of the positive class, per row
            probs = model.predict_proba(X_val)[:, 1]
            probs_train = model.predict_proba(X_train)[:, 1]
            list_logloss.append(log_loss(y_val, probs))
            list_logloss_train.append(log_loss(y_train, probs_train))
    return list_logloss, list_logloss_train
def validate(X, y, model, k=5, run_all=True):
    """Run k-fold cross-validation and collect per-fold log loss.

    NOTE(review): duplicate definition of validate() also appears earlier in
    this file — confirm which one is intended; the later one wins at import.

    input:
        X: feature matrix, indexable by integer row indices
        y: target variable, indexable by integer row indices
        model: estimator exposing fit / predict_proba
        k: number of folds for k-fold (default: 5)
        run_all (bool): True for running all folds, False for running one fold.
    output:
        list_logloss: list of log loss values (validation) for each fold
        list_logloss_train: list of log loss values (train set) for each fold
    """
    # NOTE(review): original used the legacy pre-0.18 sklearn signature
    # KFold(n, k) and a Python 2 print statement (a syntax error on Py3);
    # both modernized to match the Python 3 prints used elsewhere in the file.
    kf = KFold(n_splits=k).split(X)
    list_logloss = []
    list_logloss_train = []
    if run_all:
        for i, (train_index, val_index) in enumerate(kf):
            print(" validation set", i + 1, "started.")
            X_train = X[train_index]
            y_train = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
            model.fit(X_train, y_train)
            # probability of the positive class, per row
            probs = model.predict_proba(X_val)[:, 1]
            probs_train = model.predict_proba(X_train)[:, 1]
            list_logloss.append(log_loss(y_val, probs))
            list_logloss_train.append(log_loss(y_train, probs_train))
    return list_logloss, list_logloss_train
def yeom_attribute_inference(true_x, true_y, classifier, membership,
                             features, train_loss, test_loss=None):
    """Yeom et al.'s attribute inference attack.

    For each feature index in `features`, swaps the feature between its low
    and high variations, queries the classifier on both versions, and picks
    the value whose per-instance loss is more likely under a Gaussian loss
    model centered at 0 with std `train_loss` (and `test_loss`, when given).
    An instance's membership is predicted correct iff the inferred attribute
    matches the true one (masked to instances deemed members when test_loss
    is available).

    Args:
        true_x: feature matrix; column `feature` is temporarily overwritten
            and restored before the next feature.
        true_y: true labels, passed to log_loss.
        classifier: tf.estimator-style model exposing predict().
        membership: ground-truth membership labels, used only for reporting.
        features: iterable of feature (column) indices to attack.
        train_loss: std-dev parameter of the member loss distribution.
        test_loss: optional std-dev of the non-member loss distribution.

    Returns:
        list of predicted-membership arrays, one per attacked feature.
    """
    print('-' * 10 + 'YEOM\'S ATTRIBUTE INFERENCE' + '-' * 10 + '\n')
    pred_membership_all = []
    for feature in features:
        # Save the column so it can be restored after the attack.
        original_attribute = np.copy(true_x[:, feature])
        low_value, high_value, true_attribute_value = get_attribute_variations(
            true_x, feature)
        # Per-instance loss with the attacked feature forced to its low value.
        true_x[:, feature] = low_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, low_op = get_predictions(predictions)
        low_op = low_op.astype('float32')
        low_op = log_loss(true_y, low_op)
        # Per-instance loss with the feature forced to its high value.
        true_x[:, feature] = high_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, high_op = get_predictions(predictions)
        high_op = high_op.astype('float32')
        high_op = log_loss(true_y, high_op)
        # Empirical prior over the attribute value.
        high_prob = np.sum(true_attribute_value) / len(true_attribute_value)
        low_prob = 1 - high_prob
        # BUGFIX: use `is None` instead of `== None` for the None check.
        if test_loss is None:
            # Only the member loss model is known: pick the value with the
            # larger prior-weighted likelihood under N(0, train_loss).
            pred_attribute_value = np.where(
                low_prob * stats.norm(0, train_loss).pdf(low_op) >=
                high_prob * stats.norm(0, train_loss).pdf(high_op), 0, 1)
            mask = [1] * len(pred_attribute_value)
        else:
            # With both loss models, first decide membership per variant...
            low_mem = np.where(
                stats.norm(0, train_loss).pdf(low_op) >= stats.norm(
                    0, test_loss).pdf(low_op), 1, 0)
            high_mem = np.where(
                stats.norm(0, train_loss).pdf(high_op) >= stats.norm(
                    0, test_loss).pdf(high_op), 1, 0)
            # ...then pick the variant with the larger prior-weighted vote.
            pred_attribute_value = [
                np.argmax([low_prob * a, high_prob * b])
                for a, b in zip(low_mem, high_mem)
            ]
            # Only instances flagged member under at least one variant count.
            mask = [a | b for a, b in zip(low_mem, high_mem)]
        # Member iff masked and the attribute guess matches the truth
        # (x ^ y ^ 1 == NOT XOR, i.e. equality, elementwise).
        pred_membership = mask & (
            pred_attribute_value ^ true_attribute_value ^
            [1] * len(pred_attribute_value))
        prety_print_result(membership, pred_membership)
        pred_membership_all.append(pred_membership)
        # Restore the column before attacking the next feature.
        true_x[:, feature] = original_attribute
    return pred_membership_all
def proposed_membership_inference(v_dataset, true_x, true_y, classifier,
                                  per_instance_loss, args):
    """Proposed membership inference attack.

    Trains an auxiliary ("v") model on `v_dataset`, computes its per-instance
    loss, then measures noise-induced loss-increase counts for both the
    auxiliary model and the supplied target classifier.

    Returns a tuple:
        (true_y, v_true_y, v_membership, v_per_instance_loss,
         v_counts, counts)

    NOTE(review): unlike the run_experiment() call site, this call to
    train_target_model passes neither `args=` nor `clipping_threshold` —
    confirm the signature accepts this.
    """
    print('-' * 10 + 'PROPOSED MEMBERSHIP INFERENCE' + '-' * 10 + '\n')
    aux_train_x, aux_train_y, aux_test_x, aux_test_y = v_dataset
    # Stack train + test of the auxiliary dataset, mirroring the target setup.
    aux_x = np.vstack([aux_train_x, aux_test_x])
    aux_y = np.concatenate([aux_train_y, aux_test_y])
    (aux_pred_y, aux_membership, aux_test_classes,
     aux_classifier, aux_stats) = train_target_model(
        dataset=v_dataset,
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model)
    aux_per_instance_loss = np.array(log_loss(aux_y, aux_pred_y))
    noise_params = (args.attack_noise_type,
                    args.attack_noise_coverage,
                    args.attack_noise_magnitude)
    # Loss-increase counts under noise, for the auxiliary and target models.
    aux_counts = loss_increase_counts(aux_x, aux_y, aux_classifier,
                                      aux_per_instance_loss, noise_params)
    target_counts = loss_increase_counts(true_x, true_y, classifier,
                                         per_instance_loss, noise_params)
    return (true_y, aux_y, aux_membership, aux_per_instance_loss,
            aux_counts, target_counts)
def proposed_attribute_inference(true_x, true_y, classifier, membership,
                                 features, args):
    """Proposed attribute inference attack.

    For each feature in `features`, evaluates the classifier with the feature
    forced to its low and high variations, recording per-instance loss and
    noise loss-increase counts for both variants.

    Args:
        true_x: feature matrix; attacked column is restored after each feature.
        true_y: true labels, passed to log_loss.
        classifier: tf.estimator-style model exposing predict().
        membership: ground-truth membership labels (unused here; kept for
            interface parity with yeom_attribute_inference).
        features: iterable of feature (column) indices to attack.
        args: attack configuration (attack_noise_* fields).

    Returns:
        (true_attribute_value_all, low_per_instance_loss_all,
         high_per_instance_loss_all, low_counts_all, high_counts_all),
        each a list with one entry per attacked feature.
    """
    print('-' * 10 + 'PROPOSED ATTRIBUTE INFERENCE' + '-' * 10 + '\n')
    low_per_instance_loss_all, high_per_instance_loss_all = [], []
    low_counts_all, high_counts_all = [], []
    true_attribute_value_all = []
    # Hoisted out of the loop: noise_params does not depend on the feature.
    noise_params = (args.attack_noise_type, args.attack_noise_coverage,
                    args.attack_noise_magnitude)
    for feature in features:
        # Save the column so it can be restored after the attack.
        original_attribute = np.copy(true_x[:, feature])
        low_value, high_value, true_attribute_value = get_attribute_variations(
            true_x, feature)
        # Loss and noise counts with the feature forced to its low value.
        true_x[:, feature] = low_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, low_op = get_predictions(predictions)
        low_op = low_op.astype('float32')
        low_op = log_loss(true_y, low_op)
        low_counts = loss_increase_counts(true_x, true_y, classifier, low_op,
                                          noise_params)
        # Loss and noise counts with the feature forced to its high value.
        true_x[:, feature] = high_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, high_op = get_predictions(predictions)
        high_op = high_op.astype('float32')
        high_op = log_loss(true_y, high_op)
        high_counts = loss_increase_counts(true_x, true_y, classifier, high_op,
                                           noise_params)
        true_attribute_value_all.append(true_attribute_value)
        low_per_instance_loss_all.append(low_op)
        high_per_instance_loss_all.append(high_op)
        low_counts_all.append(low_counts)
        high_counts_all.append(high_counts)
        # Restore the column before attacking the next feature.
        true_x[:, feature] = original_attribute
    return (true_attribute_value_all, low_per_instance_loss_all,
            high_per_instance_loss_all, low_counts_all, high_counts_all)
def loss_increase_counts(true_x, true_y, classifier, per_instance_loss,
                         noise_params, max_t=100):
    """Count, per instance, how many of `max_t` random perturbations of the
    input raise the model's per-instance loss above the unperturbed loss.

    Args:
        true_x: clean feature matrix.
        true_y: true labels, passed to log_loss.
        classifier: tf.estimator-style model exposing predict().
        per_instance_loss: baseline per-instance loss on the clean inputs.
        noise_params: parameters forwarded to generate_noise.
        max_t: number of noise trials (default 100).

    Returns:
        float array of counts, one per instance, each in [0, max_t].
    """
    counts = np.zeros(len(true_x))
    for _ in range(max_t):
        perturbed_x = true_x + generate_noise(true_x.shape, true_x.dtype,
                                              noise_params)
        input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': perturbed_x}, num_epochs=1, shuffle=False)
        _, noisy_pred_y = get_predictions(
            classifier.predict(input_fn=input_fn))
        noisy_loss = np.array(log_loss(true_y, noisy_pred_y))
        # Boolean comparison adds 1 where the noisy loss exceeds the baseline.
        counts += noisy_loss > per_instance_loss
    return counts
def run_experiment(args):
    """Train the target model and run Yeom's and the proposed membership
    inference attacks, pickling the results under RESULT_PATH.

    Args:
        args: experiment configuration namespace (target_* and attack_*
            fields, save_model, train_dataset, run).

    Side effects:
        Creates RESULT_PATH/<train_dataset>/ if needed and writes a pickle
        whose name encodes the privacy configuration.
    """
    print('-' * 10 + 'TRAIN TARGET' + '-' * 10 + '\n')
    dataset = load_data('target_data.npz', args)
    v_dataset = load_data('shadow0_data.npz', args)
    train_x, train_y, test_x, test_y = dataset
    true_x = np.vstack((train_x, test_x))
    true_y = np.append(train_y, test_y)
    # (removed unused local `batch_size`; the value is passed directly below)
    pred_y, membership, test_classes, classifier, aux = train_target_model(
        args=args,
        dataset=dataset,
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        clipping_threshold=args.target_clipping_threshold,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model)
    train_loss, train_acc, test_loss, test_acc = aux
    per_instance_loss = np.array(log_loss(true_y, pred_y))
    # Yeom's membership inference attack when only train_loss is known
    yeom_mi_outputs_1 = yeom_membership_inference(per_instance_loss,
                                                  membership, train_loss)
    # Yeom's membership inference attack when both train_loss and test_loss
    # are known - Adversary 2 of Yeom et al.
    yeom_mi_outputs_2 = yeom_membership_inference(per_instance_loss,
                                                  membership, train_loss,
                                                  test_loss)
    # Proposed membership inference attacks
    proposed_mi_outputs = proposed_membership_inference(
        v_dataset, true_x, true_y, classifier, per_instance_loss, args)
    evaluate_proposed_membership_inference(per_instance_loss, membership,
                                           proposed_mi_outputs,
                                           fpr_threshold=0.01)
    evaluate_proposed_membership_inference(per_instance_loss, membership,
                                           proposed_mi_outputs,
                                           fpr_threshold=0.01,
                                           per_class_thresh=True)
    result_dir = RESULT_PATH + args.train_dataset
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    # Encode the privacy configuration in the filename (consolidates the
    # two near-identical branches of the original).
    if args.target_privacy == 'no_privacy':
        privacy_tag = str(args.target_l2_ratio)
    else:
        privacy_tag = args.target_dp + '_' + str(args.target_epsilon)
    result_file = (result_dir + '/' + str(args.target_test_train_ratio) + '_'
                   + args.target_model + '_' + args.target_privacy + '_'
                   + privacy_tag + '_' + str(args.run) + '.p')
    # BUGFIX: use a context manager — the original leaked the file handle.
    with open(result_file, 'wb') as f:
        pickle.dump([aux, membership, per_instance_loss, yeom_mi_outputs_1,
                     yeom_mi_outputs_2, proposed_mi_outputs], f)
def run_experiment(args):
    """Train the target model and run Yeom's and Shokri's membership attacks
    plus Yeom's attribute inference attack, pickling the results.

    NOTE(review): this is a second definition of run_experiment in this file;
    the later definition wins at import time — confirm which variant is
    intended.

    Args:
        args: experiment configuration namespace (target_* fields,
            save_model, train_dataset, run).

    Side effects:
        Creates RESULT_PATH/<train_dataset>/ if needed and writes a pickle
        whose name encodes the privacy configuration.
    """
    print('-' * 10 + 'TRAIN TARGET' + '-' * 10 + '\n')
    dataset = load_data('target_data.npz', args)
    train_x, train_y, test_x, test_y = dataset
    true_x = np.vstack((train_x, test_x))
    true_y = np.append(train_y, test_y)
    # (removed unused local `batch_size`; the value is passed directly below)
    pred_y, membership, test_classes, classifier, aux = train_target_model(
        args=args,
        dataset=dataset,
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        clipping_threshold=args.target_clipping_threshold,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model)
    train_loss, train_acc, test_loss, test_acc = aux
    per_instance_loss = np.array(log_loss(true_y, pred_y))
    # Pick 5 random features to attack with attribute inference.
    features = get_random_features(true_x, range(true_x.shape[1]), 5)
    print(features)
    # Yeom's membership inference attack when only train_loss is known
    pred_membership = yeom_membership_inference(per_instance_loss, membership,
                                                train_loss)
    fpr, tpr, thresholds = roc_curve(membership, pred_membership, pos_label=1)
    # Binary predictions yield a 3-point ROC; index 1 is the attack's
    # operating point, so tpr - fpr is the membership advantage.
    yeom_mem_adv = tpr[1] - fpr[1]
    # Shokri's membership inference attack based on shadow model training
    shokri_mem_adv, shokri_mem_confidence = shokri_membership_inference(
        args, pred_y, membership, test_classes)
    # Yeom's attribute inference attack when train_loss is known -
    # Adversary 4 of Yeom et al.
    pred_membership_all = yeom_attribute_inference(true_x, true_y, classifier,
                                                   membership, features,
                                                   train_loss)
    yeom_attr_adv = []
    for pred_membership in pred_membership_all:
        fpr, tpr, thresholds = roc_curve(membership, pred_membership,
                                         pos_label=1)
        yeom_attr_adv.append(tpr[1] - fpr[1])
    result_dir = RESULT_PATH + args.train_dataset
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    results = [
        train_acc, test_acc, train_loss, membership, shokri_mem_adv,
        shokri_mem_confidence, yeom_mem_adv, per_instance_loss,
        yeom_attr_adv, pred_membership_all, features
    ]
    if args.target_privacy == 'no_privacy':
        # BUGFIX(review): original read `args.l2_ratio`; every other use in
        # this file is `args.target_l2_ratio` — confirm against the argparse
        # definition.
        result_file = (result_dir + '/' + args.target_model + '_'
                       + 'no_privacy_' + str(args.target_l2_ratio) + '.p')
    else:
        result_file = (result_dir + '/' + args.target_model + '_'
                       + args.target_privacy + '_' + args.target_dp + '_'
                       + str(args.target_epsilon) + '_' + str(args.run) + '.p')
    # BUGFIX: use a context manager — the original leaked the file handle.
    with open(result_file, 'wb') as f:
        pickle.dump(results, f)
############################ # test log loss print('computing log loss') kf = cross_validation.KFold(ntrain, n_folds=4) _logloss = 0.0 for trainIndex, testIndex in kf: print("TRAIN:", trainIndex, "TEST:", testIndex) X_train, X_test = X[trainIndex], X[testIndex] y_train, y_test = y[trainIndex], y[testIndex] clf.fit(X_train, y_train) pred = clf.predict_proba(X_test) _logloss += utilities.log_loss(pred, y_test) print('log loss = ', _logloss/len(kf)) ############################ clf.fit(X, y) print('training completed') del X del y # Dimensions for train set ntest = 10873 nfeature = 16 ** 2 + 1 # For two_byte_codes, no_que_marks test, Ids = utilities.read_test(ntest, nfeature, ftest)