def main(argv=None):
    """Save every teacher's softmax predictions on the MNIST test set.

    For each teacher ID in ``range(nb_teachers)`` the teacher checkpoint is
    evaluated on the MNIST test data and the predictions are written to
    ``PREDOS/predictions_teacher_<id>.npy``.

    :param argv: not used by this function
    :return: True once all predictions have been saved
    """
    # Only the test inputs are needed here; the labels are ignored.
    test_data, _ = input.ld_mnist(test_only=True)  # DATA_DIR?

    # NOTE: the original also allocated a (nb_teachers, len(test_data),
    # nb_classes) float32 array that was never read or written; it has been
    # dropped because predictions are saved per teacher below.
    for teacher_id in range(nb_teachers):
        # Path of the checkpoint file for the teacher with this ID
        ckpt_path = ("../RESULTS/MNIST_250/TRAIN_DIR/mnist_250_teachers_"
                     + str(teacher_id) + ".ckpt-2999")

        # Predictions of this teacher on the test data
        preds_for_teacher = deep_cnn.softmax_preds(test_data, ckpt_path)

        # This can take a while with many teachers, so report progress
        print("Computed Teacher " + str(teacher_id) + " softmax predictions")

        np.save("PREDOS/predictions_teacher_" + str(teacher_id) + ".npy",
                preds_for_teacher)

    return True
def train_student(dataset, nb_teachers, shift_dataset, inverse_w=None, weight=True):
    """Train a student model on teacher-labelled data and report accuracies.

    The student is trained on ``shift_dataset['data']`` with the labels in
    ``shift_dataset['pred']``. The final checkpoint is then evaluated on the
    private training split of ``dataset`` and on the student data itself.

    :param dataset: dataset name; 'adult', 'mnist' and 'svhn' are supported
    :param nb_teachers: number of teachers; used here only to build the
      checkpoint file name
    :param shift_dataset: mapping with the keys 'data', 'pred' and 'label'
    :param inverse_w: importance weights; passed through unchanged when
      ``FLAGS.cov_shift`` is set, otherwise indexed by each student label.
      Required when ``weight`` is true.
    :param weight: whether training should use the importance weights
    :return: tuple ``(precision_t, precision_s)``: accuracy of the trained
      checkpoint on the private data and on the student data
    :raises ValueError: if ``dataset`` has no private-data loader
    :raises AssertionError: if the training directory cannot be created or
      weighted training reports failure
    """
    # Fail before the training run rather than after it: only these datasets
    # can be evaluated at the end of this function.
    if dataset not in ('adult', 'mnist', 'svhn'):
        raise ValueError(
            'unsupported dataset for student training: {!r}'.format(dataset))

    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError(
            'could not create train_dir: {}'.format(FLAGS.train_dir))

    stdnt_data = shift_dataset['data']
    stdnt_labels = shift_dataset['pred']
    print('number for deep is {}'.format(len(stdnt_labels)))

    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + suffix

    if FLAGS.cov_shift:
        # Covariate shift: the caller already supplies the weights to use.
        weights = inverse_w
    else:
        print('len of shift data {}'.format(len(shift_dataset['data'])))
        weights = np.zeros(len(stdnt_data))
        print('len of weight={} len of labels= {} '.format(len(weights), len(stdnt_labels)))
        if weight:
            # Look up the weight of each example by its label. Skipped for
            # unweighted training so the default inverse_w=None is usable.
            for i in range(len(weights)):
                weights[i] = np.float32(inverse_w[stdnt_labels[i]])

    if weight:
        if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path, weights=weights):
            raise AssertionError('student training failed')
    else:
        deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

    # Final checkpoint name for the student (last training step)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    if dataset == 'adult':
        private_data, private_labels = input.ld_adult(test_only=False, train_only=True)
    elif dataset == 'mnist':
        private_data, private_labels = input.ld_mnist(test_only=False, train_only=True)
    else:  # 'svhn', guaranteed by the check at the top
        private_data, private_labels = input.ld_svhn(test_only=False, train_only=True)

    # Predictions of the trained student on the private and student data
    teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
    student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)

    precision_t = metrics.accuracy(teacher_preds, private_labels)
    precision_s = metrics.accuracy(student_preds, stdnt_labels)
    precision_true = metrics.accuracy(student_preds, shift_dataset['label'])
    print('Precision of teacher after training:{} student={} true precision for student {}'.format(
        precision_t, precision_s, precision_true))

    return precision_t, precision_s
def train_teacher(FLAGS, dataset, nb_teachers, teacher_id):
    """Train one teacher of the ensemble and print its test accuracy.

    :param FLAGS: flag container; this function reads ``data_dir``,
      ``train_dir``, ``cov_shift``, ``data``, ``dataset``, ``deeper`` and
      ``max_steps``
    :param dataset: string corresponding to dataset (svhn, cifar10, mnist)
    :param nb_teachers: total number of teachers in the ensemble
    :param teacher_id: id of the teacher being trained
    :return: True if everything went well, False for an unknown dataset
    :raises AssertionError: if a working directory cannot be created or the
      training call reports failure
    """
    # Explicit checks instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip these side-effecting calls.
    if not input.create_dir_if_needed(FLAGS.data_dir):
        raise AssertionError('could not create data_dir: {}'.format(FLAGS.data_dir))
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Load the dataset
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10()
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist()
    else:
        print("Check value of dataset flag")
        return False

    if FLAGS.cov_shift:
        # Replace the inputs with the pickled PCA-transformed versions.
        # NOTE(review): the file names use FLAGS.dataset (not the `dataset`
        # argument) and put no separator after FLAGS.data - confirm both.
        teacher_file_name = FLAGS.data + 'PCA_teacher' + FLAGS.dataset + '.pkl'
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        with open(teacher_file_name, 'rb') as f:
            train_data = pickle.load(f)
        with open(student_file_name, 'rb') as f:
            test_data = pickle.load(f)

    # Retrieve subset of data for this teacher
    data, labels = input.partition_dataset(train_data, train_labels,
                                           nb_teachers, teacher_id)
    print("Length of training data: " + str(len(labels)))

    # Define teacher checkpoint filename and full path
    if FLAGS.deeper:
        filename = str(nb_teachers) + 'pca_teachers_' + str(teacher_id) + '_deep.ckpt'
    else:
        filename = str(nb_teachers) + 'pca_teachers_' + str(teacher_id) + '.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

    # Perform teacher training
    if not deep_cnn.train(data, labels, ckpt_path):
        raise AssertionError('teacher training failed')

    # Append final step value to checkpoint for evaluation
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Retrieve teacher probability estimates on the test data
    teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

    # Compute teacher accuracy
    precision = metrics.accuracy(teacher_preds, test_labels)
    print('Precision of teacher after training: ' + str(precision))

    return True
def pca_transform(dataset, FLAGS):
    """Resample the test split along its first principal component and pickle it.

    The test inputs are projected onto their first principal component, each
    example is given a sampling probability from a normal density over that
    projection, and the test set is resampled with replacement accordingly.
    The (unchanged) training inputs are pickled to
    ``<FLAGS.data>/PCA_teacher<dataset>.pkl`` and the resampled test inputs to
    ``<FLAGS.data>/PCA_student<dataset>.pkl``.

    :param dataset: string corresponding to dataset (svhn, cifar10, mnist)
    :param FLAGS: flag container; only ``FLAGS.data`` is read
    :return: False for an unknown dataset, otherwise None (results go to disk)
    """
    teacher_file_name = FLAGS.data + '/PCA_teacher' + dataset + '.pkl'
    student_file_name = FLAGS.data + '/PCA_student' + dataset + '.pkl'
    test_only = False
    train_only = False

    # Load the dataset
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(
            test_only, train_only)
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10(
            test_only, train_only)
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist(
            test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    ori_test = test_data.shape
    # Flatten each example. The per-example size is derived from the data
    # instead of the former hard-coded 784, which only fits 28x28x1 inputs.
    dim = int(np.prod(ori_test[1:]))
    flat_test = test_data.reshape((-1, dim))

    pca = PCA(n_components=1)
    pca.fit(flat_test)
    max_component = pca.components_.T
    projection = np.ravel(np.dot(flat_test, max_component))

    min_v = projection.min()
    mean_v = np.mean(projection)
    # a and b control where the density is centred and how wide it is.
    a = 1
    b = 1
    mu = min_v + (mean_v - min_v) / a
    # Second argument of scipy.stats.norm is the scale (standard deviation).
    scale = (mean_v - min_v) / b
    prob = scipy.stats.norm(mu, scale).pdf(projection)

    # Sample only among examples with non-zero density. The probabilities
    # must be restricted to the same examples: np.random.choice requires `p`
    # to have exactly one entry per candidate.
    index = np.flatnonzero(prob > 0)
    candidate_prob = prob[index]
    sample = np.random.choice(index, len(index), replace=True,
                              p=candidate_prob / candidate_prob.sum())

    # Restore the per-example shape; the leading dimension follows the number
    # of sampled examples.
    test_data = flat_test[sample].reshape((-1,) + tuple(ori_test[1:]))

    # Context managers make sure the pickles are flushed and closed.
    with open(teacher_file_name, 'wb') as f:
        pickle.dump(train_data, f)
    with open(student_file_name, 'wb') as f:
        pickle.dump(test_data, f)
    print('finish pca transform')
def load_data(dataset):
    """Load the train and test splits of the named dataset.

    :param dataset: string corresponding to dataset (svhn, cifar10, mnist)
    :return: ``(train_data, train_labels, test_data, test_labels)``, or False
      when the dataset name is not recognised
    """
    # Reject unknown names first so the loaders below need no fallback.
    if dataset not in ('svhn', 'cifar10', 'mnist'):
        print("Check value of dataset flag")
        return False

    if dataset == 'svhn':
        splits = input.ld_svhn(extended=True)
    elif dataset == 'cifar10':
        splits = input.ld_cifar10()
    else:
        splits = input.ld_mnist()

    # Unpack to keep the four-element contract of the loaders explicit.
    train_data, train_labels, test_data, test_labels = splits
    return train_data, train_labels, test_data, test_labels
def train_teacher(dataset, nb_teachers, teacher_id):
    """Train one teacher of the ensemble and print its test accuracy.

    :param dataset: string corresponding to dataset (svhn, cifar10, mnist)
    :param nb_teachers: total number of teachers in the ensemble
    :param teacher_id: id of the teacher being trained
    :return: True if everything went well, False for an unknown dataset
    :raises AssertionError: if a working directory cannot be created or the
      training call reports failure
    """
    # Explicit checks instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip these side-effecting calls.
    if not input.create_dir_if_needed(FLAGS.data_dir):
        raise AssertionError('could not create data_dir: {}'.format(FLAGS.data_dir))
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Load the dataset
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10()
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist()
    else:
        print("Check value of dataset flag")
        return False

    # Retrieve subset of data for this teacher
    data, labels = input.partition_dataset(train_data, train_labels,
                                           nb_teachers, teacher_id)
    print("Length of training data: " + str(len(labels)))

    # Define teacher checkpoint filename and full path
    if FLAGS.deeper:
        filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '_deep.ckpt'
    else:
        filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

    # Perform teacher training
    if not deep_cnn.train(data, labels, ckpt_path):
        raise AssertionError('teacher training failed')

    # Append final step value to checkpoint for evaluation
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Retrieve teacher probability estimates on the test data
    teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

    # Compute teacher accuracy
    precision = metrics.accuracy(teacher_preds, test_labels)
    print('Precision of teacher after training: ' + str(precision))

    return True
def load_dataset(dataset, test_only=False, train_only=False):
    """Load one split of the named dataset as a ``(data, labels)`` pair.

    :param dataset: one of 'svhn', 'cifar10', 'mnist' or 'adult'
    :param test_only: forwarded to the loader as ``test_only``
    :param train_only: accepted for interface compatibility but not forwarded
      to the loaders (NOTE(review): confirm whether it should be)
    :return: tuple ``(data, labels)`` as returned by the loader
    :raises ValueError: if ``dataset`` is not a known name; previously this
      path printed a hint and then failed on unbound local variables
    """
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=test_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=test_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=test_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only=test_only)
    else:
        raise ValueError(
            'Check value of dataset flag: unknown dataset {!r}'.format(dataset))
    return test_data, test_labels
def predict_teacher(dataset, nb_teachers):
    """Return the aggregated teacher labels for the training split.

    The aggregated labels are cached in a ``.npy`` file under
    ``FLAGS.data_dir``. When that file exists it is loaded and returned
    without recomputing the ensemble predictions.

    :param dataset: string corresponding to mnist, cifar10, svhn or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: tuple ``(pred_labels, labels)`` where ``labels`` are the labels
      returned by the dataset loader, or False for an unknown dataset
    :raises AssertionError: if the training directory cannot be created
    """
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    train_only = True
    test_only = False

    # Path of the cached aggregated teacher labels
    filepath = (FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)
                + '_teacher_clean_votes_label_shift' + str(FLAGS.lap_scale) + '.npy')

    # Load the dataset (the variables keep their historical "test" names even
    # though train_only=True is requested).
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    # Reuse cached labels when available
    if os.path.exists(filepath):
        pred_labels = np.load(filepath)
        return pred_labels, test_labels

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate teacher predictions.
    # NOTE(review): this passes FLAGS.nb_teachers rather than the nb_teachers
    # argument - confirm the two always agree.
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)
    utils.save_file(filepath, pred_labels)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " + str(ac_ag_labels))
    return pred_labels, test_labels
def prepare_student_data(dataset, nb_teachers, shift_idx, nb_q=None):
    """Select student training data from cached clean teacher votes.

    Loads the test split of ``dataset`` and the clean vote file
    ``<FLAGS.data_dir>/<dataset>_<nb_teachers>_student_clean_test.npy``, then
    calls ``gaussian`` on the votes to obtain the kept indices and their labels.

    :param dataset: string corresponding to mnist, cifar10, svhn or adult
    :param nb_teachers: number of teachers; used to locate the vote file
    :param shift_idx: candidate indices handed to ``gaussian``
    :param nb_q: if not None, number of entries drawn from ``shift_idx`` with
      ``np.random.choice`` before aggregation
    :return: tuple ``(keep_idx, data, labels)``; False for an unknown dataset;
      None when the clean vote file does not exist
    """
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only=True)
    else:
        print("Check value of dataset flag")
        return False

    # Identity comparison: `!= None` would be evaluated element-wise if an
    # array were ever passed.
    if nb_q is not None:
        shift_idx = np.random.choice(shift_idx, nb_q)

    # Path of the numpy dump of clean votes, e.g. svhn_250_student_clean_test.npy
    filepath = (FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)
                + '_student_clean_test.npy')

    if not os.path.exists(filepath):
        print('not find file for clean student vote')
        return None

    with open(filepath, 'rb') as f:
        clean_votes = np.load(f)

    keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes, shift_idx)
    precision_true = metrics.accuracy(result, test_labels[keep_idx])
    print('number of idx={} precision_true from gaussian for shift data={}'.format(
        len(keep_idx[0]), precision_true))
    return keep_idx, test_data[keep_idx], result
def predict_data(dataset, nb_teachers, teacher=False):
    """Label one split of the dataset with the aggregated teacher ensemble.

    Used for obtaining the student/teacher weights; the aggregation is called
    with a final argument of 0.

    :param dataset: string corresponding to mnist, cifar10, svhn or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param teacher: if true, predict on the training split, otherwise on the
      test split
    :return: tuple ``(data, pred_labels, labels)``, or False for an unknown
      dataset
    :raises AssertionError: if the training directory cannot be created
    """
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Exactly one of the two split selectors is set.
    train_only = bool(teacher)
    test_only = not train_only

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate teacher predictions.
    # NOTE(review): this passes FLAGS.nb_teachers rather than the nb_teachers
    # argument - confirm the two always agree.
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " + str(ac_ag_labels))
    return test_data, pred_labels, test_labels
def prepare_student_data(dataset, nb_teachers, save=False):
    """Prepare student training and test data from the teacher ensemble.

    The first ``FLAGS.stdnt_share`` test examples are labelled by the noisy
    aggregation of the teachers' predictions and become the student training
    set; the remaining test examples become the student test set.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted
      by the ensemble of teachers (with Laplacian noise) as npy files. It also
      dumps the clean votes for each class (without noise) and the labels
      assigned by teachers
    :return: ``(stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels)``,
      or False for an unknown dataset
    :raises AssertionError: if the training directory cannot be created or
      ``FLAGS.stdnt_share`` leaves no data for the student test set
    """
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data leftover to be used as a test set
    if FLAGS.stdnt_share >= len(test_data):
        raise AssertionError(
            'stdnt_share ({}) must be smaller than the test set size ({})'.format(
                FLAGS.stdnt_share, len(test_data)))

    # Prepare [unlabeled] student training data (subset of test set)
    stdnt_data = test_data[:FLAGS.stdnt_share]

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    # Common prefix of every dump file written below
    dump_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)
    dump_suffix = str(FLAGS.lap_scale) + '.npy'

    # Aggregate teacher predictions to get student training labels
    if not save:
        stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)
    else:
        # Request clean votes and clean labels as well
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            teachers_preds, FLAGS.lap_scale, return_clean_votes=True)

        # Dump clean_votes array
        filepath = dump_prefix + '_student_clean_votes_lap_' + dump_suffix
        with gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array
        filepath_labels = dump_prefix + '_teachers_labels_lap_' + dump_suffix
        with gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # Store unused part of test set for use as a test set after student training
    stdnt_test_data = test_data[FLAGS.stdnt_share:]
    stdnt_test_labels = test_labels[FLAGS.stdnt_share:]

    if save:
        # Dump the student labels produced by noisy aggregation
        filepath = dump_prefix + '_student_labels_lap_' + dump_suffix
        with gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
from __future__ import absolute_import from __future__ import division from __future__ import print_function import input import tensorflow as tf import numpy as np FLAGS = tf.flags.FLAGS tf.flags.DEFINE_string('data_dir', 'data_dir', 'Temporary storage') test_data, test_labels = input.ld_svhn(test_only=True) test_data_1000 = test_data[:1000, :, :, :] np.save(FLAGS.data_dir + '/svhn_test_1000', test_data_1000) test_data, test_labels = input.ld_cifar10(test_only=True) test_data_1000 = test_data[:1000, :, :, :] np.save(FLAGS.data_dir + '/cifar10_test_1000', test_data_1000) test_data, test_labels = input.ld_mnist(test_only=True) test_data_1000 = test_data[:1000, :, :, :] np.save(FLAGS.data_dir + '/mnist_test_1000', test_data_1000)
def train_teacher(dataset, nb_teachers, teacher_id):
    """Train one teacher and print label-disturbance statistics and accuracy.

    Besides training, this variant calls ``disturb`` on the training and test
    labels and prints K_S test statistics comparing the labels before and
    after the disturbance.

    :param dataset: string corresponding to dataset (svhn, cifar10, mnist)
    :param nb_teachers: total number of teachers in the ensemble
    :param teacher_id: id of the teacher being trained
    :return: True if everything went well, False for an unknown dataset
    :raises AssertionError: if a working directory cannot be created
    """
    import copy
    import operator
    from pca import K_S

    # Explicit checks instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip these side-effecting calls.
    if not input.create_dir_if_needed(FLAGS.data_dir):
        raise AssertionError('could not create data_dir: {}'.format(FLAGS.data_dir))
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))
    print("teacher {}:".format(teacher_id))

    # Load the dataset
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(
            extended=True)
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10()
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist()
    else:
        print("Check value of dataset flag")
        return False

    # Disturb the labels, keeping a copy of the training labels from before.
    # NOTE(review): the result of disturb(test_labels, 0.1) is discarded, so
    # it only has an effect if disturb modifies its argument in place; the
    # same question decides whether `train_labels` below is disturbed - confirm.
    train_labels1 = copy.copy(train_labels)
    train_labels2 = disturb(train_labels, 0.1)
    disturb(test_labels, 0.1)

    # Retrieve subset of data for this teacher
    data, labels = input.partition_dataset(train_data, train_labels,
                                           nb_teachers, teacher_id)

    # Compare the label distribution before and after the disturbance
    print(operator.eq(train_labels1, train_labels2))
    print("干扰前: ", K_S.tst_norm(train_labels1))
    print("干扰后: ", K_S.tst_norm(train_labels2))
    print(K_S.tst_samp(train_labels1, train_labels2))

    print("Length of training data: " + str(len(labels)))

    # Define teacher checkpoint filename and full path
    if FLAGS.deeper:
        filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '_deep.ckpt'
    else:
        filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + '.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

    # Perform teacher training; this variant keeps the returned losses
    losses = deep_cnn.train(data, labels, ckpt_path)

    # Append final step value to checkpoint for evaluation
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Retrieve teacher probability estimates on the test data
    teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

    # Compute teacher accuracy
    precision = metrics.accuracy(teacher_preds, test_labels)
    print('Precision of teacher after training: ' + str(precision))
    print("each n step loss: ", losses)

    return True
def prepare_student_data(dataset, nb_teachers, save=False):
    """Prepare student training and test data from the teacher ensemble.

    The first ``FLAGS.stdnt_share`` test examples of ``dataset`` are labelled
    by the noisy aggregation of the teachers' predictions. When
    ``FLAGS.d_stu > -1`` those examples are first cropped to rows/columns
    2..29 and reduced to a single channel. The student test set is taken from
    the MNIST test split.

    :param dataset: string corresponding to mnist, cifar10, svhn or digit
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted
      by the ensemble of teachers (with Laplacian noise) as npy files. It also
      dumps the clean votes for each class (without noise) and the labels
      assigned by teachers
    :return: ``(stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels)``,
      or False for an unknown dataset
    :raises AssertionError: if the training directory cannot be created, if
      ``FLAGS.stdnt_share`` is not smaller than the test set, or if
      ``FLAGS.dataset_teacher`` is not 'mnist'
    """
    # Explicit checks instead of `assert` throughout: assert statements are
    # removed under `python -O`, which would silently skip them.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'digit':
        test_data, test_labels = input.ld_digit_test(test_name=FLAGS.test_name,
                                                     num=2000)
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data leftover to be used as a test set
    if FLAGS.stdnt_share >= len(test_data):
        raise AssertionError(
            'stdnt_share ({}) must be smaller than the test set size ({})'.format(
                FLAGS.stdnt_share, len(test_data)))

    # Prepare [unlabeled] student training data (subset of test set)
    if FLAGS.d_stu > -1:
        # Keep rows and columns 2..29 (a 28x28 window) of every image
        trimmed = test_data[:FLAGS.stdnt_share, 2:30, 2:30, :]
        if FLAGS.d_stu == 3:
            # Grey scale: weighted sum of the first three channels
            stdnt_data = (0.2125 * trimmed[:, :, :, 0]
                          + 0.7154 * trimmed[:, :, :, 1]
                          + 0.0721 * trimmed[:, :, :, 2])
        else:
            # Keep only the channel selected by d_stu
            stdnt_data = trimmed[:, :, :, FLAGS.d_stu]
        stdnt_data = stdnt_data.reshape((-1, 28, 28, 1))
    else:
        stdnt_data = test_data[:FLAGS.stdnt_share]

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    # Common prefix and suffix of every dump file written below
    dump_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)
    dump_suffix = str(FLAGS.lap_scale) + '.npy'

    # Aggregate teacher predictions to get student training labels
    if not save:
        stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)
    else:
        # Request clean votes and clean labels as well
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            teachers_preds, FLAGS.lap_scale, return_clean_votes=True)

        # Dump clean_votes array
        filepath = dump_prefix + '_student_clean_votes_lap_' + dump_suffix
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array
        filepath_labels = dump_prefix + '_teachers_labels_lap_' + dump_suffix
        with tf.gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # The student test set comes from the teachers' dataset; only MNIST is
    # implemented. Raised explicitly so the guard survives `python -O`.
    if FLAGS.dataset_teacher != 'mnist':
        raise AssertionError(
            "Non implemented error: dataset_teacher not equals to mnist")
    test_data, test_labels = input.ld_mnist(test_only=True)

    # Store unused part of test set for use as a test set after student training
    stdnt_test_data = test_data[FLAGS.stdnt_share:]
    stdnt_test_labels = test_labels[FLAGS.stdnt_share:]

    if save:
        # Dump the student labels produced by noisy aggregation
        filepath = dump_prefix + '_student_labels_lap_' + dump_suffix
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
def train_student(dataset, nb_teachers, weight=True, inverse_w=None, shift_dataset=None):
    """Train a student on teacher-labelled data and report accuracies.

    :param dataset: dataset name; 'adult', 'mnist' and 'svhn' are supported
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param weight: whether training should use the importance weights
    :param inverse_w: importance weights; passed through unchanged when
      ``FLAGS.cov_shift`` is set, otherwise indexed by each student label.
      Required when ``weight`` is true.
    :param shift_dataset: optional shifted dataset forwarded to
      ``prepare_student_data`` as ``shift_data``
    :return: tuple ``(n, precision_t, precision_s)`` where ``n`` is the number
      of labels the student was scored against
    :raises ValueError: if ``dataset`` has no private-data loader
    :raises AssertionError: if the training directory cannot be created or
      weighted training reports failure
    """
    # Fail before the training run rather than after it: only these datasets
    # can be evaluated at the end of this function.
    if dataset not in ('adult', 'mnist', 'svhn'):
        raise ValueError(
            'unsupported dataset for student training: {!r}'.format(dataset))

    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))

    # Call helper function to prepare student data using teacher predictions
    if shift_dataset is not None:
        stdnt_data, stdnt_labels = prepare_student_data(
            dataset, nb_teachers, save=True, shift_data=shift_dataset)
    elif FLAGS.PATE2:
        keep_idx, stdnt_data, stdnt_labels = prepare_student_data(
            dataset, nb_teachers, save=True)
    else:
        stdnt_data, stdnt_labels = prepare_student_data(dataset, nb_teachers,
                                                        save=True)

    # NOTE(review): the original indentation of this shuffle was lost; it is
    # applied to every branch here. Per-example weights and the pickled labels
    # used below are NOT permuted - confirm this is intended.
    rng = np.random.RandomState(FLAGS.dataset_seed)
    rand_ix = rng.permutation(len(stdnt_labels))
    stdnt_data = stdnt_data[rand_ix]
    stdnt_labels = stdnt_labels[rand_ix]
    print('number for deep is {}'.format(len(stdnt_labels)))

    # Prepare checkpoint filename and path
    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + suffix

    if FLAGS.cov_shift:
        # Covariate shift: the caller already supplies the weights to use.
        weights = inverse_w
    else:
        if shift_dataset is not None:
            print('len of shift data {}'.format(len(shift_dataset['data'])))
        weights = np.zeros(len(stdnt_data))
        print('len of weight={} len of labels= {} '.format(
            len(weights), len(stdnt_labels)))
        if weight:
            # Look up the weight of each example by its label. Skipped for
            # unweighted training so the default inverse_w=None is usable.
            for i in range(len(weights)):
                weights[i] = np.float32(inverse_w[stdnt_labels[i]])

    # Start student training
    if weight:
        # NOTE(review): keep_idx only exists when shift_dataset is None and
        # FLAGS.PATE2 is set; PATE2 together with shift_dataset still fails
        # here with a NameError, as in the original.
        train_weights = weights[keep_idx] if FLAGS.PATE2 else weights
        if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path,
                              weights=train_weights):
            raise AssertionError('student training failed')
    else:
        deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

    # Compute final checkpoint name for student (with max number of steps)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    if dataset == 'adult':
        private_data, private_labels = input.ld_adult(test_only=False,
                                                      train_only=True)
    elif dataset == 'mnist':
        private_data, private_labels = input.ld_mnist(test_only=False,
                                                      train_only=True)
    else:  # 'svhn', guaranteed by the check at the top
        private_data, private_labels = input.ld_svhn(test_only=False,
                                                     train_only=True)

    # Predictions of the trained student on the private and student data
    teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
    student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)

    precision_t = metrics.accuracy(teacher_preds, private_labels)
    precision_s = metrics.accuracy(student_preds, stdnt_labels)

    if FLAGS.cov_shift:
        # Ground-truth labels come from the pickled student dataset.
        # NOTE: pickle.load executes arbitrary code; only load trusted files.
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        with open(student_file_name, 'rb') as f:
            test = pickle.load(f)
        if FLAGS.PATE2:
            test_labels = test['label'][keep_idx]
        else:
            test_labels = test['label']
        precision_true = metrics.accuracy(student_preds, test_labels)
        print(
            'Precision of teacher after training:{} student={} true precision for student {}'
            .format(precision_t, precision_s, precision_true))
        return len(test_labels), precision_t, precision_s

    # Without the covariate-shift pickle no ground-truth labels are loaded,
    # so only the two accuracies computed above can be reported.
    print('Precision of teacher after training:{} student={}'.format(
        precision_t, precision_s))
    return len(stdnt_labels), precision_t, precision_s
def train_student(dataset, nb_teachers, knock, weight=True, inverse_w=None, shift_dataset=None):
    """Train a student on a shifted dataset and print its accuracies.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param knock: selects which entry of ``shift_dataset`` is reported as the
      shift ratio: 'shift_ratio' when true, 'alpha' otherwise
    :param weight: whether training should use the importance weights
    :param inverse_w: importance weights indexed by student label; required
      when ``weight`` is true
    :param shift_dataset: mapping describing the shifted data; must contain
      'data' and either 'shift_ratio' or 'alpha'. Despite the default, it is
      required.
    :return: True if student training went well
    :raises ValueError: if ``shift_dataset`` is None
    :raises AssertionError: if the training directory cannot be created or
      weighted training reports failure
    """
    # shift_dataset is subscripted unconditionally below, so reject the None
    # default up front with a clear message.
    if shift_dataset is None:
        raise ValueError('shift_dataset is required')

    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would silently skip creating the directory.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('could not create train_dir: {}'.format(FLAGS.train_dir))
    print('len of shift data {}'.format(len(shift_dataset['data'])))

    # Call helper function to prepare student data using teacher predictions;
    # stdnt_labels are already the ensemble's noisy labels.
    stdnt_data, stdnt_labels = prepare_student_data(dataset, nb_teachers,
                                                    save=True,
                                                    shift_data=shift_dataset)

    # Prepare checkpoint filename and path
    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + suffix

    weights = np.zeros(len(stdnt_data))
    print('len of weight={} len of labels= {} '.format(len(weights), len(stdnt_labels)))

    # Start student training
    if weight:
        # Look up the weight of each example by its label. Only done for
        # weighted training so the default inverse_w=None is usable otherwise.
        for i in range(len(weights)):
            weights[i] = np.float32(inverse_w[stdnt_labels[i]])
        if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path, weights=weights):
            raise AssertionError('student training failed')
    else:
        deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

    # Compute final checkpoint name for student (with max number of steps)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # The private data is always the MNIST training split in this variant.
    private_data, private_labels = input.ld_mnist(test_only=False, train_only=True)

    # Predictions of the trained student on the private and student data
    teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
    student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)

    precision_t = metrics.accuracy(teacher_preds, private_labels)
    precision_s = metrics.accuracy(student_preds, stdnt_labels)

    shift_ratio = shift_dataset['shift_ratio'] if knock else shift_dataset['alpha']
    print(
        'weight is {} shift_ratio={} Precision of teacher after training:{} student={}'
        .format(weight, shift_ratio, precision_t, precision_s))

    return True
def main(_):
    """Load the dataset named by FLAGS.dataset_name, shuffle it, carve a
    validation split off the front of the training data, and write every
    split out through dataset_utils.convert_to.

    The single positional argument is ignored.

    :raises ValueError: if FLAGS.dataset_name is not a known dataset
    """
    rng = np.random.RandomState(FLAGS.seed)

    # All four counts are read up front, including "extra".
    counts = COUNTS[FLAGS.dataset_name]
    train_count = counts["train"]
    validation_count = counts["valid"]
    test_count = counts["test"]
    extra_count = counts["extra"]

    # Only svhn provides an extra set; every other loader returns two splits.
    extra_set = None
    if FLAGS.dataset_name == "svhn":
        train_set, test_set, extra_set = _load_svhn()
    else:
        two_split_loaders = {
            "cifar10": lambda: _load_cifar10(normalize=True),
            "cifar_unnormalized": lambda: _load_cifar10(normalize=False),
            "imagenet_32": lambda: _load_imagenet_32(),
            'mnist': lambda: input.ld_mnist(),
        }
        if FLAGS.dataset_name not in two_split_loaders:
            raise ValueError("Unknown dataset", FLAGS.dataset_name)
        train_set, test_set = two_split_loaders[FLAGS.dataset_name]()

    def shuffle_in_place(split):
        # Apply one shared random permutation to images and labels.
        order = rng.permutation(split["images"].shape[0])
        split["images"] = split["images"][order]
        split["labels"] = split["labels"][order]

    # Shuffle the training data first, then the extra set when there is one.
    shuffle_in_place(train_set)
    if extra_set is not None:
        shuffle_in_place(extra_set)

    # The first validation_count shuffled training examples become the
    # validation split; the remainder stays as training data.
    validation_set = {
        "images": train_set["images"][:validation_count],
        "labels": train_set["labels"][:validation_count],
    }
    train_set = {
        "images": train_set["images"][validation_count:],
        "labels": train_set["labels"][validation_count:],
    }

    # Convert to Examples and write the result to TFRecords.
    outputs = [
        (train_set, train_count - validation_count, "train"),
        (test_set, test_count, "test"),
        (validation_set, validation_count, "validation"),
    ]
    if extra_set is not None:
        outputs.append((extra_set, extra_count, "extra"))

    for split, count, split_name in outputs:
        dataset_utils.convert_to(
            split["images"],
            split["labels"],
            count,
            split_name,
            FLAGS.directory,
            FLAGS.dataset_name,
        )
def prepare_student_data(dataset, nb_teachers, save=False):
    """
    Takes a dataset name and the size of the teacher ensemble and prepares
    training data for the student model, according to parameters indicated
    in flags above.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from;
                        only used here to build the dump file names (the
                        number of neighbours comes from FLAGS.nb_teachers)
    :param save: if set to True, will dump student training labels predicted by
                 the ensemble of teachers (with noise) as npy files.
                 It also dumps the clean votes for each class (without noise) and
                 the labels assigned by teachers
    :return: (student data, student labels, student test data, student test
             labels), or False when the dataset name is not recognised
    """
    # Keep the side effect outside of ``assert``: assert statements are
    # removed under ``python -O`` and the directory would never be created.
    train_dir_ready = input.create_dir_if_needed(FLAGS.train_dir)
    assert train_dir_ready

    # Load the dataset and flatten every image into one feature vector
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
        train_data = np.reshape(train_data, [-1, 32 * 32 * 3])
        test_data = test_data.reshape([-1, 32 * 32 * 3])
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10()
        train_data = np.reshape(train_data, [-1, 32 * 32 * 3])
        test_data = test_data.reshape([-1, 32 * 32 * 3])
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist()
        train_data = np.reshape(train_data, [-1, 28 * 28])
        test_data = test_data.reshape([-1, 28 * 28])
    else:
        print("Check value of dataset flag")
        return False

    # If FLAGS.extra > 0, the first FLAGS.extra points are moved from the
    # private (train) dataset to the student (test) dataset.
    if FLAGS.extra > 0:
        test_data = np.vstack((test_data, train_data[:FLAGS.extra]))
        test_labels = np.concatenate((test_labels, train_labels[:FLAGS.extra]))
        train_data = train_data[FLAGS.extra:]
        train_labels = train_labels[FLAGS.extra:]

    # ori_test_data keeps the original test features, since PCA may replace
    # test_data below.
    ori_test_data = test_data

    # If FLAGS.vat is set and the VAT prediction file exists, its contents are
    # used directly as the student labels and no teacher query is made.
    if FLAGS.vat and os.path.exists('record/svhn_model.ckpt-2000.npy'):
        vat_labels = np.load('record/svhn_model.ckpt-2000.npy')
        vat_labels = np.array(vat_labels, dtype=np.int32)
        print('vat_label.shape', vat_labels.shape)
        stdnt_test_data = ori_test_data[-1000:]
        stdnt_test_labels = test_labels[-1000:]
        return ori_test_data[:-1000], vat_labels, stdnt_test_data, stdnt_test_labels

    if FLAGS.pca:
        train_data, test_data = pca(train_data, test_data)

    # Make sure there is data leftover to be used as a test set
    stdnt_data = test_data[:FLAGS.stdnt_share]
    assert FLAGS.stdnt_share < len(test_data)

    # Compute teacher predictions for student queries.
    # Each query draws its own random subsample of the training data and takes
    # the labels of the FLAGS.nb_teachers nearest points (Euclidean distance)
    # as the teacher votes. acct tracks the privacy loss.
    num_train = train_data.shape[0]
    subsample_size = int(prob * num_train)  # loop-invariant, computed once
    teachers_preds = np.zeros([stdnt_data.shape[0], FLAGS.nb_teachers])

    for idx in range(len(stdnt_data)):
        if idx % 100 == 0:
            print('idx=', idx)
        query_data = stdnt_data[idx]
        select_teacher = np.random.choice(num_train, subsample_size)
        dis = np.linalg.norm(train_data[select_teacher] - query_data, axis=1)
        k_index = select_teacher[np.argsort(dis)[:FLAGS.nb_teachers]]
        teachers_preds[idx] = train_labels[k_index]
        # NOTE(review): composed once per query (coeff=1). The flattened
        # source does not show whether this call sat inside the loop --
        # confirm against the original layout.
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob, coeff=1)

    # compute privacy loss
    print("Composition of student subsampled Gaussian mechanisms gives ",
          (acct.get_eps(delta), delta))

    teachers_preds = np.asarray(teachers_preds, dtype=np.int32)

    # Common prefix of every dump file written below
    dump_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)

    if not save:
        stdnt_labels = aggregation.aggregation_knn(teachers_preds, sigma)
    else:
        # Request clean votes and clean labels as well
        stdnt_labels, clean_votes, labels_for_dump = aggregation.aggregation_knn(
            teachers_preds, sigma, return_clean_votes=True)

        # Prepare filepath for numpy dump of clean votes
        filepath = dump_prefix + '_student_clean_votes_gau_' + str(FLAGS.gau_scale) + '.npy'

        # Prepare filepath for numpy dump of clean labels
        filepath_labels = dump_prefix + '_teachers_labels_gau_' + str(FLAGS.gau_scale) + '.npy'

        # Dump clean_votes array
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array
        with tf.gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # Split data for semi-supervised training (VAT). The test data is treated
    # as three parts A, B, C:
    #   A: the first FLAGS.stdnt_share points, the student queries labelled
    #      by the noisy KNN above
    #   B: test_data[FLAGS.stdnt_share:-1000], unlabeled features for VAT
    #   C: the last 1000 points, kept for testing
    # If VAT is not used, the output of convert_vat can be ignored.
    convert_vat(ori_test_data, test_labels, stdnt_labels)

    stdnt_test_data = ori_test_data[-1000:]
    stdnt_test_labels = test_labels[-1000:]

    if save:
        # Prepare filepath for numpy dump of labels produced by noisy aggregation
        filepath = dump_prefix + '_student_labels_lap_' + str(FLAGS.gau_scale) + '.npy'

        # Dump student noisy labels array
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return ori_test_data[:FLAGS.stdnt_share], stdnt_labels, stdnt_test_data, stdnt_test_labels
linewidth=2) # [acgfacct3.get_rdp(i + 1) for i in range(acgfacct.m)]) plt.legend(['dp_glm', 'object puerb', 'non-private', 'Bound for Gaussian'], loc='lower right') plt.savefig("hhh.pdf", bbox_inches='tight') plt.xlabel(r'eps') plt.ylabel(r'accuracy') plt.grid(True) plt.show() if __name__ == '__main__': if FLAGS.dataset == 'mnist': X_train, y_train, X_test, y_test = input.ld_mnist(test_only=False, train_only=False) X_train = normalize(X_train.reshape([-1, 784]))[:1000, :] y_train = y_train[:1000] X_test = normalize(X_test.reshape([-1, 784])) elif FLAGS.dataset == 'adult': file_Name = "adult/adult.data" # open the file for writing fileObject = open(file_Name, 'rb') dataset = pickle.load(fileObject) X_train = dataset['train_data'] y_train = dataset['train_label'] X_test = dataset['test_data'] y_test = dataset['test_label'] else:
def prepare_student_data(dataset, nb_teachers, save=False, shift_data=None):
    """
    Takes a dataset name and the size of the teacher ensemble and prepares
    training data for the student model, according to parameters indicated
    in flags above.

    :param dataset: string corresponding to mnist, cifar10, svhn, or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted by
                 the ensemble of teachers (with Laplacian noise) as npy files.
                 It also dumps the clean votes for each class (without noise) and
                 the labels assigned by teachers
    :param shift_data: optional dict; when given, its 'data' entry replaces
                       the student data and its 'label' entry replaces the
                       reference labels used for the accuracy print
    :return: (data, labels) to be used for student training; when FLAGS.PATE2
             is set, (keep_idx, data[keep_idx], result) instead; False when
             the dataset name is not recognised
    """
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only=True)
    else:
        print("Check value of dataset flag")
        return False

    if FLAGS.cov_shift:
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        # SECURITY NOTE: pickle.load executes arbitrary code from the file;
        # only point this at files produced by this project.
        # The context manager closes the file handle once loading is done.
        with open(student_file_name, 'rb') as f:
            test = pickle.load(f)
        test_data = test['data']
        test_labels = test['label']

    # Prepare [unlabeled] student training data (subset of test set)
    stdnt_data = test_data

    # Keep the side effect outside of ``assert``: assert statements are
    # removed under ``python -O`` and the directory would never be created.
    train_dir_ready = input.create_dir_if_needed(FLAGS.train_dir)
    assert train_dir_ready

    # Common prefix of every dump file written below
    dump_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)

    # Filepath for the numpy dump of the gaussian (PATE2) result
    gau_filepath = (dump_prefix + '_student_votes_sigma1:' + str(FLAGS.sigma1)
                    + '_sigma2:' + str(FLAGS.sigma2) + '.npy')

    # Prepare filepath for numpy dump of clean votes
    filepath = dump_prefix + '_student_clean_votes' + str(FLAGS.lap_scale) + '.npy'

    # Prepare filepath for numpy dump of clean labels
    filepath_labels = dump_prefix + '_teachers_labels_lap_' + str(FLAGS.lap_scale) + '.npy'

    # Make sure there is data leftover to be used as a test set
    assert FLAGS.stdnt_share < len(test_data)

    if shift_data is not None:
        # replace original student data with shift data
        stdnt_data = shift_data['data']
        test_labels = shift_data['label']
        print('*** length of shift_data {} lable length={}********'.format(
            len(stdnt_data), len(test_labels)))

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    # Aggregate teacher predictions to get student training labels.
    # The clean votes are needed both for the dumps (save) and for the
    # gaussian step (FLAGS.PATE2), so request them in either case; the
    # original only did so when save was set, leaving the PATE2 path with
    # undefined names otherwise.
    if save or FLAGS.PATE2:
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            FLAGS.nb_labels, teachers_preds, FLAGS.lap_scale, return_clean_votes=True)
    else:
        # nb_labels is passed first, consistent with the call above.
        stdnt_labels = aggregation.noisy_max(
            FLAGS.nb_labels, teachers_preds, FLAGS.lap_scale)

    if FLAGS.PATE2:
        keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes)

    if save:
        # Dump clean_votes array
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array
        with tf.gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    # Print accuracy of aggregated labels
    if FLAGS.PATE2:
        with tf.gfile.Open(gau_filepath, mode='w') as file_obj:
            np.save(file_obj, result)
        ac_ag_labels = metrics.accuracy(result, test_labels[keep_idx])
        print(
            "number of gaussian student {}  Accuracy of the aggregated labels:{} "
            .format(len(result), ac_ag_labels))
        # Only the samples selected by keep_idx are handed to the student.
        return keep_idx, stdnt_data[keep_idx], result

    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    if save:
        # Prepare filepath for numpy dump of labels produced by noisy aggregation
        noisy_labels_path = dump_prefix + '_student_labels_lap_' + str(FLAGS.lap_scale) + '.npy'

        # Dump student noisy labels array
        with tf.gfile.Open(noisy_labels_path, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return stdnt_data, stdnt_labels