def train_student(nb_teachers):
    """
    Train a student model using labels predicted by an ensemble of teachers.
    The student and teacher models share the same neural network architecture.
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: True if student training went well
    """
    # Call helper function to prepare student data using teacher predictions
    stdnt_dataset = prepare_student_data(nb_teachers, save=True)

    # Unpack the student dataset
    stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels = stdnt_dataset

    # NOTE: `filename` is only defined when config.resnet is set; the training
    # and prediction calls below assume a ResNet student.
    if config.resnet:
        dir_path = os.path.join(config.save_model, config.dataset)
        dir_path = os.path.join(dir_path,
                                'pate_num_teacher_' + str(config.nb_teachers))
        utils.mkdir_if_missing(dir_path)
        filename = os.path.join(dir_path, '_student_resnet.checkpoint.pth.tar')

    print('stdnt_labels used for training', stdnt_labels.shape)
    network.train_each_teacher(config.student_epoch, stdnt_data, stdnt_labels,
                               stdnt_test_data, stdnt_test_labels, filename)
    final_preds = network.pred(stdnt_test_data, filename)

    accuracy = hamming_accuracy(final_preds, stdnt_test_labels, torch=False)
    print('Hamming accuracy of student after training: ' + str(accuracy))

    return True
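
# `hamming_accuracy` is imported from elsewhere in the repo. A minimal sketch
# of the multi-label metric it is assumed to compute (the fraction of
# per-example label bits on which prediction and ground truth agree); the
# leading underscore marks it as illustrative, not the repo's implementation.
def _hamming_accuracy_sketch(preds, labels, torch=False):
    # The torch flag in the real helper selects tensor vs. numpy inputs;
    # numpy arrays are assumed here, matching the torch=False call sites above.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    # Mean agreement over every (example, label) entry
    return np.mean(preds == labels)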
def prepare_student_data(nb_teachers, save=False):
    """
    Prepare the training data for the student model from the predictions of a
    teacher ensemble, according to the parameters in config.
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if True, dump the student training labels predicted by the
                 ensemble of teachers (with Gaussian noise) as npy files.
                 Also dumps the clean per-class votes (without noise) assigned
                 by the teachers.
    :return: pairs of (data, labels) to be used for student training and testing
    """
    # Load the dataset
    if config.dataset == 'celeba':
        dataset = data_manager.init_img_dataset(root=config.data_dir,
                                                name=config.dataset)
        test_data = dataset.test_data
        test_labels = dataset.test_label
    elif config.dataset == 'market':
        data_dir = '../dataset/market1501'
        test_dataset = Test_Dataset(data_dir,
                                    dataset_name=dataset_dict[config.dataset],
                                    query_gallery='gallery')
        test_data = test_dataset.data
        test_labels = np.array(test_dataset.label, dtype=np.int32)
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data left over to be used as a test set
    assert config.stdnt_share < len(test_data)

    # Prepare [unlabeled] student training data (subset of the test set)
    stdnt_data = test_data[:config.stdnt_share]

    # Filepaths for numpy dumps of the summed votes and the raw per-teacher votes
    dir_path = os.path.join(config.save_model, config.dataset)
    dir_path = os.path.join(dir_path,
                            'pate_num_teacher_' + str(config.nb_teachers))
    utils.mkdir_if_missing(dir_path)
    filepath = os.path.join(dir_path, '_teacher_votes.npy')
    ori_filepath = os.path.join(dir_path, '_ori_teacher_votes.npy')

    # Compute teacher predictions for the student training data
    if config.reuse_vote:
        # Reuse previously saved clean votes; stdnt_share may differ between runs
        teachers_preds = np.load(filepath)
        teachers_preds = teachers_preds[:config.stdnt_share]
        ori_teachers_preds = np.load(ori_filepath)
    else:
        # ori_teachers_preds has shape (nb_teachers, nb_data, nb_labels)
        ori_teachers_preds = ensemble_preds(nb_teachers, stdnt_data)
        # Sum over teachers to obtain per-class vote counts
        teachers_preds = np.sum(ori_teachers_preds, axis=0)
        with open(filepath, mode='wb') as file_obj:
            np.save(file_obj, teachers_preds)
        with open(ori_filepath, mode='wb') as file_obj:
            np.save(file_obj, ori_teachers_preds)

    if config.use_tau:
        # Clip each teacher's multi-label vote before aggregation (tau clipping)
        tau_teachers_preds = np.zeros(teachers_preds.shape)
        for idx in range(len(tau_teachers_preds)):
            tau_teachers_preds[idx] = tau_limit(ori_teachers_preds[:, idx, :])
        # preds_tau is printed for inspection only in this variant
        preds_tau = np.asarray(tau_teachers_preds, dtype=np.float32)
        print('preds_tau', preds_tau[1, ])
        # Votes for the negative class on each label
        count_zero_list = config.nb_teachers * np.ones(
            [config.stdnt_share, config.nb_labels]) - teachers_preds
        print('shape of count_zero', count_zero_list.shape)
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale, count_zero_list=count_zero_list)
        acct.compose_mechanism(gaussian, coeff=config.stdnt_share)
    else:
        acct.compose_mechanism(gaussian, coeff=config.stdnt_share)
        idx, stdnt_labels = aggregation.aggregation_knn(teachers_preds,
                                                        config.gau_scale)
    print('shape of teachers_preds', teachers_preds.shape)

    # Aggregate teacher predictions to get student training labels,
    # then print the accuracy of the aggregated labels
    ac_ag_labels = hamming_accuracy(stdnt_labels,
                                    test_labels[:config.stdnt_share],
                                    torch=False)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))
    current_eps = acct.get_eps(config.delta)
    print('eps after data-independent composition', current_eps)

    # Store the unused part of the test set for evaluation after student training
    stdnt_test_data = test_data[config.stdnt_share:]
    stdnt_test_labels = test_labels[config.stdnt_share:]

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
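
# A minimal sketch of the noisy multi-label aggregation that
# `aggregation.aggregation_knn` is assumed to perform (the real implementation
# lives elsewhere in the repo, and also returns an index alongside the labels):
# Gaussian noise of scale `gau_scale` is added to the positive and negative
# vote counts, and each label is set to 1 when the noisy positive votes win.
# The underscore-prefixed name and the exact tie-breaking are assumptions.
def _noisy_multilabel_aggregation_sketch(vote_counts, gau_scale, nb_teachers,
                                         count_zero_list=None):
    if count_zero_list is None:
        # Implicit negative votes: teachers who did not vote for the label
        count_zero_list = nb_teachers - vote_counts
    noisy_pos = vote_counts + np.random.normal(scale=gau_scale,
                                               size=vote_counts.shape)
    noisy_neg = count_zero_list + np.random.normal(scale=gau_scale,
                                                   size=vote_counts.shape)
    return (noisy_pos > noisy_neg).astype(np.int32)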
# kNN-based variant of prepare_student_data: for each student query, the
# "teachers" are its nearest neighbors in feature space rather than separately
# trained models. If this definition shares a module with the PATE variant
# above, it shadows that one; rename one of them (e.g. to
# prepare_student_data_knn) when both are needed.
def prepare_student_data(nb_teachers, save=False):
    """
    Prepare the training data for the student model from the votes of the
    nb_teachers nearest neighbors of each query, according to the parameters
    in config.
    :param nb_teachers: number of neighbors whose labels are aggregated per query
    :param save: if True, dump the per-query vote counts produced by the
                 neighbors as an npy file
    :return: pairs of (data, labels) to be used for student training and testing
    """
    # Load the dataset
    if config.dataset == 'celeba':
        dataset = data_manager.init_img_dataset(root=config.data_dir,
                                                name=config.dataset)
        test_data = dataset.test_data
        test_labels = dataset.test_label
        train_data = dataset.train_data
        train_labels = dataset.train_label
    elif config.dataset == 'market':
        data_dir = '../dataset/market1501'
        train_dataset = Train_Dataset(data_dir,
                                      dataset_name=dataset_dict[config.dataset],
                                      train_val='train')
        test_dataset = Test_Dataset(data_dir,
                                    dataset_name=dataset_dict[config.dataset],
                                    query_gallery='gallery')
        train_data = train_dataset.train_data
        train_labels = np.array(train_dataset.train_label, dtype=np.int32)
        test_data = test_dataset.data
        test_labels = np.array(test_dataset.label, dtype=np.int32)
        print('len of total test data in market', len(test_labels))
    else:
        return False

    # Make sure there is data left over to be used as a test set
    assert config.stdnt_share < len(test_data)

    ori_test_data = test_data
    # Embed both private (train) and public (test) data into feature space
    train_data, test_data = extract_feature(train_data, test_data)

    # Sample the student's (unlabeled) queries from the public data; the last
    # 1000 records are held out for evaluating the student after training.
    # Note: np.random.choice samples with replacement by default.
    share_index = np.random.choice(test_data[:-1000].shape[0],
                                   config.stdnt_share)
    stdnt_data = test_data[share_index]
    picked_stdnt_data = [ori_test_data[idx] for idx in share_index]

    num_train = train_data.shape[0]
    teachers_preds = np.zeros([stdnt_data.shape[0], config.nb_labels])
    tau_teachers_preds = []  # weighted teacher predictions with clipping

    for idx in range(len(stdnt_data)):
        if idx % 100 == 0:
            print('idx=', idx)
        query_data = stdnt_data[idx]
        # Subsample a random fraction `prob` of the private data for this query
        select_teacher = np.random.choice(train_data.shape[0],
                                          int(prob * num_train))
        dis = np.linalg.norm(train_data[select_teacher] - query_data, axis=1)
        # The nb_teachers nearest neighbors vote on the query's labels
        k_index = select_teacher[np.argsort(dis)[:config.nb_teachers]]
        if config.use_tau:
            tau_teachers_preds.append(tau_limit(train_labels[k_index, :]))
        # Sum over the neighbors' labels to obtain per-class vote counts
        teachers_preds[idx] = np.sum(train_labels[k_index, :], axis=0)

    teachers_preds = np.asarray(teachers_preds, dtype=np.int32)
    if config.use_tau:
        preds_tau = np.asarray(tau_teachers_preds, dtype=np.float32)
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob,
                                                   coeff=config.stdnt_share)
        # Votes for the negative class on each label
        count_zero_list = config.nb_teachers * np.ones(
            [config.stdnt_share, config.nb_labels]) - teachers_preds
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale, count_zero_list=count_zero_list)
    else:
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob,
                                                   coeff=config.stdnt_share)
        idx, stdnt_labels = aggregation.aggregation_knn(teachers_preds,
                                                        config.gau_scale)

    # Compute the privacy loss of the composed subsampled Gaussian mechanisms
    print("Composition of student subsampled Gaussian mechanisms gives ",
          (acct.get_eps(delta), delta))

    # Print accuracy and precision of the aggregated labels
    ac_ag_labels = hamming_accuracy(stdnt_labels, test_labels[share_index],
                                    torch=False)
    precision = hamming_precision(stdnt_labels, test_labels[share_index],
                                  torch=False)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))
    print('Precision of the aggregated labels: ' + str(precision))
    current_eps = acct.get_eps(config.delta)

    # Store the held-out records for use as a test set after student training
    stdnt_test_data = ori_test_data[-1000:]
    stdnt_test_labels = test_labels[-1000:]

    if save:
        # Prepare filepath for numpy dump of the vote counts
        dir_path = os.path.join(config.save_model,
                                'knn_num_neighbor_' + str(config.nb_teachers))
        utils.mkdir_if_missing(dir_path)
        filepath = os.path.join(dir_path, '_knn_voting.npy')
        # Dump the per-query vote counts
        with open(filepath, 'wb') as file_obj:
            np.save(file_obj, teachers_preds)

    return picked_stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
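
# Both prepare_student_data variants rely on module-level privacy-accounting
# globals (`acct`, `gaussian`, `prob`, and `delta`) that are defined elsewhere
# in the repo. A minimal sketch of how such an accountant can be built with
# the autodp package; the function name and the choice of reading the noise
# scale from config.gau_scale are assumptions, not the repo's code.
def _make_privacy_accountant_sketch(sigma, sampling_prob):
    from autodp import rdp_acct, rdp_bank
    accountant = rdp_acct.anaRDPacct()  # analytical RDP accountant
    # RDP curve of the Gaussian mechanism with noise scale sigma, as a function
    # of the RDP order; this plays the role of the `gaussian` callable above.
    gaussian_rdp = lambda alpha: rdp_bank.RDP_gaussian({'sigma': sigma}, alpha)
    return accountant, gaussian_rdp, sampling_prob

# Usage sketch (commented out to avoid composing mechanisms at import time);
# the sampling ratio 0.15 is a placeholder, not the repo's value:
# acct, gaussian, prob = _make_privacy_accountant_sketch(config.gau_scale, 0.15)
# acct.compose_poisson_subsampled_mechanisms(gaussian, prob,
#                                            coeff=config.stdnt_share)
# print(acct.get_eps(config.delta))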