Example #1
def train_student(nb_teachers):
    """
  This function trains a student using predictions made by an ensemble of
  teachers. The student and teacher models are trained using the same
  neural network architecture.
  :param dataset: string corresponding to mnist, cifar10, or svhn
  :param nb_teachers: number of teachers (in the ensemble) to learn from
  :return: True if student training went well
  """

    # Call helper function to prepare student data using teacher predictions
    stdnt_dataset = prepare_student_data(nb_teachers, save=True)

    # Unpack the student dataset
    stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels = stdnt_dataset

    # Build the checkpoint path used to persist the student model
    dir_path = os.path.join(config.save_model, config.dataset)
    dir_path = os.path.join(dir_path,
                            'pate_num_teacher_' + str(config.nb_teachers))
    utils.mkdir_if_missing(dir_path)
    # filename is required by train_each_teacher below, so it is also set in
    # the non-resnet case (the non-resnet checkpoint name is a hypothetical default)
    if config.resnet:
        filename = os.path.join(dir_path, '_student_resnet.checkpoint.pth.tar')
    else:
        filename = os.path.join(dir_path, '_student.checkpoint.pth.tar')

    print('stdnt_labels used for training:', stdnt_labels.shape)
    network.train_each_teacher(config.student_epoch, stdnt_data, stdnt_labels,
                               stdnt_test_data, stdnt_test_labels, filename)

    final_preds = network.pred(stdnt_test_data, filename)

    accuracy = hamming_accuracy(final_preds, stdnt_test_labels, torch=False)
    print('Accuracy of student after training: ' + str(accuracy))

    return True
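
For reference, hamming_accuracy is not defined in these examples. A minimal
sketch of a multi-label Hamming accuracy, assuming preds and labels are binary
numpy arrays of identical shape (n_samples, n_labels), could look like this
(the function name here is hypothetical):

import numpy as np

def hamming_accuracy_sketch(preds, labels):
    """Fraction of individual label bits predicted correctly.

    Hypothetical stand-in for the hamming_accuracy used above; assumes preds
    and labels are binary arrays of shape (n_samples, n_labels).
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return float(np.mean(preds == labels))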
Example #2
def prepare_student_data(nb_teachers, save=False):
    """
  Takes a dataset name and the size of the teacher ensemble and prepares
  training data for the student model, according to parameters indicated
  in flags above.
  :param dataset: string corresponding to mnist, cifar10, or svhn
  :param nb_teachers: number of teachers (in the ensemble) to learn from
  :param save: if set to True, will dump student training labels predicted by
               the ensemble of teachers (with Laplacian noise) as npy files.
               It also dumps the clean votes for each class (without noise) and
               the labels assigned by teachers
  :return: pairs of (data, labels) to be used for student training and testing
  """

    # Load the dataset
    if config.dataset == 'celeba':
        dataset = data_manager.init_img_dataset(root=config.data_dir,
                                                name=config.dataset)
        test_data = dataset.test_data
        test_labels = dataset.test_label

    elif config.dataset == 'market':
        data_dir = '../dataset/market1501'
        test_dataset = Test_Dataset(data_dir,
                                    dataset_name=dataset_dict[config.dataset],
                                    query_gallery='gallery')
        test_data = test_dataset.data
        test_labels = test_dataset.label
        test_labels = np.array(test_labels, dtype=np.int32)
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data leftover to be used as a test set
    assert config.stdnt_share < len(test_data)

    # Prepare [unlabeled] student training data (subset of test set)
    stdnt_data = test_data[:config.stdnt_share]
    # Compute teacher predictions for student training data
    if config.reuse_vote:
        # Reuse previously saved clean votes; stdnt_share may vary between runs
        dir_path = os.path.join(config.save_model, config.dataset)
        dir_path = os.path.join(dir_path,
                                'pate_num_teacher_' + str(config.nb_teachers))
        utils.mkdir_if_missing(dir_path)
        # Load the summed votes and the per-teacher (clean) votes
        filepath = os.path.join(dir_path, '_teacher_votes.npy')
        teachers_preds = np.load(filepath)
        teachers_preds = teachers_preds[:config.stdnt_share]
        ori_filepath = os.path.join(dir_path, '_ori_teacher_votes.npy')
        ori_teachers_preds = np.load(ori_filepath)
    else:
        teachers_preds = ensemble_preds(nb_teachers, stdnt_data)
        ori_teachers_preds = teachers_preds  # shape: (nb_teachers, nb_data, dim)
        teachers_preds = np.sum(teachers_preds, axis=0)
        dir_path = os.path.join(config.save_model, config.dataset)
        dir_path = os.path.join(dir_path,
                                'pate_num_teacher_' + str(config.nb_teachers))
        utils.mkdir_if_missing(dir_path)
        filepath = os.path.join(dir_path, '_teacher_votes.npy')
        ori_filepath = os.path.join(dir_path, '_ori_teacher_votes.npy')
        with open(filepath, mode='wb') as file_obj:
            np.save(file_obj, teachers_preds)
        with open(ori_filepath, mode='wb') as file_obj:
            np.save(file_obj, ori_teachers_preds)

    # Aggregate teacher predictions to get student training labels
    if config.use_tau:
        # Clip each data point's stack of per-teacher votes with tau_limit
        tau_teachers_preds = np.zeros(teachers_preds.shape)
        for idx in range(len(tau_teachers_preds)):
            tau_teachers_preds[idx] = tau_limit(ori_teachers_preds[:, idx, :])

        preds_tau = np.asarray(tau_teachers_preds, dtype=np.float32)
        print('preds_tau sample:', preds_tau[1])
        # Number of teachers voting 0 for each (sample, label) pair
        count_zero_list = config.nb_teachers * np.ones(
            [config.stdnt_share, config.nb_labels]) - teachers_preds
        print('shape of count_zero', count_zero_list.shape)
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale, count_zero_list=count_zero_list)
        acct.compose_mechanism(gaussian, coeff=config.stdnt_share)
        acct.compose_mechanism(gaussian, coeff=config.stdnt_share)
    else:
        acct.compose_mechanism(gaussian, coeff=config.stdnt_share)
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale)
    print('shape of teachers_pred', teachers_preds.shape)

    # Print accuracy of aggregated label
    ac_ag_labels = hamming_accuracy(stdnt_labels,
                                    test_labels[:config.stdnt_share],
                                    torch=False)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))
    current_eps = acct.get_eps(config.delta)
    print('eps after data independent composition', current_eps)
    # Store unused part of test set for use as a test set after student training
    stdnt_test_data = test_data[config.stdnt_share:]
    stdnt_test_labels = test_labels[config.stdnt_share:]

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
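
aggregation.aggregation_knn is also not shown. Under the PATE setup above, a
plausible minimal sketch of Gaussian-noisy vote aggregation for multi-label
data (an assumption about its behavior, not the repository's implementation)
would add noise to the per-class counts and emit a label wherever the noisy
count wins a majority of the ensemble:

import numpy as np

def noisy_vote_aggregation(votes, gau_scale, nb_teachers):
    """Hypothetical Gaussian-mechanism aggregation for multi-label votes.

    votes: int array (n_samples, nb_labels) of per-class teacher vote counts.
    gau_scale: standard deviation of the Gaussian noise.
    Returns binary labels: 1 where the noisy count exceeds half the teachers.
    """
    noisy = votes + np.random.normal(scale=gau_scale, size=votes.shape)
    return (noisy > nb_teachers / 2.0).astype(np.int32)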
Example #3
def prepare_student_data(nb_teachers, save=False):
    """
    Takes a dataset name and the size of the teacher ensemble and prepares
    training data for the student model, according to parameters indicated
    in flags above.
    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted by
                 the ensemble of teachers (with Laplacian noise) as npy files.
                 It also dumps the clean votes for each class (without noise) and
                 the labels assigned by teachers
    :return: pairs of (data, labels) to be used for student training and testing

    """

    # Load the dataset
    if config.dataset == 'celeba':
        dataset = data_manager.init_img_dataset(root=config.data_dir, name=config.dataset)
        test_data = dataset.test_data
        test_labels = dataset.test_label
        train_data = dataset.train_data
        train_labels = dataset.train_label

    elif config.dataset == 'market':
        data_dir = '../dataset/market1501'
        train_dataset = Train_Dataset(data_dir,
                                      dataset_name=dataset_dict[config.dataset],
                                      train_val='train')
        test_dataset = Test_Dataset(data_dir,
                                    dataset_name=dataset_dict[config.dataset],
                                    query_gallery='gallery')

        train_data = train_dataset.train_data
        train_labels = train_dataset.train_label
        test_data = test_dataset.data
        test_labels = test_dataset.label
        train_labels = np.array(train_labels, dtype=np.int32)
        test_labels = np.array(test_labels, dtype=np.int32)
        print('len of total test data in market', len(test_labels))
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data leftover to be used as a test set
    assert config.stdnt_share < len(test_data)

    # Keep the raw test images; test_data is replaced by features below
    ori_test_data = test_data

    train_data, test_data = extract_feature(train_data, test_data)

    # Sample the student training set; the last 1000 records are held out
    # for evaluation after student training
    share_index = np.random.choice(test_data[:-1000].shape[0],
                                   config.stdnt_share)
    stdnt_data = test_data[share_index]
    picked_stdnt_data = [ori_test_data[idx] for idx in share_index]
    num_train = train_data.shape[0]
    teachers_preds = np.zeros([stdnt_data.shape[0], config.nb_labels])

    tau_teachers_preds = []
    # Weighted teacher prediction with clipping
    for idx in range(len(stdnt_data)):
        if idx % 100 == 0:
            print('idx=', idx)
        query_data = stdnt_data[idx]
        # Subsample the training records that may act as teachers
        select_teacher = np.random.choice(train_data.shape[0],
                                          int(prob * num_train))
        dis = np.linalg.norm(train_data[select_teacher] - query_data, axis=1)
        k_index = select_teacher[np.argsort(dis)[:config.nb_teachers]]
        # Sum over the teachers, which makes it easy to compute their votes
        if config.use_tau:
            tau_teachers_preds.append(tau_limit(train_labels[k_index, :]))
        teachers_preds[idx] = np.sum(train_labels[k_index, :], axis=0)


    teachers_preds = np.asarray(teachers_preds, dtype=np.int32)
    if config.use_tau:
        preds_tau = np.asarray(tau_teachers_preds, dtype=np.float32)
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob,
                                                   coeff=config.stdnt_share)
        # Number of teachers voting 0 for each (sample, label) pair
        count_zero_list = config.nb_teachers * np.ones(
            [config.stdnt_share, config.nb_labels]) - teachers_preds
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale, count_zero_list=count_zero_list)
    else:
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob,
                                                   coeff=config.stdnt_share)
        idx, stdnt_labels = aggregation.aggregation_knn(
            teachers_preds, config.gau_scale)
    # Compute the privacy loss
    print("Composition of student subsampled Gaussian mechanisms gives ",
          (acct.get_eps(delta), delta))

    # Print accuracy and precision of the aggregated labels
    ac_ag_labels = hamming_accuracy(stdnt_labels, test_labels[share_index],
                                    torch=False)
    precision = hamming_precision(stdnt_labels, test_labels[share_index],
                                  torch=False)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))
    print('Precision of the aggregated labels: ' + str(precision))
    current_eps = acct.get_eps(config.delta)
    print('eps after subsampled composition:', current_eps)
    # Store unused part of test set for use as a test set after student training
    stdnt_test_data = ori_test_data[-1000:]
    stdnt_test_labels = test_labels[-1000:]

    if save:
        # Prepare filepath for the numpy dump of the teacher votes
        dir_path = os.path.join(config.save_model,
                                'knn_num_neighbor_' + str(config.nb_teachers))
        utils.mkdir_if_missing(dir_path)
        filepath = os.path.join(dir_path, '_knn_voting.npy')

        # Dump the aggregated teacher vote counts
        with open(filepath, 'wb') as file_obj:
            np.save(file_obj, teachers_preds)

    return picked_stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
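
tau_limit above is invoked on one data point's stack of per-teacher label
vectors before the votes are summed. A minimal sketch of such tau-clipping,
assuming each row of votes is one teacher's binary label vector whose L1 norm
is bounded by tau before summation (the function name and default tau value
are illustrative, not taken from the source), might be:

import numpy as np

def tau_limit_sketch(votes, tau=5.0):
    """Hypothetical per-teacher clipping of multi-label votes.

    votes: array (nb_teachers, nb_labels). Each teacher's vote vector is
    rescaled so its L1 norm is at most tau, then the clipped votes are summed
    into a single (nb_labels,) vector.
    """
    norms = np.maximum(np.abs(votes).sum(axis=1, keepdims=True), 1e-12)
    scale = np.minimum(1.0, tau / norms)
    return (votes * scale).sum(axis=0)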