Exemple #1
0
def predict_teacher(dataset, nb_teachers):
    """
    Aggregate noise-free teacher votes for a dataset and cache the result.

    The dataset loader is called positionally with ``(False, True)`` (the
    local ``test_only`` / ``train_only`` values). If the cache file for this
    dataset / ensemble size / ``FLAGS.lap_scale`` already exists, its content
    is returned directly; otherwise the teacher ensemble is queried, the votes
    are aggregated with a noise argument of 0, and the labels are saved.

    :param dataset: string, one of 'svhn', 'cifar10', 'mnist' or 'adult'
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: ``(pred_labels, labels)`` on success, or ``False`` when the
             dataset name is not recognised
    :raises AssertionError: if ``input.create_dir_if_needed`` returns a falsy
                            value for ``FLAGS.train_dir``
    """
    # Run the directory check outside of an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently skip the call.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('create_dir_if_needed failed for ' +
                             str(FLAGS.train_dir))

    train_only = True
    test_only = False

    # Cache file holding the aggregated teacher labels
    filepath = FLAGS.data_dir + "/" + str(dataset) + '_' + str(
        nb_teachers) + '_teacher_clean_votes_label_shift' + str(
            FLAGS.lap_scale) + '.npy'

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    # Reuse previously aggregated labels when they are already on disk
    if os.path.exists(filepath):
        return np.load(filepath), test_labels

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate teacher predictions with a noise argument of 0.
    # NOTE(review): the first argument is FLAGS.nb_teachers rather than the
    # nb_teachers parameter -- confirm against aggregation.noisy_max.
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)
    utils.save_file(filepath, pred_labels)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " +
          str(ac_ag_labels))
    return pred_labels, test_labels
Exemple #2
0
def predict_data(dataset, nb_teachers, teacher=False):
    """
    Label a dataset split with the noise-free vote of the teacher ensemble.

    :param dataset: string, one of 'svhn', 'cifar10', 'mnist' or 'adult'
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param teacher: if truthy, the loader is called with ``train_only=True``
                    and ``test_only=False``; otherwise with the opposite
    :return: ``(data, pred_labels, labels)`` on success, or ``False`` when the
             dataset name is not recognised
    :raises AssertionError: if ``input.create_dir_if_needed`` returns a falsy
                            value for ``FLAGS.train_dir``
    """
    # Run the directory check outside of an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently skip the call.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('create_dir_if_needed failed for ' +
                             str(FLAGS.train_dir))

    # Exactly one of the two loader switches is set, driven by `teacher`
    train_only = bool(teacher)
    test_only = not train_only

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate teacher predictions with a noise argument of 0.
    # NOTE(review): the first argument is FLAGS.nb_teachers rather than the
    # nb_teachers parameter -- confirm against aggregation.noisy_max.
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " +
          str(ac_ag_labels))
    return test_data, pred_labels, test_labels
Exemple #3
0
def prepare_student_data(test_data, nb_teachers, lap_scale):
    """
    Produce student training labels from the teacher ensemble's noisy vote.

    :param test_data: inputs handed to the teacher ensemble for labelling
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param lap_scale: forwarded to ``aggregation.noisy_max``; described by the
                      original author as the scale of the Laplacian noise
                      added for privacy
    :return: the aggregated labels after
             ``keras.utils.to_categorical(labels, 2)``
    """
    # Collect the ensemble's predictions for every input
    ensemble_votes = ensemble_preds(nb_teachers, test_data, 2)

    # Noisy aggregation of the votes gives one label per input
    noisy_labels = aggregation.noisy_max(ensemble_votes, lap_scale)
    print('stdnt_labels')

    # Expand the labels into a 2-column categorical encoding
    encoded_labels = keras.utils.to_categorical(noisy_labels, 2)

    # Diagnostic output: number of labels, then array shape
    print(len(encoded_labels))
    print(encoded_labels.shape)

    return encoded_labels
Exemple #4
0
def prepare_student_data(test_data, nb_teachers, epsilon=0.1):
    """
    Produce student training labels from the teacher ensemble's noisy vote.

    The labels are returned exactly as ``aggregation.noisy_max`` produces
    them; no categorical encoding is applied here.

    :param test_data: inputs handed to the teacher ensemble for labelling
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param epsilon: forwarded to ``aggregation.noisy_max`` as ``epsilon``;
                    described by the original author as the epsilon of
                    (epsilon, delta) differential privacy
    :return: the aggregated student labels
    """
    # Collect the ensemble's predictions for every input
    ensemble_votes = ensemble_preds(nb_teachers, test_data, 1)

    # Noisy aggregation of the votes gives one label per input
    noisy_labels = aggregation.noisy_max(ensemble_votes, epsilon=epsilon)

    # Diagnostic output: tag, number of labels, then array shape
    print('stdnt_labels')
    print(len(noisy_labels))
    print(noisy_labels.shape)

    return noisy_labels
Exemple #5
0
def prepare_student_data(dataset, nb_teachers, save=False, shift_data=None):
    """
    Label the shifted student data with the teacher ensemble.

    Student inputs and reference labels are read from ``shift_data``. When the
    clean-votes file already exists and ``FLAGS.PATE2`` is set, the votes are
    reloaded and passed to ``gaussian``; otherwise teacher predictions are
    computed (or loaded from 'teacher_pred.npy') and aggregated with
    ``aggregation.noisy_max``.

    :param dataset: dataset name, forwarded to ``utils.create_path`` and
                    ``ensemble_preds``
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if truthy, the clean votes and teacher labels returned by
                 ``noisy_max`` are written with ``utils.save_file``
    :param shift_data: mapping with keys 'data' and 'label'
    :return: ``(keep_idx, stdnt_data[keep_idx], result)`` on the two
             ``FLAGS.PATE2`` paths. NOTE(review): every other path reaches the
             end of the visible code without a ``return`` and therefore yields
             ``None`` -- the function may be truncated; confirm.
    """
    # NOTE(review): shift_data defaults to None but is subscripted
    # unconditionally, so callers must always supply it.
    stdnt_data = shift_data['data']
    test_labels = shift_data['label']

    # Paths for: gaussian result, clean votes, teacher labels
    gau_filepath, filepath, filepath_labels = utils.create_path(
        FLAGS, dataset, nb_teachers)
    # Fast path: reuse clean votes saved by a previous run
    if os.path.exists(filepath):
        if FLAGS.PATE2 == True:
            with open(filepath, 'rb') as f:
                clean_votes = np.load(f)
                # presumably `gaussian` selects the queries to keep and their
                # labels -- verify against its definition
                keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes)
                precision_true = metrics.accuracy(result,
                                                  test_labels[keep_idx])
                print('number of idx={} precision_true ={}'.format(
                    len(keep_idx[0]), precision_true))
                return keep_idx, stdnt_data[keep_idx], result

    print('*** length of shift_data {} lable length={}********'.format(
        len(stdnt_data), len(test_labels)))

    # Compute teacher predictions for student training data

    # NOTE(review): this cache lives in the working directory and its name
    # does not depend on dataset or nb_teachers, so a stale file would be
    # reused across configurations -- confirm this is intended.
    teacher_path = 'teacher_pred.npy'
    if os.path.exists(teacher_path):
        teachers_preds = np.load(teacher_path)
    else:

        teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)
        np.save(teacher_path, teachers_preds)
    # Aggregate teacher predictions to get student training labels
    if not save:
        # NOTE(review): stdnt_labels is computed here but never returned in
        # the visible code.
        stdnt_labels = aggregation.noisy_max(FLAGS.nb_labels, teachers_preds,
                                             FLAGS.lap_scale)
    else:
        # Request clean votes and clean labels as well
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            FLAGS.nb_labels,
            teachers_preds,
            FLAGS.lap_scale,
            return_clean_votes=True)  #NOLINT(long-line)

        # Dump clean_votes array
        utils.save_file(filepath, clean_votes)
        # Dump the labels returned alongside the clean votes
        utils.save_file(filepath_labels, labels_for_dump)

        if FLAGS.PATE2 == True:
            keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes)
            utils.save_file(gau_filepath, result)
            # Accuracy is measured only on the entries selected by keep_idx
            ac_ag_labels = metrics.accuracy(result, test_labels[keep_idx])
            print(
                "number of gaussian student {}  Accuracy of the aggregated labels:{} "
                .format(len(result), ac_ag_labels))
            return keep_idx, stdnt_data[keep_idx], result
Exemple #6
0
def prepare_student_data(dataset, nb_teachers, save=False):
  """
  Takes a dataset name and the size of the teacher ensemble and prepares
  training data for the student model, according to parameters indicated
  in flags above.

  The first ``FLAGS.stdnt_share`` test samples become the student training
  set, labelled by the noisy teacher vote; the remaining samples and their
  true labels are returned as the student test set.

  :param dataset: string corresponding to mnist, cifar10, or svhn
  :param nb_teachers: number of teachers (in the ensemble) to learn from
  :param save: if set to True, will dump student training labels predicted by
               the ensemble of teachers (with Laplacian noise) as npy files.
               It also dumps the clean votes for each class (without noise) and
               the labels assigned by teachers
  :return: ``(stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels)``
           on success, or ``False`` when the dataset name is not recognised
  :raises AssertionError: if the training directory check fails or if
                          ``FLAGS.stdnt_share`` is not smaller than the test
                          set size
  """
  # Run the directory check outside of an ``assert`` statement: asserts are
  # stripped under ``python -O``, which would silently skip the call.
  if not input.create_dir_if_needed(FLAGS.train_dir):
    raise AssertionError('create_dir_if_needed failed for ' +
                         str(FLAGS.train_dir))

  # Load the dataset
  if dataset == 'svhn':
    test_data, test_labels = input.ld_svhn(test_only=True)
  elif dataset == 'cifar10':
    test_data, test_labels = input.ld_cifar10(test_only=True)
  elif dataset == 'mnist':
    test_data, test_labels = input.ld_mnist(test_only=True)
  else:
    print("Check value of dataset flag")
    return False

  # Make sure there is data leftover to be used as a test set. Raised
  # explicitly (same exception type as before) so the check survives -O.
  if not FLAGS.stdnt_share < len(test_data):
    raise AssertionError(
        'FLAGS.stdnt_share must be smaller than the number of test samples')

  def _dump_path(tag):
    # Builds "<data_dir>/<dataset>_<nb_teachers><tag><lap_scale>.npy"
    return (FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) +
            tag + str(FLAGS.lap_scale) + '.npy')

  def _dump_array(path, array):
    # Writes one numpy array to `path` through gfile
    with gfile.Open(path, mode='w') as file_obj:
      np.save(file_obj, array)

  # Prepare [unlabeled] student training data (subset of test set)
  stdnt_data = test_data[:FLAGS.stdnt_share]

  # Compute teacher predictions for student training data
  teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

  # Aggregate teacher predictions to get student training labels
  if not save:
    stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)
  else:
    # Request clean votes and clean labels as well
    stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
        teachers_preds, FLAGS.lap_scale, return_clean_votes=True)

    # Dump the clean votes and the teachers' labels
    _dump_array(_dump_path('_student_clean_votes_lap_'), clean_votes)
    _dump_array(_dump_path('_teachers_labels_lap_'), labels_for_dump)

  # Print accuracy of aggregated labels
  ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:FLAGS.stdnt_share])
  print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

  # Store unused part of test set for use as a test set after student training
  stdnt_test_data = test_data[FLAGS.stdnt_share:]
  stdnt_test_labels = test_labels[FLAGS.stdnt_share:]

  if save:
    # Dump the labels produced by noisy aggregation
    _dump_array(_dump_path('_student_labels_lap_'), stdnt_labels)

  return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
Exemple #7
0
def prepare_student_data(dataset, nb_teachers, save=False):
    """
    Takes a dataset name and the size of the teacher ensemble and prepares
    training data for the student model, according to parameters indicated
    in flags above.

    When ``FLAGS.d_stu > -1`` the student inputs are cropped to rows/columns
    2..29 and reduced to one channel (a weighted grey-scale mix when
    ``FLAGS.d_stu == 3``, otherwise channel ``FLAGS.d_stu``) before being
    shown to the teachers. The returned test split is always taken from the
    MNIST test set and requires ``FLAGS.dataset_teacher == 'mnist'``.

    :param dataset: string corresponding to mnist, cifar10, svhn or digit
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted by
                 the ensemble of teachers (with Laplacian noise) as npy files.
                 It also dumps the clean votes for each class (without noise)
                 and the labels assigned by teachers
    :return: ``(stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels)``
             on success, or ``False`` when the dataset name is not recognised
    :raises AssertionError: if the training directory check fails, if
                            ``FLAGS.stdnt_share`` is not smaller than the test
                            set size, or if ``FLAGS.dataset_teacher`` is not
                            'mnist'
    """
    # Run the directory check outside of an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently skip the call.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('create_dir_if_needed failed for ' +
                             str(FLAGS.train_dir))

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'digit':
        test_data, test_labels = input.ld_digit_test(test_name=FLAGS.test_name,
                                                     num=2000)
    else:
        print("Check value of dataset flag")
        return False

    # Make sure there is data leftover to be used as a test set. Raised
    # explicitly (same exception type as before) so the check survives -O.
    if not FLAGS.stdnt_share < len(test_data):
        raise AssertionError(
            'FLAGS.stdnt_share must be smaller than the number of test samples'
        )

    def _dump_path(tag):
        # Builds "<data_dir>/<dataset>_<nb_teachers><tag><lap_scale>.npy"
        return (FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) +
                tag + str(FLAGS.lap_scale) + '.npy')

    def _dump_array(path, array):
        # Writes one numpy array to `path` through tf.gfile
        with tf.gfile.Open(path, mode='w') as file_obj:
            np.save(file_obj, array)

    # Prepare [unlabeled] student training data (subset of test set)
    if FLAGS.d_stu > -1:
        # Crop a 28x28 window out of each image
        trimmed = test_data[:FLAGS.stdnt_share, 2:30, 2:30, :]
        if FLAGS.d_stu == 3:
            # grey scale: weighted sum of the first three channels
            stdnt_data = (0.2125 * trimmed[:, :, :, 0] +
                          0.7154 * trimmed[:, :, :, 1] +
                          0.0721 * trimmed[:, :, :, 2])
        else:
            # keep a single channel
            stdnt_data = trimmed[:, :, :, FLAGS.d_stu]
        stdnt_data = stdnt_data.reshape((-1, 28, 28, 1))
    else:
        stdnt_data = test_data[:FLAGS.stdnt_share]

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    # Aggregate teacher predictions to get student training labels
    if not save:
        stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)
    else:
        # Request clean votes and clean labels as well
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            teachers_preds, FLAGS.lap_scale, return_clean_votes=True)

        # Dump the clean votes and the teachers' labels
        _dump_array(_dump_path('_student_clean_votes_lap_'), clean_votes)
        _dump_array(_dump_path('_teachers_labels_lap_'), labels_for_dump)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels,
                                    test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # Store unused part of test set for use as a test set after student
    # training. Raised explicitly instead of `assert 0 == 1` so that an
    # unsupported teacher dataset is still rejected under -O.
    if FLAGS.dataset_teacher != 'mnist':
        raise AssertionError(
            "Non implemented error: dataset_teacher not equals to mnist")
    test_data, test_labels = input.ld_mnist(test_only=True)

    stdnt_test_data = test_data[FLAGS.stdnt_share:]
    stdnt_test_labels = test_labels[FLAGS.stdnt_share:]

    if save:
        # Dump the labels produced by noisy aggregation
        _dump_array(_dump_path('_student_labels_lap_'), stdnt_labels)

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
Exemple #8
0
def prepare_student_data(dataset, nb_teachers, save=False, shift_data=None):
    """
    Takes a dataset name and the size of the teacher ensemble and prepares
    training data for the student model, according to parameters indicated
    in flags above.

    The student inputs are the whole test set, replaced by the pickled
    'PCA_student' file when ``FLAGS.cov_shift`` is set, and replaced again by
    ``shift_data`` when it is given. With ``FLAGS.PATE2`` set, the clean votes
    are passed to ``gaussian`` and only the entries it keeps are returned.

    :param dataset: string corresponding to mnist, cifar10, svhn or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, will dump student training labels predicted by
                 the ensemble of teachers (with Laplacian noise) as npy files.
                 It also dumps the clean votes for each class (without noise)
                 and the labels assigned by teachers
    :param shift_data: optional mapping with keys 'data' and 'label' that
                       replaces the student inputs and reference labels
    :return: ``(keep_idx, stdnt_data[keep_idx], result)`` when ``FLAGS.PATE2``
             is set, ``(stdnt_data, stdnt_labels)`` otherwise, or ``False``
             when the dataset name is not recognised
    :raises AssertionError: if the training directory check fails or if
                            ``FLAGS.stdnt_share`` is not smaller than the test
                            set size
    """
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only=True)
    else:
        print("Check value of dataset flag")
        return False

    if FLAGS.cov_shift == True:
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        # NOTE(security): pickle.load can execute arbitrary code; this file
        # must come from a trusted source.
        # The context manager closes the handle, which was previously leaked.
        with open(student_file_name, 'rb') as f:
            test = pickle.load(f)
        test_data = test['data']
        test_labels = test['label']

    # Prepare [unlabeled] student training data (the whole test set)
    stdnt_data = test_data

    # Run the directory check outside of an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently skip the call.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('create_dir_if_needed failed for ' +
                             str(FLAGS.train_dir))

    # Common "<data_dir>/<dataset>_<nb_teachers>" prefix of every dump file
    path_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)

    # Filepath for the numpy dump of the gaussian-selected labels
    gau_filepath = path_prefix + '_student_votes_sigma1:' + str(
        FLAGS.sigma1) + '_sigma2:' + str(FLAGS.sigma2) + '.npy'

    # Filepath for the numpy dump of clean votes
    filepath = path_prefix + '_student_clean_votes' + str(
        FLAGS.lap_scale) + '.npy'

    # Filepath for the numpy dump of clean labels
    filepath_labels = path_prefix + '_teachers_labels_lap_' + str(
        FLAGS.lap_scale) + '.npy'

    # Raised explicitly (same exception type as before) so the check
    # survives -O.
    if not FLAGS.stdnt_share < len(test_data):
        raise AssertionError(
            'FLAGS.stdnt_share must be smaller than the number of test samples'
        )

    if shift_data is not None:
        # replace original student data with shift data
        stdnt_data = shift_data['data']
        test_labels = shift_data['label']
        print('*** length of shift_data {} lable length={}********'.format(
            len(stdnt_data), len(test_labels)))

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    use_gaussian = FLAGS.PATE2 == True

    # Aggregate teacher predictions to get student training labels.
    # Both calls now pass FLAGS.nb_labels first; previously the two branches
    # used incompatible argument lists for the same function. Clean votes are
    # also requested whenever PATE2 needs them, which used to end in a
    # NameError when save was False.
    if save or use_gaussian:
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            FLAGS.nb_labels,
            teachers_preds,
            FLAGS.lap_scale,
            return_clean_votes=True)
    else:
        stdnt_labels = aggregation.noisy_max(FLAGS.nb_labels, teachers_preds,
                                             FLAGS.lap_scale)

    if use_gaussian:
        keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes)

    if save:
        # Dump clean_votes array
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array
        with tf.gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    if use_gaussian:
        with tf.gfile.Open(gau_filepath, mode='w') as file_obj:
            np.save(file_obj, result)
        # Accuracy is measured only on the entries selected by keep_idx
        ac_ag_labels = metrics.accuracy(result, test_labels[keep_idx])
        print(
            "number of gaussian student {}  Accuracy of the aggregated labels:{} "
            .format(len(result), ac_ag_labels))
        return keep_idx, stdnt_data[keep_idx], result

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    if save:
        # Filepath for the numpy dump of labels produced by noisy aggregation
        filepath = path_prefix + '_student_labels_lap_' + str(
            FLAGS.lap_scale) + '.npy'

        # Dump student noisy labels array
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return stdnt_data, stdnt_labels