def test_preprocessed_img_inversion(self):
    """Checks that `invert_imagery` flips the brightness of processed images.

    Every preprocessed image must have a mean pixel value of at most -0.7
    when inversion is off, and of at least 0.7 when inversion is on.
    """
    source_ds = _get_example_client_dataset()

    # Without inversion the images are predominantly dark.
    dark_ds = emnist_data_utils.preprocess_img_dataset(
        source_ds, invert_imagery=False, batch_size=BATCH_SIZE)
    for image_batch in dark_ds:
        for img in image_batch:
            self.assertLessEqual(np.average(img), -0.7)

    # With inversion the images are predominantly light.
    light_ds = emnist_data_utils.preprocess_img_dataset(
        source_ds, invert_imagery=True, batch_size=BATCH_SIZE)
    for image_batch in light_ds:
        for img in image_batch:
            self.assertGreaterEqual(np.average(img), 0.7)
Example #2
0
def _load_and_preprocess_datasets():
    """Builds labeled, batched train and test datasets from raw EMNIST data.

    Returns:
      A `(train, test)` tuple of datasets produced by
      `emnist_data_utils.preprocess_img_dataset` with labels included and a
      batch size of `BATCH_SIZE`. Only the train dataset is shuffled.
    """
    train_client_data, test_client_data = (
        emnist_data_utils.create_real_images_tff_client_data())

    # Merge the examples of all clients into one raw dataset per split.
    raw_train = train_client_data.create_tf_dataset_from_all_clients()
    raw_test = test_client_data.create_tf_dataset_from_all_clients()

    # Settings shared by both splits; they differ only in shuffling.
    common_kwargs = dict(include_label=True, batch_size=BATCH_SIZE)
    train = emnist_data_utils.preprocess_img_dataset(
        raw_train, shuffle=True, **common_kwargs)
    test = emnist_data_utils.preprocess_img_dataset(
        raw_test, shuffle=False, **common_kwargs)

    return train, test
Example #3
0
    def test_preprocessed_img_labels_are_case_agnostic(self):
        """Checks that preprocessing folds lowercase labels onto uppercase ones.

        The first three raw labels are asserted to be 18, 12 and 47. After
        preprocessing, the first two must be unchanged and the third must be
        shifted down by 26. Finally every processed label is checked to lie
        in the case-agnostic range [0, 35].
        """
        raw_images_ds = _get_example_client_dataset_containing_lowercase()

        raw_ds_iterator = iter(raw_images_ds)
        # The first element in the raw dataset is an uppercase 'I' (label is 18).
        self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 18)
        # The second element in the raw dataset is an uppercase 'C' (label is 12).
        self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 12)
        # The third element in the raw dataset has a raw label of 47, i.e. it
        # lies above the case-agnostic range and must be remapped.
        self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 47)

        processed_ds = emnist_data_utils.preprocess_img_dataset(
            raw_images_ds,
            include_label=True,
            batch_size=BATCH_SIZE,
            shuffle=False)
        _, label_batch = next(iter(processed_ds))
        processed_label_iterator = iter(label_batch)
        # The first element (in first batch) in the processed dataset has a case
        # agnostic label of 18 (i.e., assert that value remains unchanged).
        self.assertEqual(next(processed_label_iterator).numpy(), 18)
        # The second element (in first batch) in the processed dataset has a case
        # agnostic label of 12 (i.e., assert that value remains unchanged).
        self.assertEqual(next(processed_label_iterator).numpy(), 12)
        # The third element (in first batch) in the processed dataset should now
        # have a case agnostic label of 47 - 26 = 21.
        self.assertEqual(next(processed_label_iterator).numpy(), 47 - 26)

        # Labels above 35 are shifted down by 26, so the largest possible
        # processed label is 61 - 26 = 35.
        for _, label_batch in processed_ds:
            for label in label_batch:
                self.assertGreaterEqual(label, 0)
                self.assertLessEqual(label, 35)
def _get_client_ids_meeting_condition(train_tff_data, bad_accuracy_cutoff,
                                      good_accuracy_cutoff,
                                      invert_imagery_likelihood,
                                      classifier_model):
    """Partitions clients by how accurately the classifier handles their data.

    For each client, imagery inversion is chosen at random with probability
    `invert_imagery_likelihood`, the client's examples are preprocessed and
    passed to `_analyze_classifier`, and the resulting accuracy is compared
    against the two cutoffs.

    Args:
      train_tff_data: Client data exposing `client_ids` and
        `create_tf_dataset_for_client`.
      bad_accuracy_cutoff: Clients with accuracy strictly below this value are
        recorded as "bad".
      good_accuracy_cutoff: Clients with accuracy strictly above this value are
        recorded as "good".
      invert_imagery_likelihood: Probability of inverting a client's imagery.
      classifier_model: Model handed to `_analyze_classifier`.

    Returns:
      A `(bad, good)` tuple of dicts, each mapping a client id to whether that
      client's imagery was inverted.
    """
    low_accuracy_clients = {}
    high_accuracy_clients = {}

    for cid in train_tff_data.client_ids:
        # Single Bernoulli draw deciding whether this client is inverted.
        coin_flip = np.random.binomial(n=1, p=invert_imagery_likelihood)
        invert_imagery = (1 == coin_flip)

        # Unbatched, unshuffled, labeled examples for this one client.
        client_ds = emnist_data_utils.preprocess_img_dataset(
            train_tff_data.create_tf_dataset_for_client(cid),
            invert_imagery=invert_imagery,
            include_label=True,
            batch_size=None,
            shuffle=False,
            repeat=False)

        num_examples, num_correct = _analyze_classifier(
            client_ds, classifier_model)
        client_accuracy = float(num_correct) / float(num_examples)

        # The two checks are independent; a client may land in both maps if
        # the cutoffs overlap.
        if client_accuracy < bad_accuracy_cutoff:
            low_accuracy_clients[cid] = invert_imagery
        if client_accuracy > good_accuracy_cutoff:
            high_accuracy_clients[cid] = invert_imagery

    return low_accuracy_clients, high_accuracy_clients
Example #5
0
def _create_real_images_dataset_for_eval():
    """Returns a `tf.data.Dataset` of real images for evaluation.

    The 'test' split is loaded, the data of all clients is merged, and the
    result is preprocessed without labels, in batches of `EVAL_BATCH_SIZE`,
    with shuffling and repetition enabled.
    """
    test_client_data = emnist_data_utils.create_real_images_tff_client_data(
        split='test')
    all_clients_ds = test_client_data.create_tf_dataset_from_all_clients()

    eval_ds = emnist_data_utils.preprocess_img_dataset(
        all_clients_ds,
        include_label=False,
        batch_size=EVAL_BATCH_SIZE,
        shuffle=True,
        repeat=True)
    return eval_ds
Example #6
0
    def setUp(self):
        """Prepares one batch of real images and one of random fake images.

        `self.real_images` is the first element of the preprocessed dataset of
        the first client in the 'synthetic' split. `self.fake_images` is a
        float32 tensor of shape (32, 28, 28, 1) drawn from a seeded NumPy RNG.
        """
        super().setUp()

        synthetic_data = emnist_data_utils.create_real_images_tff_client_data(
            split='synthetic')
        first_client_id = synthetic_data.client_ids[0]
        raw_ds = synthetic_data.create_tf_dataset_for_client(first_client_id)
        processed_ds = emnist_data_utils.preprocess_img_dataset(
            raw_ds, shuffle=False)
        self.real_images = next(iter(processed_ds))

        # Seed right before drawing so the fake images are reproducible.
        np.random.seed(seed=123456)
        noise = np.random.random((32, 28, 28, 1))
        self.fake_images = tf.constant(noise, dtype=tf.float32)
Example #7
0
    def _get_dataset(client_id):
        """Builds the preprocessed, unlabeled dataset for `client_id`.

        Inversion is looked up in `selected_client_ids_inversion_map` when that
        map is non-empty, and is off otherwise. When
        `client_ids_example_indices_map` is non-empty, the raw data is first
        passed through `_filter_by_example`.
        """
        client_ds = raw_client_data.create_tf_dataset_for_client(client_id)

        # No inversion unless an inversion map has been supplied.
        invert_imagery = (selected_client_ids_inversion_map[client_id]
                          if selected_client_ids_inversion_map else False)

        # Optional per-example filtering happens before preprocessing.
        if client_ids_example_indices_map:
            client_ds = _filter_by_example(
                client_ds, client_ids_example_indices_map, client_id)

        preprocess_kwargs = dict(
            invert_imagery=invert_imagery,
            include_label=False,
            batch_size=batch_size,
            shuffle=True,
            repeat=False)
        return emnist_data_utils.preprocess_img_dataset(
            client_ds, **preprocess_kwargs)
  def test_preprocessed_img_labels_are_case_agnostic(self):
    """Checks that each processed label is the case-folded raw label.

    Both the raw and the processed dataset must contain 62 elements. The two
    are then walked in lockstep; a raw label above 35 is expected to appear
    in the processed dataset reduced by 26, any other label unchanged.
    """
    expected_count = 62

    raw_ds = _get_example_client_dataset_containing_lowercase()
    raw_iter = iter(raw_ds)
    self.assertEqual(_compute_dataset_length(raw_ds), expected_count)

    processed_ds = emnist_data_utils.preprocess_img_dataset(
        raw_ds, include_label=True, batch_size=None, shuffle=False)
    processed_iter = iter(processed_ds)
    self.assertEqual(_compute_dataset_length(processed_ds), expected_count)

    for _ in range(expected_count):
      expected_label = next(raw_iter)['label']
      if expected_label > 35:
        # Fold a lowercase label onto its uppercase counterpart.
        expected_label = expected_label - 26

      # Processed elements are (image, label) pairs; index 1 is the label.
      actual_label = next(processed_iter)[1]
      self.assertEqual(expected_label, actual_label)
def _get_client_ids_and_examples_based_on_classification(
    train_tff_data, min_num_examples, invert_imagery_likelihood,
    classifier_model):
  """Collects clients with enough correctly/incorrectly classified examples.

  For each client, imagery inversion is chosen at random with probability
  `invert_imagery_likelihood`; the client's preprocessed examples are then
  passed to `_analyze_classifier`, which returns the indices of the examples
  classified correctly and incorrectly.

  Args:
    train_tff_data: Client data exposing `client_ids` and
      `create_tf_dataset_for_client`.
    min_num_examples: Minimum number of examples (correct or incorrect) a
      client needs in order to be recorded in the corresponding maps.
    invert_imagery_likelihood: Probability of inverting a client's imagery.
    classifier_model: Model handed to `_analyze_classifier`.

  Returns:
    A 4-tuple of dicts keyed by client id:
      * inversion flag for clients with enough correct examples,
      * inversion flag for clients with enough incorrect examples,
      * correct example indices for the former clients,
      * incorrect example indices for the latter clients.
  """
  inversion_by_correct_client = {}
  inversion_by_incorrect_client = {}
  correct_indices_by_client = {}
  incorrect_indices_by_client = {}

  for cid in train_tff_data.client_ids:
    # Single Bernoulli draw deciding whether this client is inverted.
    coin_flip = np.random.binomial(n=1, p=invert_imagery_likelihood)
    invert_imagery = (1 == coin_flip)

    # Unbatched, unshuffled, labeled examples for this one client.
    client_ds = emnist_data_utils.preprocess_img_dataset(
        train_tff_data.create_tf_dataset_for_client(cid),
        invert_imagery=invert_imagery,
        include_label=True,
        batch_size=None,
        shuffle=False,
        repeat=False)

    hits, misses = _analyze_classifier(client_ds, classifier_model)

    # The two checks are independent; a client may qualify for both.
    if len(hits) >= min_num_examples:
      inversion_by_correct_client[cid] = invert_imagery
      correct_indices_by_client[cid] = hits

    if len(misses) >= min_num_examples:
      inversion_by_incorrect_client[cid] = invert_imagery
      incorrect_indices_by_client[cid] = misses

  return (inversion_by_correct_client,
          inversion_by_incorrect_client,
          correct_indices_by_client,
          incorrect_indices_by_client)
def main(argv):
    """Reports per-client and overall accuracy of the EMNIST classifier.

    For every client in the 'train' split, imagery inversion is chosen at
    random with probability `FLAGS.invert_imagery_likelihood`, the client's
    data is classified, and its accuracy recorded. A histogram of the
    accuracies, their 25th/75th percentiles, and the overall accuracy are
    printed.

    Args:
      argv: Command-line arguments; anything beyond the program name is
        rejected.

    Raises:
      app.UsageError: If more than one command-line argument is given.
      ValueError: If `FLAGS.invert_imagery_likelihood` is outside [0.0, 1.0].
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    invert_imagery_likelihood = FLAGS.invert_imagery_likelihood
    print('invert_imagery_likelihood is %s' % invert_imagery_likelihood)
    if invert_imagery_likelihood > 1.0:
        raise ValueError(
            'invert_imagery_likelihood cannot be greater than 1.0')
    # Reject negative probabilities up front instead of failing later inside
    # np.random.binomial, after the dataset and classifier have been loaded.
    if invert_imagery_likelihood < 0.0:
        raise ValueError(
            'invert_imagery_likelihood cannot be less than 0.0')

    # TFF Dataset.
    client_real_images_tff_data = (
        emnist_data_utils.create_real_images_tff_client_data(split='train'))
    print('There are %d unique clients.' %
          len(client_real_images_tff_data.client_ids))

    # EMNIST Classifier.
    classifier_model = ecm.get_trained_emnist_classifier_model()

    accuracy_list = []
    overall_total_count = 0
    overall_correct_count = 0
    for client_id in client_real_images_tff_data.client_ids:
        # Single Bernoulli draw deciding whether this client is inverted.
        invert_imagery = (1 == np.random.binomial(n=1,
                                                  p=invert_imagery_likelihood))

        # TF Dataset for particular client.
        raw_images_ds = client_real_images_tff_data.create_tf_dataset_for_client(
            client_id)
        # Preprocess into format expected by classifier.
        images_ds = emnist_data_utils.preprocess_img_dataset(
            raw_images_ds,
            invert_imagery=invert_imagery,
            include_label=True,
            batch_size=None,
            shuffle=False,
            repeat=False)
        # Run classifier on all data on client, compute % classified correctly.
        total_count, correct_count = _analyze_classifier(
            images_ds, classifier_model)
        accuracy = float(correct_count) / float(total_count)
        accuracy_list.append(accuracy)

        overall_total_count += total_count
        overall_correct_count += correct_count

    # Calculate histogram.
    bin_width = 1
    histogram = _compute_histogram(accuracy_list, bin_width)
    print('\nHistogram:')
    print(histogram.numpy())
    # Reasonableness check (should be 3400)
    print('(Histogram sum):')
    print(sum(histogram.numpy()))

    # Calculate percentile values.
    percentile_25, percentile_75 = np.percentile(accuracy_list, q=(25, 75))
    print('\nPercentiles...')
    print('25th Percentile : %f' % percentile_25)
    print('75th Percentile : %f' % percentile_75)

    # Accuracy over all examples pooled across clients.
    overall_accuracy = (float(overall_correct_count) /
                        float(overall_total_count))
    print('\nOverall classification success percentage: %d / %d (%f)' %
          (overall_correct_count, overall_total_count, overall_accuracy))