Example #1
def _download_data(
    num_worker: int, cache_dir: str, base_url: str
) -> Tuple[ClientData, tf.data.Dataset, ClientData, tf.data.Dataset]:
    """Create a `tff.simulation.datasets.ClientData` for the chosen data split.

    Download the entire GLD v2 dataset, subset it to only the images in the
    federated GLD v2 dataset, and create both the gld23k and gld160k datasets.

    Args:
      num_worker: The number of threads for downloading the GLD v2 dataset.
      cache_dir: The directory for caching temporary results.
      base_url: The base url for downloading GLD images.

    Returns:
      A tuple of (fed_gld_train, fed_gld_test, mini_gld_train, mini_gld_test),
      where the training splits are `tff.simulation.datasets.ClientData` and
      the test splits are `tf.data.Dataset`.
    """
    logger = logging.getLogger(LOGGER)
    logger.info('Starting to download federated GLDv2 mapping files.')
    path = tf.keras.utils.get_file(
        '%s.zip' % FED_GLD_SPLIT_FILE_BUNDLE,
        origin=FED_GLD_SPLIT_FILE_DOWNLOAD_URL,
        file_hash=FED_GLD_SPLIT_FILE_BUNDLE_MD5_CHECKSUM,
        hash_algorithm='md5',
        extract=True,
        archive_format='zip',
        cache_dir=cache_dir)
    logger.info('Fed gldv2 mapping files are downloaded successfully.')
    base_path = os.path.dirname(path)
    train_path = os.path.join(base_path, FED_GLD_SPLIT_FILE_BUNDLE,
                              FED_GLD_TRAIN_SPLIT_FILE)
    test_path = os.path.join(base_path, FED_GLD_SPLIT_FILE_BUNDLE,
                             FED_GLD_TEST_SPLIT_FILE)
    train_mapping = vision_datasets_utils.read_csv(train_path)
    test_mapping = vision_datasets_utils.read_csv(test_path)
    all_images = set()
    all_images.update([row['image_id'] for row in train_mapping],
                      [row['image_id'] for row in test_mapping])
    image_dir = os.path.join(cache_dir, 'images')
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)
    logger.info('Starting to download the GLDv2 dataset.')
    with multiprocessing.pool.ThreadPool(num_worker) as pool:
        train_args = [(i, all_images, image_dir, base_url)
                      for i in range(NUM_SHARD_TRAIN)]
        pool.starmap(_filter_images, train_args)

    logger.info('Finished downloading the GLDv2 dataset.')
    fed_gld_train, fed_gld_test = _create_federated_gld_dataset(
        cache_dir, image_dir, train_path, test_path)
    mini_gld_train, mini_gld_test = _create_mini_gld_dataset(
        cache_dir, image_dir)

    return fed_gld_train, fed_gld_test, mini_gld_train, mini_gld_test
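For orientation, here is a minimal driver sketch showing how `_download_data` might be invoked. It assumes the module-level constants (`LOGGER`, `FED_GLD_SPLIT_FILE_BUNDLE`, `NUM_SHARD_TRAIN`, the download URLs) and the helpers `_filter_images`, `_create_federated_gld_dataset`, and `_create_mini_gld_dataset` are defined elsewhere in the module; the argument values below are purely illustrative.

# Hypothetical driver; constants and helpers used by _download_data are assumed
# to exist in the surrounding module, and the paths/URL below are placeholders.
fed_gld_train, fed_gld_test, mini_gld_train, mini_gld_test = _download_data(
    num_worker=16,                       # number of download threads
    cache_dir='/tmp/gld_cache',          # illustrative cache location
    base_url='https://example.com/gld')  # placeholder image host
print(len(fed_gld_train.client_ids))     # ClientData exposes the client ids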
Example #2
def _create_test_data_file(cache_dir: str, image_dir: str, mapping_file: str):
    """Create the test data and persist it into a file.

    Args:
      cache_dir: The directory caching the intermediate results.
      image_dir: The directory containing all the downloaded images.
      mapping_file: The file containing 'image_id' to 'class' mappings.
    """
    logger = logging.getLogger(LOGGER)
    if not os.path.isdir(image_dir):
        logger.error('Image directory %s does not exist', image_dir)
        raise ValueError('%s does not exist or is not a directory' % image_dir)
    mapping_table = vision_datasets_utils.read_csv(mapping_file)
    expected_cols = ['image_id', 'class']
    if not all(col in mapping_table[0].keys() for col in expected_cols):
        logger.error('%s has wrong format.', mapping_file)
        raise ValueError(
            'The mapping file must contain image_id and class columns. The existing'
            ' columns are %s' % ','.join(mapping_table[0].keys()))
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    examples = _create_dataset_with_mapping(image_dir, mapping_table)
    with tf.io.TFRecordWriter(os.path.join(cache_dir,
                                           TEST_FILE_NAME)) as writer:
        for example in examples:
            writer.write(example.SerializeToString())
        logger.info('Created tfrecord file at %s', cache_dir)
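As a rough usage sketch (not part of the original module), the call below assumes the mapping CSV has the 'image_id' and 'class' columns the function checks for, and that the images were previously downloaded into image_dir; all paths are placeholders.

# Expected mapping CSV shape (illustrative):
#   image_id,class
#   0a1b2c3d4e5f6a7b,123
_create_test_data_file(
    cache_dir='/tmp/gld_cache',              # TEST_FILE_NAME is written here
    image_dir='/tmp/gld_cache/images',       # previously downloaded images
    mapping_file='/tmp/gld_cache/test.csv')  # placeholder mapping file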
Example #3
def _create_test_data_file(image_path_map: Dict[str, str], cache_dir: str,
                           split: INaturalistSplit, mapping_file: str):
    """Create the test data and persist it into a file.

    Args:
      image_path_map: The dictionary containing the image id to image path
        mapping.
      cache_dir: The directory caching the intermediate results.
      split: The split of the federated iNaturalist 2017 dataset.
      mapping_file: The file containing 'image_id' to 'class' mappings.
    """
    logger = logging.getLogger(LOGGER)
    mapping_table = utils.read_csv(mapping_file)
    expected_cols = ['image_id', 'class']
    if not all(col in mapping_table[0].keys() for col in expected_cols):
        logger.error('%s has wrong format.', mapping_file)
        raise ValueError(
            'The mapping file must contain image_id and class columns. The existing'
            ' columns are %s' % ','.join(mapping_table[0].keys()))
    cache_dir = os.path.join(cache_dir, split.name.lower())
    examples = _create_dataset_with_mapping(image_path_map, mapping_table)
    with tf.io.TFRecordWriter(os.path.join(cache_dir,
                                           TEST_FILE_NAME)) as writer:
        for example in examples:
            writer.write(example.SerializeToString())
        logger.info('Created test tfrecord file at %s', cache_dir)
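Compared with Example #2, this variant looks images up through an image_id-to-path dictionary and writes its output under a per-split subdirectory (`split.name.lower()`). A hedged usage sketch follows; the split member, dictionary contents, and paths are placeholders, since the `INaturalistSplit` enum members are not shown here.

# Hypothetical usage; INaturalistSplit.USER_120K is a placeholder member name.
image_paths = {'0a1b2c3d': '/tmp/inat/images/0a1b2c3d.jpg'}  # image_id -> path
_create_test_data_file(
    image_path_map=image_paths,
    cache_dir='/tmp/inat_cache',          # output lands in a per-split subdir
    split=INaturalistSplit.USER_120K,
    mapping_file='/tmp/inat/test_mapping.csv')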
Example #4
def _create_train_data_files(image_path_map: Dict[str, str], cache_dir: str,
                             split: INaturalistSplit, train_path: str):
    """Create the train data and persist it into a separate file per user.

    Args:
      image_path_map: The dictionary containing the image id to image path
        mapping.
      cache_dir: The directory containing the created datasets.
      split: The split of the federated iNaturalist 2017 dataset.
      train_path: The path to the mapping file for training data.
    """
    logger = logging.getLogger(LOGGER)

    mapping_table = utils.read_csv(train_path)
    user_id_col = split.name.lower()
    expected_cols = [user_id_col, 'image_id', 'class']
    if not all(col in mapping_table[0].keys() for col in expected_cols):
        logger.error('%s has wrong format.', train_path)
        raise ValueError(
            'The mapping file must contain the user_id for the chosen split, image_id and class columns. '
            'The existing columns are %s' % ','.join(mapping_table[0].keys()))
    cache_dir = os.path.join(cache_dir, split.name.lower(), TRAIN_SUB_DIR)
    if not os.path.exists(cache_dir):
        logger.info('Creating cache directory for training data.')
        os.makedirs(cache_dir)
    mapping_per_user = collections.defaultdict(list)
    for row in mapping_table:
        user_id = row[user_id_col]
        if user_id != 'NA':
            mapping_per_user[user_id].append(row)
    for user_id, data in mapping_per_user.items():
        examples = _create_dataset_with_mapping(image_path_map, data)
        with tf.io.TFRecordWriter(os.path.join(cache_dir,
                                               str(user_id))) as writer:
            for example in examples:
                writer.write(example.SerializeToString())
            logger.info(
                'Created tfrecord file for user %s with %d examples, at %s',
                user_id, len(examples), cache_dir)
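The training writer groups mapping rows by the user-id column named after the split, drops rows whose user id is 'NA', and writes one TFRecord file per user under `<cache_dir>/<split>/<TRAIN_SUB_DIR>`. The sketch below illustrates the CSV layout this implies and an illustrative call; the split member, column name, and paths are placeholders.

# Illustrative training CSV for a hypothetical USER_120K split; the per-user
# column is named after the split (split.name.lower()):
#   user_120k,image_id,class
#   17,0a1b2c3d,1204
#   NA,9d4e5f6a,311    <- skipped: not assigned to any client
_create_train_data_files(
    image_path_map=image_paths,           # image_id -> file path, as above
    cache_dir='/tmp/inat_cache',
    split=INaturalistSplit.USER_120K,
    train_path='/tmp/inat/train_mapping.csv')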
Example #5
def _create_train_data_files(cache_dir: str, image_dir: str,
                             mapping_file: str):
    """Create the train data and persist it into a separate file per user.

    Args:
      cache_dir: The directory caching the intermediate results.
      image_dir: The directory containing all the downloaded images.
      mapping_file: The file containing 'image_id' to 'class' mappings.
    """
    logger = logging.getLogger(LOGGER)
    if not os.path.isdir(image_dir):
        logger.error('Image directory %s does not exist', image_dir)
        raise ValueError('%s does not exist or is not a directory' % image_dir)

    mapping_table = vision_datasets_utils.read_csv(mapping_file)
    expected_cols = ['user_id', 'image_id', 'class']
    if not all(col in mapping_table[0].keys() for col in expected_cols):
        logger.error('%s has wrong format.', mapping_file)
        raise ValueError(
            'The mapping file must contain user_id, image_id and class columns. '
            'The existing columns are %s' % ','.join(mapping_table[0].keys()))
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    mapping_per_user = collections.defaultdict(list)
    for row in mapping_table:
        user_id = row['user_id']
        mapping_per_user[user_id].append(row)
    for user_id, data in mapping_per_user.items():
        examples = _create_dataset_with_mapping(image_dir, data)
        with tf.io.TFRecordWriter(os.path.join(cache_dir,
                                               str(user_id))) as writer:
            for example in examples:
                writer.write(example.SerializeToString())
            logger.info(
                'Created tfrecord file for user %s with %d examples, at %s',
                user_id, len(examples), cache_dir)
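Since each user's examples end up in a TFRecord file named by that user's id, the output can be inspected with a plain `tf.data.TFRecordDataset`. The snippet below is a sketch under that assumption; the feature layout inside each serialized `tf.train.Example` depends on `_create_dataset_with_mapping`, which is not shown, so the records are only counted rather than parsed.

import os
import tensorflow as tf

# Placeholders: the cache directory and user id depend on the actual run.
user_file = os.path.join('/tmp/gld_cache', '1234')  # one TFRecord file per user
raw_ds = tf.data.TFRecordDataset(user_file)
num_examples = raw_ds.reduce(0, lambda count, _: count + 1)
print('examples for user 1234:', int(num_examples))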