Code example #1
def _get_multi_labels_and_problematic_images(
    dl_manager: tfds.download.DownloadManager):
  """Returns multi-labels and problematic images from download json.

  Args:
    dl_manager: tfds.download.DownloadManager for downloading the json file

  Returns:
    val_annotated_images: Dictionary mapping image name to an inner dictionary
      containing the multi_label annotations for that image. The inner multi-
      label annotation dictionary has keys 'correct', 'wrong', or 'unclear'
      (keys will be missing if the image does not have a set of labels of the
      given type) and values that are lists of wnids.
    problematic_images: List of image names for problematic images.
    imagenet_m_2022: List of image names comprising the ImageNet-M 2022
      evaluation slice.
  """
  with tf.io.gfile.GFile(dl_manager.download(_MULTI_LABELS_URL), 'r') as f:
    human_accuracy_data = json.load(f)
  val_annotated_images = {}
  prefix = 'ILSVRC2012_val_'
  len_prefix = len(prefix)
  for image_name in human_accuracy_data['initial_annots'].keys():
    if image_name[:len_prefix] == prefix:
      val_annotated_images[image_name] = human_accuracy_data['initial_annots'][
          image_name]

  problematic_images = list(human_accuracy_data['problematic_images'].keys())
  imagenet_m_2022 = human_accuracy_data['imagenet_m']
  return val_annotated_images, problematic_images, imagenet_m_2022
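
For orientation, here is a minimal sketch, not taken from the source, of how a _split_generators method could consume the three return values. The split names, the manual_dir tar location, and the _generate_examples signature (including the keep_only parameter) are all hypothetical:

def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  val_annotated_images, problematic_images, imagenet_m_2022 = (
      _get_multi_labels_and_problematic_images(dl_manager))
  # Hypothetical: the validation tar lives in manual_dir, as in other
  # ImageNet builders, and _generate_examples filters by these lists.
  val_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_val.tar')
  return {
      'validation': self._generate_examples(
          dl_manager.iter_archive(val_path), val_annotated_images,
          set(problematic_images)),
      'imagenet_m': self._generate_examples(
          dl_manager.iter_archive(val_path), val_annotated_images,
          set(problematic_images), keep_only=set(imagenet_m_2022)),
  }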
Code example #2
  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""
    # download_kaggle_data returns the directory the Kaggle files land in.
    dl_dir = dl_manager.download_kaggle_data('Wikipedia sentences')
    txt_path = os.path.join(dl_dir, 'wikisent2.txt')

    # wikisent2.txt holds one sentence per line.
    with tf.io.gfile.GFile(txt_path, 'r') as f:
      sentences = f.read().splitlines()

    # There is no official split, so everything goes under the train split.
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                'split_key': 'train',
                'sentences': sentences,
            },
        ),
    ]

  def _generate_examples(self, split_key, sentences):
    for index, sentence in enumerate(sentences):
      yield index, {'text': sentence}
Code example #3
 def _split_generators(self, dl_manager: tfds.download.DownloadManager):
     """Returns SplitGenerators."""
     data = dl_manager.download({'data': _URL})
     # There is no predefined train/val/test split for this dataset.
     return {
         tfds.Split.TRAIN: self._generate_examples(file_path=data['data'])
     }
Code example #4
    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        imnet_path = os.path.join(dl_manager.manual_dir,
                                  'ILSVRC2012_img_train.tar')
        noisy_images_path = os.path.join(dl_manager.manual_dir, 'noisy_images')
        noisy_split_path = os.path.join(dl_manager.manual_dir,
                                        'dataset_no_images', 'mini-imagenet',
                                        'split')
        noisy_annot_path = os.path.join(dl_manager.manual_dir,
                                        'dataset_no_images',
                                        'mini-imagenet-annotations.json')
        val_path = os.path.join(dl_manager.manual_dir,
                                'ILSVRC2012_img_val.tar')

        with tf.io.gfile.GFile(noisy_annot_path) as json_file:
            data = json.load(json_file)

        image_data = data['data']
        noisy_image_ids = [elem[0]['image/id'] + '.jpg' for elem in image_data]

        # We first load all mini-ImageNet images into memory and access them
        # when building the other splits.
        paths = dl_manager.download({
            'mini_train': MINI_IMAGENET_TRAIN,
            'mini_val': MINI_IMAGENET_VAL,
            'mini_test': MINI_IMAGENET_TEST
        })

        train_fnames = self._read_mini_imagenet_csv(paths['mini_train'])
        val_fnames = self._read_mini_imagenet_csv(paths['mini_val'])
        test_fnames = self._read_mini_imagenet_csv(paths['mini_test'])
        mini_imnet_fnames = train_fnames + val_fnames + test_fnames

        mini_imnet_images = self._get_clean_images(
            mini_imnet_fnames, dl_manager.iter_archive(imnet_path))

        val_split_file = os.path.join(noisy_split_path, 'clean_validation')

        split_to_generator = {}

        split_to_generator[
            tfds.Split.VALIDATION] = self._generate_val_examples(
                val_split_file, dl_manager.iter_archive(val_path))

        for percent in _PERCENTS:
            split_name = tfds.Split.TRAIN + '_' + '{:02d}'.format(percent)
            split_file = os.path.join(
                noisy_split_path,
                '{}_noise_nl_{}'.format(self.builder_config.color,
                                        str(percent / 100)))
            split_to_generator[split_name] = self._generate_examples(
                split_file, noisy_image_ids, noisy_images_path,
                mini_imnet_images)

        return split_to_generator
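
Example #4 also relies on a _read_mini_imagenet_csv helper that is not shown. A minimal sketch, assuming the standard mini-ImageNet split CSVs with a filename,label header row and the standard-library csv module imported:

    def _read_mini_imagenet_csv(self, csv_path):
        """Returns the image filenames listed in a mini-ImageNet split CSV."""
        with tf.io.gfile.GFile(csv_path) as f:
            reader = csv.DictReader(f)  # assumes a 'filename' column
            return [row['filename'] for row in reader]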
Code example #5
    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        paths = dl_manager.download({
            'train': _TRAIN_URL,
            'test': _TEST_URL,
        })

        return {
            tfds.Split.TRAIN: self._generate_examples(paths['train']),
            tfds.Split.TEST: self._generate_examples(paths['test']),
        }
Code example #6
 def _split_generators(self, dl_manager: tfds.download.DownloadManager):
     """Returns SplitGenerators."""
     path = dl_manager.download(_IMAGENET_SKETCH_URL)
     return [
         tfds.core.SplitGenerator(
             name=tfds.Split.TEST,
             gen_kwargs={
                 'archive': dl_manager.iter_archive(path),
             },
         ),
     ]
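
dl_manager.iter_archive yields (path, file_object) pairs without extracting the tar to disk. A sketch of a matching _generate_examples, assuming the common wnid-directory layout inside the archive and image/label/file_name feature names:

 def _generate_examples(self, archive):
     for fname, fobj in archive:
         # Assumed layout inside the tar: <wnid>/<image>.JPEG.
         label = os.path.basename(os.path.dirname(fname))
         yield fname, {
             'image': fobj,
             'label': label,
             'file_name': os.path.basename(fname),
         }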
Code example #7
File: asqa.py  Project: suvarnak/datasets
    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        file_path = dl_manager.download(
            'https://storage.googleapis.com/gresearch/ASQA/ASQA.json')

        with tf.io.gfile.GFile(file_path, 'r') as f:
            samples = json.load(f)

        return {
            'train': self._generate_examples(samples['train']),
            'dev': self._generate_examples(samples['dev']),
        }
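
Since the JSON is loaded up front, the per-split dicts are handed directly to _generate_examples. A sketch, under the assumption that each split maps a sample id to its annotation dict and that the declared features mirror the raw JSON structure:

    def _generate_examples(self, samples):
        # Assumed: each split dict maps a sample id to its annotation dict,
        # and the declared features mirror the raw JSON structure.
        for sample_id, sample in samples.items():
            yield sample_id, sample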
Code example #8
File: pass_dataset.py  Project: tensorflow/datasets
 def _split_generators(self, dl_manager: tfds.download.DownloadManager):
     """Returns SplitGenerators."""
     pd = tfds.core.lazy_imports.pandas
     paths = dl_manager.download(_URLS)
     with tf.io.gfile.GFile(paths['meta_data']) as f:
         meta = pd.read_csv(f)
     meta = meta.set_index('hash')
     return [
         tfds.core.SplitGenerator(
             name=tfds.Split.TRAIN,
             gen_kwargs=dict(parts=paths['train_images'],
                             meta=meta,
                             dl_manager=dl_manager),
         )
     ]
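
The generator receives the downloaded tar parts, the pandas metadata indexed by hash, and the dl_manager itself so it can stream the archives. A sketch of how _generate_examples might look; the hash-based file naming inside the parts and the direct column-to-feature mapping are assumptions:

 def _generate_examples(self, parts, meta, dl_manager):
     for part in parts:
         for fname, fobj in dl_manager.iter_archive(part):
             # Assumed: files inside each tar part are named <hash>.jpg.
             img_hash = os.path.splitext(os.path.basename(fname))[0]
             metadata = meta.loc[img_hash].to_dict()
             yield img_hash, {'image': fobj, **metadata}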
Code example #9
File: domainnet.py  Project: tensorflow/datasets
    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        img_download_url = f'{self._BASE_URL}/{self.builder_config.img_url}'
        self.img_path = dl_manager.download_and_extract(img_download_url)

        domain = self.builder_config.name
        train_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_train.txt'
        test_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_test.txt'

        self.splits = dl_manager.download({
            'train': train_split_url,
            'test': test_split_url
        })

        return {
            'train': self._generate_examples(split='train'),
            'test': self._generate_examples(split='test'),
        }
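
The downloaded split files drive generation. A sketch of the matching _generate_examples, assuming the usual DomainNet list format of one "relative/image/path integer-label" pair per line:

    def _generate_examples(self, split):
        with tf.io.gfile.GFile(self.splits[split]) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                rel_path, label = line.rsplit(maxsplit=1)
                yield rel_path, {
                    'image': os.path.join(self.img_path, rel_path),
                    'label': int(label),
                }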
Code example #10
  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators."""

    suffixes = {'img': 'images.zip', 'latent': 'latents.npz'}
    prod = itertools.product(
        ['bunny', 'dragon'], ['train', 'test'], ['img', 'latent'])
    path_dict = {
        '_'.join([a, b, c]):
            f'https://storage.googleapis.com/dm_s3o4d/{a}/{b}_{suffixes[c]}'
        for a, b, c in prod
    }
    paths = dl_manager.download(path_dict)

    return {  # pylint: disable=g-complex-comprehension
        '_'.join([a, b]): self._generate_examples(
            dl_manager, paths['_'.join([a, b, 'img'])],
            paths['_'.join([a, b, 'latent'])], a)
        for a, b in itertools.product(['bunny', 'dragon'], ['train', 'test'])
    }
Code example #11
File: penguins.py  Project: tensorflow/datasets
 def _split_generators(self, dl_manager: tfds.download.DownloadManager):
     """Returns SplitGenerators."""
     path = dl_manager.download(_PENGUINS_PATH +
                                self.builder_config.file_name)
     return {'train': self._generate_examples(path)}
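
A sketch of the matching _generate_examples, reading the CSV with pandas through TFDS's lazy imports; it assumes every CSV column maps directly onto a declared feature:

 def _generate_examples(self, path):
     pd = tfds.core.lazy_imports.pandas
     with tf.io.gfile.GFile(path) as f:
         df = pd.read_csv(f)
     for index, row in df.iterrows():
         # Assumed: every CSV column maps directly onto a declared feature.
         yield index, row.to_dict()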