def _get_multi_labels_and_problematic_images(
    dl_manager: tfds.download.DownloadManager):
  """Returns multi-labels and problematic images from download json.

  Args:
    dl_manager: tfds.download.DownloadManager for downloading the json file.

  Returns:
    val_annotated_images: Dictionary mapping image name to an inner dictionary
      containing the multi_label annotations for that image. The inner multi-
      label annotation dictionary has keys 'correct', 'wrong', or 'unclear'
      (keys will be missing if the image does not have a set of labels of the
      given type) and values that are lists of wnids.
    problematic_images: List of image names for problematic images.
    imagenet_m_2022: List of image names comprising the ImageNet-M 2022
      evaluation slice.
  """
  with tf.io.gfile.GFile(dl_manager.download(_MULTI_LABELS_URL), 'r') as f:
    human_accuracy_data = json.load(f)
  val_annotated_images = {}
  prefix = 'ILSVRC2012_val_'
  for image_name, annots in human_accuracy_data['initial_annots'].items():
    if image_name.startswith(prefix):
      val_annotated_images[image_name] = annots
  problematic_images = list(human_accuracy_data['problematic_images'].keys())
  imagenet_m_2022 = human_accuracy_data['imagenet_m']
  return val_annotated_images, problematic_images, imagenet_m_2022
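# A minimal sketch (not part of the builder) of how the returned structures
# might be consumed for multi-label evaluation. `image_name` and
# `prediction_wnid` are hypothetical inputs.
def is_prediction_correct(image_name, prediction_wnid, val_annotated_images,
                          problematic_images):
  """Returns True/False, or None if the image is excluded from evaluation."""
  if image_name in problematic_images:
    return None
  annots = val_annotated_images.get(image_name, {})
  return prediction_wnid in annots.get('correct', [])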
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  # The Kaggle dataset id below is an assumption; the original only says
  # 'Wikipedia sentences'. download_kaggle_data returns the local directory
  # containing the downloaded files.
  dl_path = dl_manager.download_kaggle_data('mikeortman/wikipedia-sentences')
  txt_path = os.path.join(dl_path, 'wikisent2.txt')
  with tf.io.gfile.GFile(txt_path, 'r') as f:
    text = f.read()
  # Since there's no official split, put everything under the training split.
  # The gen_kwargs keys must match the parameters of _generate_examples below.
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={
              'split_key': 'train',
              'split_text': text,
          },
      ),
  ]

def _generate_examples(self, split_key, split_text):
  """Yields one example per sentence; wikisent2.txt has one per line."""
  del split_key  # Unused: there is only a single split.
  for index, sentence in enumerate(split_text.splitlines()):
    yield index, {'text': sentence}
def _split_generators(
    self, dl_manager: tfds.download.DownloadManager
) -> Dict[str, tfds.core.SplitGenerator]:
  """Returns SplitGenerators."""
  data = dl_manager.download({'data': _URL})
  # There is no predefined train/val/test split for this dataset.
  return {
      tfds.Split.TRAIN: self._generate_examples(file_path=data['data'])
  }
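# A minimal companion sketch, assuming `file_path` points at a JSON-lines
# file with a 'text' field per record; the builder's real _generate_examples
# may read a different format.
def _generate_examples(self, file_path):
  """Yields (key, example) tuples read from the downloaded file."""
  with tf.io.gfile.GFile(file_path, 'r') as f:
    for i, line in enumerate(f):
      record = json.loads(line)
      yield i, {'text': record['text']}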
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  imnet_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_train.tar')
  noisy_images_path = os.path.join(dl_manager.manual_dir, 'noisy_images')
  noisy_split_path = os.path.join(dl_manager.manual_dir, 'dataset_no_images',
                                  'mini-imagenet', 'split')
  noisy_annot_path = os.path.join(dl_manager.manual_dir, 'dataset_no_images',
                                  'mini-imagenet-annotations.json')
  val_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_val.tar')

  with tf.io.gfile.GFile(noisy_annot_path) as json_file:
    data = json.load(json_file)
  image_data = data['data']
  noisy_image_ids = [elem[0]['image/id'] + '.jpg' for elem in image_data]

  # We first load all mini-ImageNet images into memory, and will access them
  # for the other splits.
  paths = dl_manager.download({
      'mini_train': MINI_IMAGENET_TRAIN,
      'mini_val': MINI_IMAGENET_VAL,
      'mini_test': MINI_IMAGENET_TEST,
  })
  train_fnames = self._read_mini_imagenet_csv(paths['mini_train'])
  val_fnames = self._read_mini_imagenet_csv(paths['mini_val'])
  test_fnames = self._read_mini_imagenet_csv(paths['mini_test'])
  mini_imnet_fnames = train_fnames + val_fnames + test_fnames
  mini_imnet_images = self._get_clean_images(
      mini_imnet_fnames, dl_manager.iter_archive(imnet_path))

  val_split_file = os.path.join(noisy_split_path, 'clean_validation')
  split_to_generator = {}
  split_to_generator[tfds.Split.VALIDATION] = self._generate_val_examples(
      val_split_file, dl_manager.iter_archive(val_path))
  for percent in _PERCENTS:
    # One training split per noise level, e.g. 'train_05' for 5% noise.
    split_name = tfds.Split.TRAIN + '_' + '{:02d}'.format(percent)
    split_file = os.path.join(
        noisy_split_path,
        '{}_noise_nl_{}'.format(self.builder_config.color,
                                str(percent / 100)))
    split_to_generator[split_name] = self._generate_examples(
        split_file, noisy_image_ids, noisy_images_path, mini_imnet_images)
  return split_to_generator
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  paths = dl_manager.download({
      'train': _TRAIN_URL,
      'test': _TEST_URL,
  })
  return {
      tfds.Split.TRAIN: self._generate_examples(paths['train']),
      tfds.Split.TEST: self._generate_examples(paths['test']),
  }
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  path = dl_manager.download(_IMAGENET_SKETCH_URL)
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          gen_kwargs={
              'archive': dl_manager.iter_archive(path),
          },
      ),
  ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  file_path = dl_manager.download(
      'https://storage.googleapis.com/gresearch/ASQA/ASQA.json')
  with tf.io.gfile.GFile(file_path, 'r') as f:
    samples = json.load(f)
  return {
      'train': self._generate_examples(samples['train']),
      'dev': self._generate_examples(samples['dev']),
  }
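# A minimal companion sketch (an assumption, not the ASQA builder's actual
# code): if each split in ASQA.json maps a sample id to its sample dict, the
# generator can yield entries directly from the in-memory dictionary.
def _generate_examples(self, samples):
  """Yields (key, example) pairs from an in-memory split dictionary."""
  for sample_id, sample in samples.items():
    yield sample_id, sample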
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  pd = tfds.core.lazy_imports.pandas
  paths = dl_manager.download(_URLS)
  with tf.io.gfile.GFile(paths['meta_data']) as f:
    meta = pd.read_csv(f)
  meta = meta.set_index('hash')
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs=dict(
              parts=paths['train_images'], meta=meta, dl_manager=dl_manager),
      )
  ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  img_download_url = f'{self._BASE_URL}/{self.builder_config.img_url}'
  self.img_path = dl_manager.download_and_extract(img_download_url)

  domain = self.builder_config.name
  train_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_train.txt'
  test_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_test.txt'
  self.splits = dl_manager.download({
      'train': train_split_url,
      'test': test_split_url,
  })
  return {
      'train': self._generate_examples(split='train'),
      'test': self._generate_examples(split='test'),
  }
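# A minimal sketch (an assumption, not the builder's actual code) of how
# _generate_examples might consume the paths stashed on `self` above,
# assuming each DomainNet split file lists '<relative/image/path> <label>'
# per line.
def _generate_examples(self, split):
  with tf.io.gfile.GFile(self.splits[split], 'r') as f:
    for line in f:
      rel_path, label = line.rsplit(' ', 1)
      yield rel_path, {
          'image': os.path.join(self.img_path, rel_path),
          'label': int(label),
      }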
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  suffices = {'img': 'images.zip', 'latent': 'latents.npz'}
  prod = itertools.product(
      ['bunny', 'dragon'], ['train', 'test'], ['img', 'latent'])
  path_dict = dict([
      ('_'.join([a, b, c]),
       f'https://storage.googleapis.com/dm_s3o4d/{a}/{b}_{suffices[c]}')
      for a, b, c in prod])
  paths = dl_manager.download(path_dict)
  return dict([
      ('_'.join([a, b]),  # pylint: disable=g-complex-comprehension
       self._generate_examples(
           dl_manager,
           paths['_'.join([a, b, 'img'])],
           paths['_'.join([a, b, 'latent'])],
           a))
      for a, b in itertools.product(['bunny', 'dragon'], ['train', 'test'])])
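# Sanity check of the key construction above, runnable standalone: the
# product expands to eight download keys of the form '<object>_<split>_<kind>'.
import itertools

keys = ['_'.join(t) for t in itertools.product(
    ['bunny', 'dragon'], ['train', 'test'], ['img', 'latent'])]
print(keys)
# -> ['bunny_train_img', 'bunny_train_latent', 'bunny_test_img',
#     'bunny_test_latent', 'dragon_train_img', 'dragon_train_latent',
#     'dragon_test_img', 'dragon_test_latent']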
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  path = dl_manager.download(_PENGUINS_PATH + self.builder_config.file_name)
  return {'train': self._generate_examples(path)}