def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" val_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_val.tar') if not tf.io.gfile.exists(val_path): raise AssertionError( 'ImageNet requires manual download of the data. Please download ' 'the train and val set and place them into: {}'.format(val_path)) original_labels = _get_original_labels(val_path) (multi_labels, problematic_images, imagenet_m_2022_errors ) = _get_multi_labels_and_problematic_images(dl_manager) imagenet_m_2022 = dict([(k, multi_labels[k]) for k in imagenet_m_2022_errors ]) return { 'validation': self._generate_examples( archive=dl_manager.iter_archive(val_path), original_labels=original_labels, multi_labels=multi_labels, problematic_images=problematic_images), 'imagenet_m': self._generate_examples( archive=dl_manager.iter_archive(val_path), original_labels=original_labels, multi_labels=imagenet_m_2022, problematic_images=problematic_images), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" imnet_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_train.tar') noisy_images_path = os.path.join(dl_manager.manual_dir, 'noisy_images') noisy_split_path = os.path.join(dl_manager.manual_dir, 'dataset_no_images', 'mini-imagenet', 'split') noisy_annot_path = os.path.join(dl_manager.manual_dir, 'dataset_no_images', 'mini-imagenet-annotations.json') val_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_val.tar') with tf.io.gfile.GFile(noisy_annot_path) as json_file: data = json.load(json_file) image_data = data['data'] noisy_image_ids = [elem[0]['image/id'] + '.jpg' for elem in image_data] # We first load all mini-ImageNet images in the memory, and # will access them for the other splits paths = dl_manager.download({ 'mini_train': MINI_IMAGENET_TRAIN, 'mini_val': MINI_IMAGENET_VAL, 'mini_test': MINI_IMAGENET_TEST }) train_fnames = self._read_mini_imagenet_csv(paths['mini_train']) val_fnames = self._read_mini_imagenet_csv(paths['mini_val']) test_fnames = self._read_mini_imagenet_csv(paths['mini_test']) mini_imnet_fnames = train_fnames + val_fnames + test_fnames mini_imnet_images = self._get_clean_images( mini_imnet_fnames, dl_manager.iter_archive(imnet_path)) val_split_file = os.path.join(noisy_split_path, 'clean_validation') split_to_generator = {} split_to_generator[ tfds.Split.VALIDATION] = self._generate_val_examples( val_split_file, dl_manager.iter_archive(val_path)) for percent in _PERCENTS: split_name = tfds.Split.TRAIN + '_' + '{:02d}'.format(percent) split_file = os.path.join( noisy_split_path, '{}_noise_nl_{}'.format(self.builder_config.color, str(percent / 100))) split_to_generator[split_name] = self._generate_examples( split_file, noisy_image_ids, noisy_images_path, mini_imnet_images) return split_to_generator
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download(_IMAGENET_SKETCH_URL) return [ tfds.core.SplitGenerator( name=tfds.Split.TEST, gen_kwargs={ 'archive': dl_manager.iter_archive(path), }, ), ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  dl_paths = dl_manager.download_kaggle_data('Wikipedia sentences')
  txt_path = os.path.join(dl_paths, 'wikisent2.txt')
  with tf.io.gfile.GFile(txt_path, 'r') as f:
    text = f.read()
  # Since there's no official split, everything goes under the train split.
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={
              'split_key': 'train',
              'split_text': text,
          },
      ),
  ]

def _generate_examples(self, split_key, split_text):
  # The file holds one sentence per line.
  for index, sentence in enumerate(split_text.splitlines()):
    yield index, {'text': sentence}
def _split_generators(self, dl_manager: tfds.download.DownloadManager
                     ) -> List[tfds.core.SplitGenerator]:
  """Returns SplitGenerators."""
  base_dir = dl_manager.download_and_extract(
      tfds.download.Resource(
          url=_DOWNLOAD_URL,
          # Specify the extract method manually, as the filename reported by
          # github.com misses the .zip extension, so auto-detection doesn't
          # work.
          extract_method=tfds.download.ExtractMethod.ZIP))
  data_dir = os.path.join(base_dir, _DOWNLOAD_ARCHIVE_SUBDIR)
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "train"))
          },
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.VALIDATION,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "dev"))
          },
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "test"))
          },
      ),
  ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" manually_downloaded_files = { 'test_all_frames': dl_manager.manual_dir / 'test_all_frames.zip', 'test_annotations': dl_manager.manual_dir / 'test.json', 'train_all_frames': dl_manager.manual_dir / 'train_all_frames.zip', 'train_annotations': dl_manager.manual_dir / 'train.json', 'valid_all_frames': dl_manager.manual_dir / 'valid_all_frames.zip', 'valid_annotations': dl_manager.manual_dir / 'valid.json', } extracted_files = dl_manager.extract(manually_downloaded_files) return { tfds.Split.TRAIN: self._generate_examples( annotations=extracted_files['train_annotations'], all_frames=extracted_files['train_all_frames'] / 'train_all_frames' / 'JPEGImages', ), tfds.Split.VALIDATION: self._generate_examples( annotations=extracted_files['valid_annotations'], all_frames=extracted_files['valid_all_frames'] / 'valid_all_frames' / 'JPEGImages', ), tfds.Split.TEST: self._generate_examples( annotations=extracted_files['test_annotations'], all_frames=extracted_files['test_all_frames'] / 'test_all_frames' / 'JPEGImages', ), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(_DS_PATH) return { 'train': self._generate_examples(path, '.dev'), 'test': self._generate_examples(path, '.test'), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract({'file': _URL}) return { 'database': self._generate_examples(path, True), 'test': self._generate_examples(path, False), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" data = dl_manager.download_and_extract({ 'train': _VIDEO_URL + '1-TAO_TRAIN.zip', 'val': _VIDEO_URL + '2-TAO_VAL.zip', 'annotations': _ANNOTATIONS_URL }) manual_train, manual_val = _maybe_prepare_manual_data(dl_manager) id_map = _get_category_id_map(data['annotations'] / 'annotations-1.2') return { tfds.Split.TRAIN: self._generate_examples( data_path=data['train'], manual_path=manual_train, annotations_path=data['annotations'] / 'annotations-1.2' / 'train.json', id_map=id_map), tfds.Split.VALIDATION: self._generate_examples( data_path=data['val'], manual_path=manual_val, annotations_path=data['annotations'] / 'annotations-1.2' / 'validation.json', id_map=id_map) }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" data_dir = dl_manager.download_and_extract( self.builder_config.data_urls) split_generators = {} if 'train' in self.builder_config.data_urls: split_generators.update({ tfds.Split.TRAIN: self._generate_examples(path=data_dir['train'], split='train'), }) if 'validation' in self.builder_config.data_urls: split_generators.update({ tfds.Split.VALIDATION: self._generate_examples(path=data_dir['validation'], split='validation'), }) if 'test' in self.builder_config.data_urls: split_generators.update({ tfds.Split.TEST: self._generate_examples(path=data_dir['test'], split='test'), }) return split_generators
def _get_multi_labels_and_problematic_images(
    dl_manager: tfds.download.DownloadManager):
  """Returns multi-labels and problematic images from the downloaded json.

  Args:
    dl_manager: tfds.download.DownloadManager for downloading the json file.

  Returns:
    val_annotated_images: Dictionary mapping image name to an inner dictionary
      containing the multi-label annotations for that image. The inner
      multi-label annotation dictionary has keys 'correct', 'wrong', or
      'unclear' (keys will be missing if the image does not have a set of
      labels of the given type) and values that are lists of wnids.
    problematic_images: List of image names for problematic images.
    imagenet_m_2022: List of image names comprising the ImageNet-M 2022
      evaluation slice.
  """
  with tf.io.gfile.GFile(dl_manager.download(_MULTI_LABELS_URL), 'r') as f:
    human_accuracy_data = json.load(f)
  val_annotated_images = {}
  prefix = 'ILSVRC2012_val_'
  for image_name in human_accuracy_data['initial_annots']:
    if image_name.startswith(prefix):
      val_annotated_images[image_name] = human_accuracy_data['initial_annots'][
          image_name]
  problematic_images = list(human_accuracy_data['problematic_images'].keys())
  imagenet_m_2022 = human_accuracy_data['imagenet_m']
  return val_annotated_images, problematic_images, imagenet_m_2022
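# For orientation, a minimal sketch of the structures the helper above
# returns, per its docstring. The image names and wnids below are
# hypothetical placeholders, not values taken from the dataset:
#
#   val_annotated_images = {
#       'ILSVRC2012_val_00000001': {
#           'correct': ['n01751748'],
#           'wrong': ['n01756291'],
#           # 'unclear' is simply absent when the image has no such labels.
#       },
#   }
#   problematic_images = ['ILSVRC2012_val_00000002']
#   imagenet_m_2022 = ['ILSVRC2012_val_00000003']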
def _split_generators(
    self, dl_manager: tfds.download.DownloadManager
) -> List[tfds.core.SplitGenerator]:
  """Returns SplitGenerators."""
  base_dir = dl_manager.download_and_extract(_DOWNLOAD_URL)
  data_dir = os.path.join(base_dir, _DOWNLOAD_ARCHIVE_SUBDIR)
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "train"))
          },
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.VALIDATION,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "dev"))
          },
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          gen_kwargs={
              "day_to_paths": _get_day_to_paths(os.path.join(data_dir, "test"))
          },
      ),
  ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" if self.builder_config.full_resolution: trainval_data = dl_manager.download_and_extract( _URL + 'DAVIS-2017-trainval-Full-Resolution.zip') else: trainval_data = dl_manager.download_and_extract( _URL + 'DAVIS-2017-trainval-480p.zip') train_files = trainval_data / 'DAVIS/ImageSets/2017/train.txt' val_files = trainval_data / 'DAVIS/ImageSets/2017/val.txt' return { tfds.Split.TRAIN: self._generate_examples(train_files), tfds.Split.VALIDATION: self._generate_examples(val_files) }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract( 'http://go.criteo.net/criteo-research-uplift-v2.1.csv.gz') return { 'train': self._generate_examples(path), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(_DOWNLOAD_URLS) return { 'train': self._generate_examples(path['train']), 'validation': self._generate_examples(path['validation']), 'test': self._generate_examples(path['test']) }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(_DATA_LINK) return { "train": self._generate_examples(path / "drop_dataset_train.json"), "dev": self._generate_examples(path / "drop_dataset_dev.json"), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" img_download_url = f'{self._BASE_URL}/{self.builder_config.img_url}' self.img_path = dl_manager.download_and_extract(img_download_url) domain = self.builder_config.name train_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_train.txt' test_split_url = f'{self._BASE_URL}/domainnet/txt/{domain}_test.txt' self.splits = dl_manager.download({ 'train': train_split_url, 'test': test_split_url }) return { 'train': self._generate_examples(split='train'), 'test': self._generate_examples(split='test'), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(self.builder_config.data_path) return { split_name: self._generate_examples( path / self.builder_config.name, split_name=split_name) for split_name in self.builder_config.splits_names }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract({ 'file_path': self._DATA_PATHS[self.builder_config.name], }) return { 'train': self._generate_examples(path), }
def _split_generators(
    self, dl_manager: tfds.download.DownloadManager
) -> Dict[str, tfds.core.SplitGenerator]:
  """Returns SplitGenerators."""
  data = dl_manager.download({'data': _URL})
  # There is no predefined train/val/test split for this dataset.
  return {
      tfds.Split.TRAIN: self._generate_examples(file_path=data['data'])
  }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract( 'http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv' ) return { 'train': self._generate_examples(path), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" size = self.builder_config.size fold = self.builder_config.fold path = dl_manager.download_and_extract(_URLS[size]) return { split: self._generate_examples(path / f"Fold{fold}/{split}.txt") for split in ["train", "vali", "test"] }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract({ 'file_path': f'http://downloads.cs.stanford.edu/downloads/rt_benchmark/{self.builder_config.task}/ph/{self.builder_config.filename}.hdf5' }) return { 'train': self._generate_examples(path), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" # TODO(esc50): Downloads the data and defines the splits path = dl_manager.download_and_extract( 'https://github.com/karoldvl/ESC-50/archive/master.zip') # TODO(esc50): Returns the Dict[split names, Iterator[Key, Example]] return { 'train': self._generate_examples(path / 'train_imgs'), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(_URL) return { tfds.Split.TRAIN: self._generate_examples(os.path.join(path, 'train.json')), tfds.Split.VALIDATION: self._generate_examples(os.path.join(path, 'dev.json')), tfds.Split.TEST: self._generate_examples(os.path.join(path, 'test.json')), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" path = dl_manager.download_and_extract(URL) subpath = dl_manager.extract( os.path.join(path, self.builder_config.language + ".tar.gz")) return [ tfds.core.SplitGenerator( name=tfds.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(subpath, "dev")}, ), tfds.core.SplitGenerator( name=tfds.Split.TEST, gen_kwargs={"filepath": os.path.join(subpath, "test")}, ), tfds.core.SplitGenerator( name=tfds.Split.TRAIN, gen_kwargs={"filepath": os.path.join(subpath, "train")}, ), ]
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" paths = dl_manager.download({ 'train': _TRAIN_URL, 'test': _TEST_URL, }) return { tfds.Split.TRAIN: self._generate_examples(paths['train']), tfds.Split.TEST: self._generate_examples(paths['test']), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" ds_dir = self.builder_config.dataset_dir ds_name = 'halfcheetah_' + self.builder_config.file_suffix + '.hdf5' path = dl_manager.download_and_extract({ 'file_path': 'http://rail.eecs.berkeley.edu/datasets/offline_rl/' + ds_dir + '/' + ds_name }) return { 'train': self._generate_examples(path), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" run = self.builder_config.name task = self.builder_config.task paths = dl_manager.download_and_extract({ 'file_paths': _get_files(prefix=f'gs://rl_unplugged/dmlab/{task}/{run}', num_shards=self._SHARDS), }) return { 'train': self._generate_examples(paths), }
def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" file_path = dl_manager.download( 'https://storage.googleapis.com/gresearch/ASQA/ASQA.json') with tf.io.gfile.GFile(file_path, 'r') as f: samples = json.load(f) return { 'train': self._generate_examples(samples['train']), 'dev': self._generate_examples(samples['dev']), }