def _split_generators(self, dl_manager):
  arch_path = dl_manager.download(_DOWNLOAD_URL)
  archive = lambda: dl_manager.iter_archive(arch_path)
  # Generate vocabulary from training data if SubwordTextEncoder configured
  self.info.features["text"].maybe_build_from_corpus(
      self._vocab_text_gen(archive()))
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          num_shards=10,
          gen_kwargs={
              "archive": archive(),
              "directory": os.path.join("aclImdb", "train"),
          }),
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          num_shards=10,
          gen_kwargs={
              "archive": archive(),
              "directory": os.path.join("aclImdb", "test"),
          }),
      tfds.core.SplitGenerator(
          name=tfds.Split("unsupervised"),
          num_shards=20,
          gen_kwargs={
              "archive": archive(),
              "directory": os.path.join("aclImdb", "train"),
              "labeled": False,
          }),
  ]
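# Usage sketch (added for illustration, not part of the original builder):
# once the dataset is generated, the custom "unsupervised" split declared
# above is requested by name exactly like the canonical splits. This assumes
# the builder's registered TFDS name, "imdb_reviews".
import tensorflow_datasets as tfds

train_ds = tfds.load("imdb_reviews", split="train")
unsup_ds = tfds.load("imdb_reviews", split="unsupervised")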
def _split_generators(self, dl_manager):
  """Returns SplitGenerators.

  TOTAL 2039 samples. 11 samples that could not be converted to 'mols' were
  removed from the original dataset.

  ALL: total 2039 samples.
  1) Random split samples (seed=123, ratio=8:2)
     TRAIN: randomly split bbbp for training (1631 samples)
     TEST: randomly split bbbp for test (408 samples)
  2) Scaffold split samples
     scaffold_train: choose samples from the largest scaffold sets
       (1631 samples)
     scaffold_valid: choose samples from the largest scaffold sets after the
       train set (204 samples)
     scaffold_test: choose the remaining samples as the test set (204 samples)
     scaffold: all samples, sorted by the size of the scaffold set, from the
       largest to the smallest (2039 samples)
  """
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split.TRAIN,
          gen_kwargs={"file": dl_manager.download(_RAND_TRAIN)},
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          gen_kwargs={"file": dl_manager.download(_RAND_TEST)},
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split('scaffold_train'),
          gen_kwargs={"file": dl_manager.download(_SCAFF_TRAIN)},
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split('scaffold_validation'),
          gen_kwargs={"file": dl_manager.download(_SCAFF_VALID)},
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split('scaffold_test'),
          gen_kwargs={"file": dl_manager.download(_SCAFF_TEST)},
      ),
      tfds.core.SplitGenerator(
          name=tfds.Split('scaffold'),
          gen_kwargs={"file": dl_manager.download(_SCAFF_TOTAL)},
      ),
  ]
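# Usage sketch (an assumption, not from the original file): with the builder
# registered, the scaffold splits behave like any other named split. The
# dataset name "bbbp" is assumed here from the surrounding code.
import tensorflow_datasets as tfds

scaffold_train = tfds.load("bbbp", split="scaffold_train")
scaffold_test = tfds.load("bbbp", split="scaffold_test")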
def _split_generators(self, dl_manager):
  tar_gz_path = dl_manager.download(_SUN397_URL + "SUN397.tar.gz")
  if os.path.isdir(tar_gz_path):
    # While testing: download() returns the dir containing the test files.
    tar_gz_path = os.path.join(tar_gz_path, "SUN397.tar.gz")
  resource = tfds.download.Resource(
      path=tar_gz_path,
      extract_method=tfds.download.ExtractMethod.TAR_GZ_STREAM)
  return [
      tfds.core.SplitGenerator(
          name=tfds.Split("full"),
          num_shards=20,  # size(shard) ~= 2GB.
          gen_kwargs=dict(archive=dl_manager.iter_archive(resource)))
  ]
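# Companion sketch (assumed shape, not the original code): iter_archive yields
# (path_in_archive, file_object) pairs, so examples stream out of the
# multi-gigabyte tarball without extracting it to disk. The label parsing
# below is hypothetical.
def _generate_examples(self, archive):
  for fname, fobj in archive:
    if not fname.endswith(".jpg"):
      continue
    # Paths inside the tarball look like SUN397/a/abbey/sun_xxx.jpg, so the
    # class name is the directory portion between the root and the filename.
    label = "/".join(fname.split("/")[1:-1])
    yield {"image": fobj, "label": label}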
def _split_generators(self, dl_manager: tfds.download.DownloadManager):
  """Returns SplitGenerators."""
  return {
      tfds.Split.TRAIN:
          self._generate_examples(
              os.path.join(_DATA_URL, _FILENAME_TRAIN), is_training=True),
      tfds.Split.VALIDATION:
          self._generate_examples(
              os.path.join(_DATA_URL, _FILENAME_VAL), is_training=False),
      tfds.Split.TEST:
          self._generate_examples(
              os.path.join(_DATA_URL, _FILENAME_TEST), is_training=False),
      tfds.Split('test2'):
          self._generate_examples(
              os.path.join(_DATA_URL, _FILENAME_TEST2), is_training=False),
  }
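# Companion sketch (assumed shape, not the original code): with the dict-based
# API above, _split_generators maps each split directly to an example
# generator, and custom names such as 'test2' need no SplitGenerator wrapper.
# A minimal matching _generate_examples; the feature name and parsing are
# placeholders, and a gfile-readable path plus `import tensorflow as tf` are
# assumed.
def _generate_examples(self, path, is_training):
  with tf.io.gfile.GFile(path) as f:
    for i, line in enumerate(f):
      # Yield (key, example) pairs, as the newer TFDS API expects.
      yield i, {'text': line.strip()}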
def _split_generators(self, dl_manager):
  del dl_manager
  return [
      tfds.core.SplitGenerator(
          tfds.Split.TRAIN,
          num_shards=10,
          gen_kwargs=dict(data=RANGE_TRAIN),
      ),
      tfds.core.SplitGenerator(
          tfds.Split.TEST,
          num_shards=2,
          gen_kwargs=dict(data=RANGE_TEST),
      ),
      tfds.core.SplitGenerator(
          tfds.Split("custom"),
          num_shards=2,
          gen_kwargs=dict(data=RANGE_VAL),
      ),
  ]
def _split_generators(self, dl_manager):
  del dl_manager
  return [
      tfds.core.SplitGenerator(
          tfds.Split.TRAIN,
          num_shards=self.num_shards_train,
          gen_kwargs=dict(data=self.range_train),
      ),
      tfds.core.SplitGenerator(
          tfds.Split.TEST,
          num_shards=self.num_shards_test,
          gen_kwargs=dict(data=self.range_test),
      ),
      tfds.core.SplitGenerator(
          tfds.Split("custom"),
          num_shards=self.num_shards_val,
          gen_kwargs=dict(data=self.range_val),
      ),
  ]
def test_split_merge_slice(self):
  # Merge, then slice (then merge)
  split = tfds.Split.TEST + tfds.Split.TRAIN
  split = split.subsplit(tfds.percent[30:40])
  split = split + tfds.Split("custom").subsplit(tfds.percent[:15])

  # List sorted so always deterministic
  self.assertEqual(self._info(split), [
      splits.SlicedSplitInfo(
          split_info=tfds.core.SplitInfo(name="custom", num_shards=2),
          slice_value=slice(None, 15),
      ),
      splits.SlicedSplitInfo(
          split_info=tfds.core.SplitInfo(name="test", num_shards=2),
          slice_value=slice(30, 40),
      ),
      splits.SlicedSplitInfo(
          split_info=tfds.core.SplitInfo(name="train", num_shards=10),
          slice_value=slice(30, 40),
      ),
  ])
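# Clarifying note (added): tfds.percent[30:40] is recorded as slice(30, 40)
# and tfds.percent[:15] as slice(None, 15) in the SlicedSplitInfo entries
# above. Subsplitting a merged split applies the slice to each operand, so
# the composed split reads test[30%:40%] + train[30%:40%] + custom[:15%];
# the expected list is sorted by split name only to keep the test
# deterministic.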
def _split_generators(self, dl_manager):
  train_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_train.tar')
  val_path = os.path.join(dl_manager.manual_dir, 'ILSVRC2012_img_val.tar')
  if not tf.io.gfile.exists(train_path) or not tf.io.gfile.exists(val_path):
    raise AssertionError(
        'ImageNet requires manual download of the data. Please download '
        'the train and val set and place them into: {}, {}'.format(
            train_path, val_path))

  # Download and load subset file.
  subset_file = SUBSET2FILES[self.builder_config.name]
  if isinstance(subset_file, list):  # it will only be a list during testing,
    subset_file = subset_file[0]  # where the first entry is 1shot.txt.
  subset = set(subset_file.read_text().splitlines())

  # Get the file for tune split.
  tuneset = set(TUNE_FILE.read_text().splitlines())

  return {
      tfds.Split.TRAIN: self._generate_examples(
          archive=dl_manager.iter_archive(train_path), subset=subset),
      tfds.Split('tune'): self._generate_examples(
          archive=dl_manager.iter_archive(train_path), subset=tuneset),
      tfds.Split.VALIDATION: self._generate_examples(
          archive=dl_manager.iter_archive(val_path),
          validation_labels=imagenet.get_validation_labels(val_path)),
  }
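# Usage sketch (added): because this builder reads from dl_manager.manual_dir,
# the two ImageNet tarballs must be placed there before generation. The
# builder name below is an assumption from the surrounding code; the
# manual_dir path is a placeholder.
import tensorflow_datasets as tfds

builder = tfds.builder('imagenet2012_fewshot/1shot')
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        manual_dir='/path/to/dir/containing/the/tars'))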
def _split_generators(self, dl_manager):
  dl_dir = dl_manager.download_and_extract(_DATA_URL)
  data_dir = os.path.join(dl_dir, 'winogrande_1.1')

  # Winogrande has different standardized training data sizes and reports
  # numbers for each of these data sizes, so make those available.
  data_sizes = ['xs', 's', 'm', 'l', 'xl']
  train_splits = []
  for size in data_sizes:
    train_splits.append(
        tfds.core.SplitGenerator(
            name=tfds.Split('train_{}'.format(size)),
            gen_kwargs={
                'filepath':
                    os.path.join(data_dir, 'train_{}.jsonl'.format(size))
            }))
  return train_splits + [
      tfds.core.SplitGenerator(
          name=tfds.Split.TEST,
          gen_kwargs={'filepath': os.path.join(data_dir, 'test.jsonl')}),
      tfds.core.SplitGenerator(
          name=tfds.Split.VALIDATION,
          gen_kwargs={'filepath': os.path.join(data_dir, 'dev.jsonl')}),
  ]
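# Usage sketch (added): the size-specific training splits registered above
# are addressable by name, which is how results are reported per training
# set size:
import tensorflow_datasets as tfds

ds_xs = tfds.load('winogrande', split='train_xs')
ds_xl = tfds.load('winogrande', split='train_xl')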
_BERT_CONFIG_FILE = '%s/bert_config.json' % MODEL_NAME
_TOKENIZER_VOCAB_FILE = '%s/vocab.txt' % MODEL_NAME
_INIT_CHECKPOINT = '%s/bert_model.ckpt' % MODEL_NAME
_TOKENIZER_DO_LOWER_CASE = False
_TOKENIZER_MAX_SEQ_LEN = 512
_VOCAB_FREQUENCY_FILE = 'vocab_frequencies'

# UNIVERSAL SENTENCE ENCODER PATHS AND OPTIONS
_TF_HUB_PATH = 'https://tfhub.dev/google/'
# As a note, this is a newer version of the universal sentence embeddings
# than the ones we initially conducted experiments with.
_UNIVERSAL_SENTENCE_ENCODER_PATH = (
    _TF_HUB_PATH + 'universal-sentence-encoder-large/3')

VALIDATION_2018 = tfds.Split('validation_2018')
VALIDATION_2016 = tfds.Split('validation_2016')
TEST_2016 = tfds.Split('test_2016')
TEST_2018 = tfds.Split('test_2018')


class EmbeddingType(enum.Enum):
  """Indicates what type of BERT-based embedding to use."""
  BERT_REDUCE_MEAN = 'bert_mean_emb'
  BERT_REDUCE_WEIGHTED_MEAN = 'bert_weighted_mean_emb'
  BERT_REDUCE_MIN_MAX = 'bert_min_max_emb'
  BERT_REDUCE_MIN_MAX_MEAN = 'bert_min_max_mean_emb'
  BERT_CLASS_TOKEN = 'bert_class_token'
  UNIVERSAL_SENTENCE = 'universal_sentence_emb'
  MT_SMALL = 'long_tacl_europarl_0602_090043'
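# Usage sketch (added, hypothetical wiring, not part of the original module):
# EmbeddingType is a plain enum, so configuration code can dispatch on it to
# pick a model path from the constants defined above.
def embedding_path(embedding_type):
  if embedding_type is EmbeddingType.UNIVERSAL_SENTENCE:
    return _UNIVERSAL_SENTENCE_ENCODER_PATH
  return _INIT_CHECKPOINT  # BERT-based variants load from the checkpoint.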
}
"""

_DESCRIPTION = """\
The STL-10 dataset is an image recognition dataset for developing unsupervised
feature learning, deep learning, self-taught learning algorithms. It is
inspired by the CIFAR-10 dataset but with some modifications. In particular,
each class has fewer labeled training examples than in CIFAR-10, but a very
large set of unlabeled examples is provided to learn image models prior to
supervised training. The primary challenge is to make use of the unlabeled
data (which comes from a similar but different distribution from the labeled
data) to build a useful prior. All images were acquired from labeled examples
on ImageNet.
"""

URL = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"

UNLABELLED = tfds.Split("unlabelled")


class Stl10(tfds.core.GeneratorBasedBuilder):
  """STL-10 dataset."""

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # tfds.features.FeatureConnectors
        features=tfds.features.FeaturesDict({
            "image": tfds.features.Image(shape=(96, 96, 3)),
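# Usage sketch (added for illustration): the module-level UNLABELLED constant
# above names STL-10's unlabeled images as their own split, loadable
# alongside the labeled data:
import tensorflow_datasets as tfds

train_ds = tfds.load("stl10", split="train")
unlabelled_ds = tfds.load("stl10", split="unlabelled")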