Example 1
  def _split_generators(self, dl_manager):
    arch_path = dl_manager.download(_DOWNLOAD_URL)
    archive = lambda: dl_manager.iter_archive(arch_path)

    # Generate the vocabulary from training data if a SubwordTextEncoder is configured.
    self.info.features["text"].maybe_build_from_corpus(
        self._vocab_text_gen(archive()))

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            num_shards=10,
            gen_kwargs={"archive": archive(),
                        "directory": os.path.join("aclImdb", "train")}),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            num_shards=10,
            gen_kwargs={"archive": archive(),
                        "directory": os.path.join("aclImdb", "test")}),
        tfds.core.SplitGenerator(
            name=tfds.Split("unsupervised"),
            num_shards=20,
            gen_kwargs={"archive": archive(),
                        "directory": os.path.join("aclImdb", "train"),
                        "labeled": False}),
    ]
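
Custom splits declared this way can be requested by name later, just like the built-in TRAIN and TEST splits. A minimal usage sketch (assuming this builder is registered as "imdb_reviews", which the aclImdb paths suggest):

import tensorflow_datasets as tfds

# Load the standard train split and the custom "unsupervised" split declared above.
ds_train = tfds.load("imdb_reviews", split="train")
ds_unsup = tfds.load("imdb_reviews", split="unsupervised")
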
Example 2
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        TOTAL: 2039 samples. 11 samples that could not be converted to 'mols'
        were removed from the original dataset.
        ALL: all 2039 samples.

        1) Random split (seed=123, ratio 8:2)
        TRAIN: random split of bbbp used for training (1631 samples)
        TEST: random split of bbbp used for testing (408 samples)

        2) Scaffold split
        scaffold_train: samples drawn from the largest scaffold sets (1631 samples)
        scaffold_valid: samples drawn from the largest remaining scaffold sets
            after the train set (204 samples)
        scaffold_test: the remaining samples, used as the test set (204 samples)
        scaffold: all 2039 samples, sorted by scaffold set size from largest
            to smallest
        """

        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={"file": dl_manager.download(_RAND_TRAIN)},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={"file": dl_manager.download(_RAND_TEST)},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split('scaffold_train'),
                gen_kwargs={"file": dl_manager.download(_SCAFF_TRAIN)},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split('scaffold_validation'),
                gen_kwargs={"file": dl_manager.download(_SCAFF_VALID)},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split('scaffold_test'),
                gen_kwargs={"file": dl_manager.download(_SCAFF_TEST)},
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split('scaffold'),
                gen_kwargs={"file": dl_manager.download(_SCAFF_TOTAL)},
            ),
        ]
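
The docstring above describes a random 8:2 split with seed 123. The pre-split files referenced by _RAND_TRAIN and _RAND_TEST are presumably produced upstream by something along these lines; this is an illustrative sketch, not the dataset's actual preprocessing code:

import numpy as np

def random_split(samples, seed=123, train_ratio=0.8):
    """Shuffles with a fixed seed and splits 8:2 into train/test."""
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(samples))
    cut = int(len(samples) * train_ratio)
    train = [samples[i] for i in order[:cut]]
    test = [samples[i] for i in order[cut:]]
    return train, test
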
Example 3
  def _split_generators(self, dl_manager):
    tar_gz_path = dl_manager.download(_SUN397_URL + "SUN397.tar.gz")
    if os.path.isdir(tar_gz_path):
      # While testing: download() returns the directory containing the test files.
      tar_gz_path = os.path.join(tar_gz_path, "SUN397.tar.gz")

    resource = tfds.download.Resource(
        path=tar_gz_path,
        extract_method=tfds.download.ExtractMethod.TAR_GZ_STREAM)
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split("full"),
            num_shards=20,  # size(shard) ~= 2GB.
            gen_kwargs=dict(archive=dl_manager.iter_archive(resource)))
    ]
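
For context, dl_manager.iter_archive yields (filename, file object) pairs as the tarball is streamed, so _generate_examples can consume it without extracting the whole archive first. A hypothetical consumer (the feature names and label heuristic are illustrative, not the real SUN397 code):

  def _generate_examples(self, archive):
    for fname, fobj in archive:
      if not fname.endswith(".jpg"):
        continue
      # e.g. "SUN397/a/abbey/sun_xxx.jpg" -> label "abbey" (illustrative).
      label = fname.split("/")[-2]
      yield fname, {"image": fobj, "label": label}
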
Example 4
    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""

        return {
            tfds.Split.TRAIN:
            self._generate_examples(os.path.join(_DATA_URL, _FILENAME_TRAIN),
                                    is_training=True),
            tfds.Split.VALIDATION:
            self._generate_examples(os.path.join(_DATA_URL, _FILENAME_VAL),
                                    is_training=False),
            tfds.Split.TEST:
            self._generate_examples(os.path.join(_DATA_URL, _FILENAME_TEST),
                                    is_training=False),
            tfds.Split('test2'):
            self._generate_examples(os.path.join(_DATA_URL, _FILENAME_TEST2),
                                    is_training=False),
        }
Example 5
 def _split_generators(self, dl_manager):
     del dl_manager
     return [
         tfds.core.SplitGenerator(
             tfds.Split.TRAIN,
             num_shards=10,
             gen_kwargs=dict(data=RANGE_TRAIN),
         ),
         tfds.core.SplitGenerator(
             tfds.Split.TEST,
             num_shards=2,
             gen_kwargs=dict(data=RANGE_TEST),
         ),
         tfds.core.SplitGenerator(
             tfds.Split("custom"),
             num_shards=2,
             gen_kwargs=dict(data=RANGE_VAL),
         ),
     ]
Example 6
 def _split_generators(self, dl_manager):
     del dl_manager
     return [
         tfds.core.SplitGenerator(
             tfds.Split.TRAIN,
             num_shards=self.num_shards_train,
             gen_kwargs=dict(data=self.range_train),
         ),
         tfds.core.SplitGenerator(
             tfds.Split.TEST,
             num_shards=self.num_shards_test,
             gen_kwargs=dict(data=self.range_test),
         ),
         tfds.core.SplitGenerator(
             tfds.Split("custom"),
             num_shards=self.num_shards_val,
             gen_kwargs=dict(data=self.range_val),
         ),
     ]
Example 7
    def test_split_merge_slice(self):

        # Merge, then slice (then merge)
        split = tfds.Split.TEST + tfds.Split.TRAIN
        split = split.subsplit(tfds.percent[30:40])
        split = split + tfds.Split("custom").subsplit(tfds.percent[:15])

        # The list is sorted, so the result is always deterministic.
        self.assertEqual(self._info(split), [
            splits.SlicedSplitInfo(
                split_info=tfds.core.SplitInfo(name="custom", num_shards=2),
                slice_value=slice(None, 15),
            ),
            splits.SlicedSplitInfo(
                split_info=tfds.core.SplitInfo(name="test", num_shards=2),
                slice_value=slice(30, 40),
            ),
            splits.SlicedSplitInfo(
                split_info=tfds.core.SplitInfo(name="train", num_shards=10),
                slice_value=slice(30, 40),
            ),
        ])
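
In more recent TFDS releases the subsplit API exercised by this test was superseded by split strings. The composed split above corresponds roughly to the following (a sketch with a placeholder dataset name, not tied to a specific TFDS version):

import tensorflow_datasets as tfds

ds = tfds.load("my_dataset",  # placeholder name
               split="test[30%:40%]+train[30%:40%]+custom[:15%]")
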
Example 8
    def _split_generators(self, dl_manager):

        train_path = os.path.join(dl_manager.manual_dir,
                                  'ILSVRC2012_img_train.tar')
        val_path = os.path.join(dl_manager.manual_dir,
                                'ILSVRC2012_img_val.tar')

        if not tf.io.gfile.exists(train_path) or not tf.io.gfile.exists(
                val_path):
            raise AssertionError(
                'ImageNet requires manual download of the data. Please download '
                'the train and val set and place them into: {}, {}'.format(
                    train_path, val_path))

        # Download and load subset file.
        subset_file = SUBSET2FILES[self.builder_config.name]
        # subset_file is a list only during testing, where the first entry is
        # 1shot.txt.
        if isinstance(subset_file, list):
            subset_file = subset_file[0]
        subset = set(subset_file.read_text().splitlines())

        # Get the file for tune split.
        tuneset = set(TUNE_FILE.read_text().splitlines())

        return {
            tfds.Split.TRAIN:
            self._generate_examples(
                archive=dl_manager.iter_archive(train_path), subset=subset),
            tfds.Split('tune'):
            self._generate_examples(
                archive=dl_manager.iter_archive(train_path), subset=tuneset),
            tfds.Split.VALIDATION:
            self._generate_examples(
                archive=dl_manager.iter_archive(val_path),
                validation_labels=imagenet.get_validation_labels(val_path)),
        }
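
Because this builder relies on manually downloaded ImageNet tarballs, dl_manager.manual_dir has to point at the directory that holds them. A minimal preparation sketch (the builder name and path below are placeholders):

import tensorflow_datasets as tfds

builder = tfds.builder("imagenet_subset")  # placeholder builder name
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        manual_dir="/path/to/imagenet/tars"))
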
Example 9
    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, 'winogrande_1.1')

        # Winogrande has different standardized training data sizes and reports
        # numbers for each of these data sizes, so make those available.
        data_sizes = ['xs', 's', 'm', 'l', 'xl']
        train_splits = []
        for size in data_sizes:
            train_splits.append(
                tfds.core.SplitGenerator(
                    name=tfds.Split('train_{}'.format(size)),
                    gen_kwargs={
                        'filepath':
                        os.path.join(data_dir, 'train_{}.jsonl'.format(size))
                    }))
        return train_splits + [
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs={'filepath': os.path.join(data_dir, 'test.jsonl')}),
            tfds.core.SplitGenerator(
                name=tfds.Split.VALIDATION,
                gen_kwargs={'filepath': os.path.join(data_dir, 'dev.jsonl')}),
        ]
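
After preparation, the dynamically generated train_{size} splits appear alongside test and validation in the builder's split info. A quick way to inspect them (assuming the builder is registered as "winogrande"):

import tensorflow_datasets as tfds

builder = tfds.builder("winogrande")
builder.download_and_prepare()
print(list(builder.info.splits))
# e.g. ['train_xs', 'train_s', 'train_m', 'train_l', 'train_xl', 'test', 'validation']
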
Example 10
_BERT_CONFIG_FILE = '%s/bert_config.json' % MODEL_NAME
_TOKENIZER_VOCAB_FILE = '%s/vocab.txt' % MODEL_NAME
_INIT_CHECKPOINT = '%s/bert_model.ckpt' % MODEL_NAME

_TOKENIZER_DO_LOWER_CASE = False
_TOKENIZER_MAX_SEQ_LEN = 512
_VOCAB_FREQUENCY_FILE = 'vocab_frequencies'

# UNIVERSAL SENTENCE ENCODER PATHS AND OPTIONS
_TF_HUB_PATH = 'https://tfhub.dev/google/'
# Note: this is a newer version of the universal sentence embeddings than
# the one we initially ran experiments with.
_UNIVERSAL_SENTENCE_ENCODER_PATH = (_TF_HUB_PATH +
                                    'universal-sentence-encoder-large/3')

VALIDATION_2018 = tfds.Split('validation_2018')
VALIDATION_2016 = tfds.Split('validation_2016')
TEST_2016 = tfds.Split('test_2016')
TEST_2018 = tfds.Split('test_2018')


class EmbeddingType(enum.Enum):
    """Indicates what type of BERT-based embedding to use."""
    BERT_REDUCE_MEAN = 'bert_mean_emb'
    BERT_REDUCE_WEIGHTED_MEAN = 'bert_weighted_mean_emb'
    BERT_REDUCE_MIN_MAX = 'bert_min_max_emb'
    BERT_REDUCE_MIN_MAX_MEAN = 'bert_min_max_mean_emb'
    BERT_CLASS_TOKEN = 'bert_class_token'
    UNIVERSAL_SENTENCE = 'universal_sentence_emb'
    MT_SMALL = 'long_tacl_europarl_0602_090043'
Example 11
}
"""

_DESCRIPTION = """\
The STL-10 dataset is an image recognition dataset for developing unsupervised
feature learning, deep learning, self-taught learning algorithms. It is inspired
by the CIFAR-10 dataset but with some modifications. In particular, each class
has fewer labeled training examples than in CIFAR-10, but a very large set of 
unlabeled examples is provided to learn image models prior to supervised
training. The primary challenge is to make use of the unlabeled data (which
comes from a similar but different distribution from the labeled data) to build
a useful prior. All images were acquired from labeled examples on ImageNet.
"""

URL = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
UNLABELLED = tfds.Split("unlabelled")


class Stl10(tfds.core.GeneratorBasedBuilder):
  """STL-10 dataset."""

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # tfds.features.FeatureConnectors
        features=tfds.features.FeaturesDict({
            "image": tfds.features.Image(shape=(96, 96, 3)),