Example 1
def make_module_to_builder_dict(datasets=None):
    """Get all builders organized by module in nested dicts."""
    # pylint: disable=g-long-lambda
    # dict to hold tfds->image->mnist->[builders]
    module_to_builder = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(list)))
    # pylint: enable=g-long-lambda

    if not datasets:
        datasets = [
            name for name in tfds.list_builders()
            if name not in BUILDER_BLACKLIST
        ]
    print("Creating the vanilla builders for %s datasets..." % len(datasets))
    with futures.ThreadPoolExecutor(
            max_workers=WORKER_COUNT_DATASETS) as tpool:
        builders = tpool.map(tfds.builder, datasets)
    print("Vanilla builders built, constructing module_to_builder dict...")

    for builder in builders:
        module_name = builder.__class__.__module__
        modules = module_name.split(".")
        if "testing" in modules:
            continue

        current_mod_ctr = module_to_builder
        for mod in modules:
            current_mod_ctr = current_mod_ctr[mod]
        current_mod_ctr.append(builder)

    module_to_builder = module_to_builder["tensorflow_datasets"]
    return module_to_builder
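
A minimal sketch of how the same grouping idea can be exercised on a couple of builders (the `tfds` calls are real; the two dataset names and the flat one-level dict are illustrative only):

import collections

import tensorflow_datasets as tfds

# Group a couple of builders by their category, mirroring the nested-dict idea.
category_to_builders = collections.defaultdict(list)
for name in ["mnist", "cifar10"]:  # illustrative subset of tfds.list_builders()
    builder = tfds.builder(name)
    # e.g. "tensorflow_datasets.image_classification.mnist"
    _, category, *_ = type(builder).__module__.split(".")
    category_to_builders[category].append(builder)

for category, builders in category_to_builders.items():
    print(category, "->", [b.name for b in builders])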
Example 2
def refactor_datasets() -> None:
  """Refactor all datasets into one folder."""
  # Fall back to all registered (non-community) builders when --datasets is not
  # set; `''.split(',')` would otherwise yield [''] and never reach the fallback.
  datasets = (
      FLAGS.datasets.split(',')
      if FLAGS.datasets
      else tfds.list_builders(with_community_datasets=False)
  )
  for ds_name in datasets:
    refactor_dataset(ds_name)
Example 3
    def __init__(self,
                 name,
                 data_dir,
                 image_size,
                 download=False,
                 num_max_boxes=None,
                 *args,
                 **kwargs):
        super().__init__(
            *args,
            **kwargs,
        )

        if name in tfds.list_builders():
            self._builder = tfds.builder(name, data_dir=data_dir)
            if download:
                self._builder.download_and_prepare()
        else:
            if not tf.io.gfile.exists(os.path.join(data_dir, name)):
                raise ValueError(
                    "Dataset directory does not exist: {}\n"
                    "Please run `python blueoil/cmd/build_tfds.py -c <config file>` before training."
                    .format(os.path.join(data_dir, name)))

            self._builder = self.builder_class(name, data_dir=data_dir)

        self.info = self._builder.info
        self._init_available_splits()
        self._validate_feature_structure()

        self.tf_dataset = self._builder.as_dataset(
            split=self.available_splits[self.subset])
        self._image_size = image_size
        self._num_max_boxes = num_max_boxes
        self._format_dataset()
Example 4
def main(_):
  # Legacy datasets
  urls = set(tfds.core.download.checksums.get_all_url_infos().keys())

  # Dataset-as-folder datasets
  # Could keep track of the dataset name, so the report clearly indicates which
  # dataset should be updated.
  url_infos = {
      name: tfds.builder_cls(name).url_infos
      for name in tfds.list_builders(with_community_datasets=False)
  }
  for url_info in url_infos.values():
    if url_info:
      urls |= url_info.keys()

  urls = sorted(urls)

  with futures.ThreadPoolExecutor(max_workers=100) as executor:
    all_codes = executor.map(_get_status_code, urls)

  print('\n************ Summary ************\n')
  total_errors = 0
  for url, code in zip(urls, all_codes):
    if code == requests.codes.ok:
      continue
    total_errors += 1
    print(f'{url} - status code: {code}')
  print(f'{total_errors} URLs had issues')
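
`_get_status_code` is referenced above but not shown. A minimal sketch of what such a helper might look like, using `requests` (the helper name comes from the snippet; the HEAD request and the 10-second timeout are assumptions):

import requests


def _get_status_code(url: str) -> int:
    # Hypothetical helper: returns the HTTP status code for `url`.
    # A HEAD request keeps the check cheap; network failures are mapped to 0
    # so the caller reports them alongside non-200 responses.
    try:
        return requests.head(url, allow_redirects=True, timeout=10).status_code
    except requests.RequestException:
        return 0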
Example 5
    def build(self, split=None):
        if self.glob_path in tfds.list_builders():
            return tfds.load(
                name=self.glob_path,
                split=split,
                with_info=True,
                as_supervised=True,
                try_gcs=tfds.is_dataset_on_gcs(self.glob_path))
        files = tf.io.gfile.glob(self.glob_path)
        if len(files) == 0:
            raise ValueError('No file found')
        try:
            num = reduce(
                lambda x, y: x + y,
                map(lambda file: self._get_num_from_name(file), files))
        except Exception:
            raise ValueError(
                'Please format file name like <name>_<number>.<extension>')
        else:
            tfrecords = list(
                filter(lambda file: file.endswith('.tfrecords'), files))
            txts = list(filter(lambda file: file.endswith('.txt'), files))
            if len(tfrecords) > 0:
                tfrecords_dataset = self._dataset_internal(
                    tfrecords, tf.data.TFRecordDataset, self.parse_tfrecord)
            if len(txts) > 0:
                txts_dataset = self._dataset_internal(
                    txts, tf.data.TextLineDataset, self.parse_text)
            if len(tfrecords) > 0 and len(txts) > 0:
                return tfrecords_dataset.concatenate(txts_dataset), num
            elif len(tfrecords) > 0:
                return tfrecords_dataset, num
            elif len(txts) > 0:
                return txts_dataset, num
Example 6
def main(_):
    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    datasets_to_build = set(FLAGS.datasets and FLAGS.datasets.split(",")
                            or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir)
        for name in datasets_to_build
    }

    for name, builder in builders.items():
        if builder.BUILDER_CONFIGS and "/" not in name:
            # If builder has multiple configs, and no particular config was
            # requested, then compute all.
            for config in builder.BUILDER_CONFIGS:
                builder_for_config = tfds.builder(builder.name,
                                                  data_dir=FLAGS.data_dir,
                                                  config=config)
                download_and_prepare(builder_for_config)
        else:
            # If there is a slash in the name, then user requested a specific
            # dataset configuration.
            download_and_prepare(builder)
Example 7
def make_category_to_builders_dict(
    datasets: Optional[List[str]] = None,
) -> Dict[str, List[tfds.core.DatasetBuilder]]:
    """Returns the `Dict[dataset_type, List[Builder]]`."""
    if not datasets:
        datasets = [
            name for name in tfds.list_builders()
            if name not in BUILDER_BLACKLIST
        ]
    print('Creating the vanilla builders for %s datasets...' % len(datasets))
    with futures.ThreadPoolExecutor(
            max_workers=WORKER_COUNT_DATASETS) as tpool:
        builders = tpool.map(tfds.builder, datasets)
    print('Vanilla builders built, constructing module_to_builder dict...')

    # Dict[dataset_type, List[Builder]]
    category_to_builders = collections.defaultdict(list)

    for builder in builders:
        module = type(builder).__module__
        if not module.startswith('tensorflow_datasets.'):
            raise AssertionError(f'Unexpected builder {type(builder)}: {module}')

        module_parts = module.split('.')

        if 'testing' in module_parts:
            continue
        _, category, *_ = module_parts  # tfds.<category>.xyz

        category_to_builders[category].append(builder)
    return category_to_builders
Example 8
def make_module_to_builder_dict():
    """Get all builders organized by module in nested dicts."""
    # pylint: disable=g-long-lambda
    # dict to hold tfds->image->mnist->[builders]
    module_to_builder = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(list)))
    # pylint: enable=g-long-lambda

    builders = [
        tfds.builder(name)
        for name in tfds.list_builders() if name not in BUILDER_BLACKLIST
    ] + [
        tfds.builder("image_label_folder", dataset_name="image_label_folder")
    ]

    for builder in builders:
        mod_name = builder.__class__.__module__
        modules = mod_name.split(".")

        current_mod_ctr = module_to_builder
        for mod in modules:
            current_mod_ctr = current_mod_ctr[mod]
        current_mod_ctr.append(builder)

    module_to_builder = module_to_builder["tensorflow_datasets"]
    return module_to_builder
Example 9
def main(_):
    if FLAGS.module_import:
        import_modules(FLAGS.module_import)

    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    if FLAGS.disable_tqdm:
        logging.info("Disabling tqdm.")
        tfds.disable_progress_bar()

    datasets_to_build = set(FLAGS.datasets and FLAGS.datasets.split(",")
                            or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    version = "experimental_latest" if FLAGS.experimental_latest_version else None
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    logging.info('Version: "%s"', version)
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir, version=version)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset"
            )
        builder = builders[list(builders.keys())[0]]
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs"
            )
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        logging.info("Running download_and_prepare for config: %s",
                     config.name)
        builder_for_config = tfds.builder(builder.name,
                                          data_dir=FLAGS.data_dir,
                                          config=config,
                                          version=version)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(builder.name,
                                                      data_dir=FLAGS.data_dir,
                                                      config=config,
                                                      version=version)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
Example 10
    def test_list_builder(self):
        test_datasets = {
            tfds.testing.DummyMnist.name,
            tfds.testing.DummyDatasetSharedGenerator.name,
        }
        registered_datasets = set(tfds.list_builders())
        # The test datasets should not be present in the registered datasets.
        self.assertEmpty(test_datasets & registered_datasets)
Example 11
def test_exclude_datasets():
    # Exclude all datasets except 2
    all_ds = [b for b in tfds.list_builders() if b not in ('mnist', 'cifar10')]
    all_ds_str = ','.join(all_ds)

    dl_and_prepare = _build(f'--exclude_datasets {all_ds_str}')
    assert dl_and_prepare.call_count == 2

    with pytest.raises(ValueError, match='--exclude_datasets can\'t be used'):
        dl_and_prepare = _build('mnist --exclude_datasets cifar10')
Example 12
def main():
    print("Demonstration for using Imagenet2012 dataset with tensorflow datset")
    # List all the datasets provided in the tensorflow_datasets
    # print(tfds.list_builders())
    # Step 1: get a dataset builder for the required dataset
    dataset_name = "imagenet2012"
    if dataset_name in tfds.list_builders():
        imagenet_dataset_builder = tfds.builder(dataset_name)
        print("retrived " + dataset_name + " builder")
    else:
        return
    # get all the information regarding dataset
    print(imagenet_dataset_builder.info)
    print("Image shape", imagenet_dataset_builder.info.features['image'].shape)
    print("class",imagenet_dataset_builder.info.features['label'].num_classes)
    print("classname",imagenet_dataset_builder.info.features['label'].names)
    print("NrTrain",imagenet_dataset_builder.info.splits['train'].num_examples)
    print("Val",imagenet_dataset_builder.info.splits['validation'].num_examples)
    # Download and prepare the dataset internally
    # The dataset should be downloaded to ~/tensorflow-datasets/download
    # but for Imagenet case, we need to manually download the dataset and
    # specify the manual_dir where the downloaded files are kept.
    manual_dataset_dir = "/data/datasets"
    # The download_and_prepare function will assume that two files namely
    # ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are present in
    # directory manual_dataset_dir + "/manual/imagenet2012"
    imagenet_download_config = tfds.download.DownloadConfig(
                                                manual_dir = manual_dataset_dir)
    # Optionally, the download config can be passed as a second argument.
    imagenet_dataset_builder.download_and_prepare(
                                    download_dir = manual_dataset_dir)
    # Once this is complete (it only pre-processes, without downloading anything),
    # it will create a directory "~/tensorflow_datasets/imagenet2012/2.0.0"
    # containing 1000 train tfrecords and 5 validation tfrecords, in addition to
    # some bookkeeping json and label txt files.

    # now, we get the tf.data.Dataset structure which tensorflow data-pipeline
    # understands and process in tf graph.
    imagenet_train = imagenet_dataset_builder.as_dataset(split=tfds.Split.TRAIN)
    assert isinstance(imagenet_train, tf.data.Dataset)
    imagenet_validation = imagenet_dataset_builder.as_dataset(
                                                    split=tfds.Split.VALIDATION)
    assert isinstance(imagenet_validation, tf.data.Dataset)

    # Now we can peek into the sample images present in the dataset with take
    (imagenet_example,) = imagenet_train.take(1) # returns a dictionary
    img, label = imagenet_example["image"], imagenet_example["label"]
    # img and label are constant tensors, with a numpy field containing the numpy array
    print("Image_shape", img.numpy().shape)
    print("Label_shape", label.numpy().shape)
    # print out the image file on the disk, and print the corresponding label
    imsave("image.png", img.numpy())
    print("label", label.numpy())
Example 13
class TFDS():
  """Download and process datasets from tensorflow dataset"""
  AVAILABLE_DATASETS = tfds.list_builders()

  def __init__(self, name:str, seed=1234, data_dir:Optional[str]=None) -> None:

    (self.train_ds, self.val_ds, self.test_ds), self.metadata = tfds.load(
      name=name,
      split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
      with_info=True,
      as_supervised=True,
      data_dir=data_dir)

    self.total_imgs = self.metadata.splits['train'].num_examples
    self.num_trainImgs = int(self.total_imgs * 0.8)
    self.num_valImgs = int(self.total_imgs * 0.1)
    self.num_testImgs = int(self.total_imgs * 0.1)
    self.img_shape = next(iter(self.train_ds))[0].shape
    self.get_label_name = self.metadata.features['label'].int2str
    self.seed=seed
    self.Preprocess = Preprocess(seed=self.seed)

  def get_dataset(self, num_epochs:int=300, batch_size:int=32,
                  input_shape:Iterable[int]=(32, 32, 3), seed=None):
    """[Generate train, val, test sets from tfds]

      Args:
          batch_size (int, optional): [number of images for each batch]. Defaults to 32.
          num_epochs (int, required): [number of epochs to run on experiment]. Defaults to 300.
          input_shape (tuple, optional): [shape of each image (h, w, c)]. Defaults to (32, 32, 3).
          seed ([type], optional): []. Defaults to None.

      Returns:
          [tuple]: [train, val, test sets]
    """ 
    train_prepare_data_fn = functools.partial(self.Preprocess.preprocess, input_shape=input_shape)
    test_prepare_data_fn = functools.partial(self.Preprocess.preprocess, augment=False, input_shape=input_shape)
    train_ds = (self.train_ds
                     .repeat(num_epochs)
                     .shuffle(10000, seed=seed)
                     .map(train_prepare_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                     .batch(batch_size)
                     .prefetch(tf.data.experimental.AUTOTUNE)
    )
    val_ds =  (self.val_ds
                     .repeat(num_epochs)
                     .map(test_prepare_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                     .batch(batch_size)
                     .prefetch(tf.data.experimental.AUTOTUNE)
    )
    test_ds = self.test_ds.map(test_prepare_data_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return (train_ds, val_ds, test_ds)
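
The `Preprocess` helper used above is not part of the snippet. A minimal hypothetical sketch that is compatible with how it is called through `functools.partial` (resize plus an optional random flip; the exact augmentations of the original class are unknown):

import tensorflow as tf


class Preprocess:
    """Hypothetical stand-in for the `Preprocess` helper referenced above."""

    def __init__(self, seed=None):
        self.seed = seed

    def preprocess(self, image, label, input_shape=(32, 32, 3), augment=True):
        # Resize to the requested spatial size and scale pixels to [0, 1].
        image = tf.image.resize(image, input_shape[:2])
        image = tf.cast(image, tf.float32) / 255.0
        if augment:
            image = tf.image.random_flip_left_right(image, seed=self.seed)
        return image, label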
Example 14
def test_exclude_datasets():
    # Exclude all datasets except 2
    all_ds = [b for b in tfds.list_builders() if b not in ('mnist', 'cifar10')]
    all_ds_str = ','.join(all_ds)

    assert _build(f'--exclude_datasets {all_ds_str}') == [
        'cifar10',
        'mnist',
    ]

    with pytest.raises(ValueError, match='--exclude_datasets can\'t be used'):
        _build('mnist --exclude_datasets cifar10')
Example 15
def get_tfds(dtype: str,
             data_dir: str = None,
             x_name="image",
             y_name="label",
             is_verbose=True,
             **kwargs):
    name = dtype_to_name(dtype)
    assert name in tfds.list_builders()

    data_dir = data_dir or os.path.join("~", "tfds", "{}_data".format(
        name.upper()))  # e.g. ~/tfds/MNIST_data

    # https://www.tensorflow.org/datasets/datasets
    loaded, info = tfds.load(
        name=name,
        split=["train", "test"],
        data_dir=data_dir,
        batch_size=-1,
        with_info=True,
    )
    if is_verbose:
        print(info)

    # Get numpy matrix
    train_and_validation, test = tfds.as_numpy(loaded)

    # Preprocess & Reshape
    train_and_validation_x, test_x = preprocess_xs(
        name,
        [train_and_validation[x_name], test[x_name]],
        **kwargs,
    )

    # Training Validation Separation
    # this is necessary because tfds does not support validation separation.
    train_num, val_num = name_to_train_and_val_num(name)
    train_x = train_and_validation_x[:train_num]
    val_x = train_and_validation_x[-val_num:]

    # One hot labeling
    to_one_hot = get_to_one_hot(info.features[y_name].num_classes)
    data_label = DataLabel(
        train_labels=to_one_hot(train_and_validation[y_name][:train_num]),
        validation_labels=to_one_hot(train_and_validation[y_name][-val_num:]),
        test_labels=to_one_hot(test[y_name]),
        label_type=LabelType.ONE_LABELS_TO_ALL_TASK,
    )
    return data_label, train_x, val_x, test_x
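
Helpers such as `get_to_one_hot` are not included in the snippet. A minimal hypothetical sketch consistent with how it is called (a factory returning a vectorized one-hot encoder):

import numpy as np


def get_to_one_hot(num_classes: int):
    # Hypothetical helper: returns a function mapping an int label array of
    # shape (n,) to a one-hot float32 array of shape (n, num_classes).
    def to_one_hot(labels):
        return np.eye(num_classes, dtype=np.float32)[labels]
    return to_one_hot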
Example 16
def _build_datasets(args: argparse.Namespace) -> None:
  """Build the given datasets."""
  # Select datasets to generate
  datasets = (args.datasets or []) + (args.datasets_keyword or [])
  if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
    if datasets:
      raise ValueError('--exclude_datasets can\'t be used with `datasets`')
    datasets = set(tfds.list_builders()) - set(args.exclude_datasets.split(','))
  else:
    datasets = datasets or ['']  # Empty string for default

  # Generate all datasets sequentially
  for ds_to_build in datasets:
    # Each `str` may correspond to multiple builders (e.g. multiple configs)
    for builder in _make_builders(args, ds_to_build):
      _download_and_prepare(args, builder)
Example 17
def main(_):
    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    datasets_to_build = set(FLAGS.datasets and FLAGS.datasets.split(",")
                            or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))
    logging.info("Running download_and_prepare for datasets:\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset"
            )
        (builder,) = builders.values()
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs"
            )
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        builder_for_config = tfds.builder(builder.name,
                                          data_dir=FLAGS.data_dir,
                                          config=config)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(builder.name,
                                                      data_dir=FLAGS.data_dir,
                                                      config=config)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
Example 18
def _collect_path_to_url_infos(
) -> Dict[tfds.core.ReadWritePath, Dict[Url, checksums.UrlInfo]]:
    """Collect checksums paths to url_infos."""
    # Collect legacy checksums paths
    url_info_paths = list(checksums._checksum_paths().values())  # pylint: disable=protected-access

    # Collect dataset-as-folder checksums path
    for name in tfds.list_builders():
        url_info_path = tfds.builder_cls(name)._checksums_path  # pylint: disable=protected-access
        if url_info_path.exists():
            url_info_paths.append(url_info_path)

    url_info_paths = [tfds.core.utils.to_write_path(p) for p in url_info_paths]
    return {
        path: typing.cast(Dict[Url, checksums.UrlInfo],
                          checksums.load_url_infos(path))
        for path in url_info_paths
    }
Example 19
    def test_glue_load(self):
        hparams = Hparams()
        hparams.load_from_config_file("../configs/qa/dureader_yesno.yml")
        hparams.stand_by()
        checksum_dir = "../aispace/datasets/url_checksums"
        tfds.download.add_checksums_dir(checksum_dir)
        download_config = DownloadConfig(register_checksums=True)
        print(tfds.list_builders())
        dureader = tfds.load(
            "dureader/yesno",
            # data_dir="/search/data1/yyk/data/datasets/glue_zh",
            data_dir="../data/dureader",
            builder_kwargs={'hparams': hparams},
            download_and_prepare_kwargs={'download_config': download_config})
        for itm in dureader['train']:
            print(itm)
            break
        print()

        # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
        # test_dataset = next(load_dataset(hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True))[0]

        # total, zero = 0, 0
        # for itm in tqdm(test_dataset):
        # tt = itm[0]['input_ids'].numpy().tolist()
        # print(itm[0]['p_mask'].numpy().tolist())
        # print(itm[0]['start_position'].numpy().tolist())
        # print(itm[0]['end_position'].numpy().tolist())
        # print(tt)
        # break
        # total += 1
        # zero += len([t for t in tt if t == 0])
        # print()
        # print(f"{zero}, {total}, {zero / float(total)}")
        # print(total)


# python -u aispace/trainer.py \
#    --experiment_name test \
#    --model_name bert_for_classification \
#    --schedule train_and_eval \
#    --config_name tnews \
#    --config_dir ./configs/glue_zh \
#    --gpus 0 1 2 3
Example 20
def _build_datasets(args: argparse.Namespace) -> None:
    """Build the given datasets."""
    # Optionally register additional dataset imports
    if args.imports:
        list(importlib.import_module(m) for m in args.imports.split(','))

    # Select datasets to generate
    datasets = (args.datasets or []) + (args.datasets_keyword or [])
    if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
        if datasets:
            raise ValueError(
                '--exclude_datasets can\'t be used with `datasets`')
        datasets = (set(tfds.list_builders(with_community_datasets=False)) -
                    set(args.exclude_datasets.split(',')))
        datasets = sorted(datasets)  # `set` is not deterministic
    else:
        datasets = datasets or ['']  # Empty string for default

    # Generate all datasets sequentially
    for ds_to_build in datasets:
        # Each `str` may correspond to multiple builders (e.g. multiple configs)
        for builder in _make_builders(args, ds_to_build):
            _download_and_prepare(args, builder)
Example 21
def main():
    pp = pprint.PrettyPrinter(indent=4)

    print("### looking up available datasets ###")
    builders: List[str] = tfds.list_builders()
    pp.pprint(builders)

    print("\n### the first 5 elements of train_ex ###")
    pipeline = SetupInputPipeline()
    pipeline.init_ted_hrlr()
    train_ex = pipeline.train_ex
    train_ex_numpy = ((pt.numpy(), en.numpy())
                      for (pt, en) in train_ex)  # generator comprehension
    pp.pprint(list(itertools.islice(train_ex_numpy, 5)))

    print("\n### testing the english tokenizer ###")
    pipeline.init_subwords_tokenizer()
    tokenizer_en = pipeline.tokenizer_en
    sample_string = "Transformer is awesome"
    tokenized_string = tokenizer_en.encode(
        s=sample_string)  # this won't terminate?
    print("Sample string: {}\nEncoded string: {}".format(
        sample_string, tokenized_string))

    print(
        "\n### if a word does not exist in vocab, it is tokenized into subwords ###"
    )
    for ts in tokenized_string:
        print('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

    print("\n### the first 5 elements of the preprocessed training set ###")
    preproc_train_ex = pipeline.preproc_train_ex()
    preproc_train_ex_numpy = ((pt.numpy(), en.numpy())
                              for (pt, en) in preproc_train_ex)
    # have a look at the first batch (64 instances)
    pp.pprint(next(iter(preproc_train_ex_numpy)))
Example 22
import collections
import os
import sys

from absl import app
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow_datasets.core.utils import py_utils

BASE_URL = "https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets"

# ImageLabelFolder requires an extra constructor arg, so it is handled separately
# TODO(tfds): Document the manual_dir datasets in a separate section
BUILDER_BLACKLIST = ["image_label_folder"]

DOC = """\
<!-- auto-generated by tfds.scripts.document_datasets -->
# Datasets

## Usage

```
# See all registered datasets
tfds.list_builders()

# Load a given dataset by name, along with the DatasetInfo
data, info = tfds.load("mnist", with_info=True)
train_data, test_data = data['train'], data['test']
assert isinstance(train_data, tf.data.Dataset)
assert info.features['label'].num_classes == 10
assert info.splits['train'].num_examples == 60000
Example 23
def main(_):
    if FLAGS.module_import:
        import_modules(FLAGS.module_import)

    if FLAGS.debug_start:
        pdb.set_trace()
    if FLAGS.sleep_start:
        time.sleep(60 * 60 * 3)

    if FLAGS.disable_tqdm:
        logging.info("Disabling tqdm.")
        tfds.disable_progress_bar()

    if FLAGS.checksums_dir:
        tfds.download.add_checksums_dir(FLAGS.checksums_dir)

    datasets_to_build = set(FLAGS.datasets and FLAGS.datasets.split(",")
                            or tfds.list_builders())
    datasets_to_build -= set(FLAGS.exclude_datasets.split(","))

    # Only pass the version kwargs when required. Otherwise, `version=None`
    # overwrites the version parsed from the name.
    # `tfds.builder('my_dataset:1.2.0', version=None)`
    if FLAGS.experimental_latest_version:
        version_kwarg = {"version": "experimental_latest"}
    else:
        version_kwarg = {}

    logging.info("Running download_and_prepare for dataset(s):\n%s",
                 "\n".join(datasets_to_build))
    builders = {
        name: tfds.builder(name, data_dir=FLAGS.data_dir, **version_kwarg)
        for name in datasets_to_build
    }

    if FLAGS.builder_config_id is not None:
        # Requesting a single config of a single dataset
        if len(builders) > 1:
            raise ValueError(
                "--builder_config_id can only be used when building a single dataset"
            )
        builder = builders[list(builders.keys())[0]]
        if not builder.BUILDER_CONFIGS:
            raise ValueError(
                "--builder_config_id can only be used with datasets with configs"
            )
        config = builder.BUILDER_CONFIGS[FLAGS.builder_config_id]
        logging.info("Running download_and_prepare for config: %s",
                     config.name)
        builder_for_config = tfds.builder(builder.name,
                                          data_dir=FLAGS.data_dir,
                                          config=config,
                                          **version_kwarg)
        download_and_prepare(builder_for_config)
    else:
        for name, builder in builders.items():
            if builder.BUILDER_CONFIGS and "/" not in name:
                # If builder has multiple configs, and no particular config was
                # requested, then compute all.
                for config in builder.BUILDER_CONFIGS:
                    builder_for_config = tfds.builder(builder.name,
                                                      data_dir=FLAGS.data_dir,
                                                      config=config,
                                                      **version_kwarg)
                    download_and_prepare(builder_for_config)
            else:
                # If there is a slash in the name, then user requested a specific
                # dataset configuration.
                download_and_prepare(builder)
Example 24
from __future__ import division
from __future__ import print_function

import os
import pdb
import time

from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow_datasets as tfds
import termcolor

FLAGS = flags.FLAGS
BUILDERS = ",".join(tfds.list_builders())

DEFAULT_DATA_DIR = os.path.expanduser(os.path.join("~", "tensorflow_datasets"))

flags.DEFINE_string(
    "datasets", BUILDERS,
    "Comma separated list of datasets to build, defaults to all"
    "registered builders.")
flags.DEFINE_string(
    "exclude_datasets", "", "Comma separated list of datasets to exclude,"
    "(no download, no prepare).")

flags.DEFINE_string("data_dir", DEFAULT_DATA_DIR, "Were to place the data.")
flags.DEFINE_string("download_dir", None, "Where to place downloads.")
flags.DEFINE_string("extract_dir", None, "Where to extract files.")
flags.DEFINE_string(
Example 25
def _all_tfds_datasets() -> List[str]:
    """Returns all "official" TFDS dataset names."""
    return sorted([
        name for name in tfds.list_builders(with_community_datasets=True)  # pylint: disable=g-complex-comprehension
        if name not in _BUILDER_BLACKLIST
    ])
Example 26

# %%

import tensorflow as tf
import tensorflow_datasets as tfds

import IPython.display as display

# Here we assume Eager mode is enabled (TF2), but tfds also works in Graph mode.

print(tf.__version__)
# %%

# See available datasets
print(tfds.list_builders())

# %%

# Construct a tf.data.Dataset
#ds_train = tfds.load(name="mnist", split="train", shuffle_files=True)

# Build your input pipeline
# %%
ds_train = tfds.load(name="coco/2017", split="train", shuffle_files=True)
# dataset = (
#     ds_train
#     .shuffle(1000)
#     .batch(128)
#     .prefetch(10)
# )
Example 27
# coding=utf-8
# created by msg on 2019/12/4 2:53 PM

import tensorflow as tf
import tensorflow_datasets as tfds

for data in tfds.list_builders():
    print(data)
    try:
        t = tfds.load(data)
    except Exception:
        continue
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
Example 28
#!/usr/bin/env python
# coding: utf-8

import tensorflow as tf
import tensorflow_datasets as tfds

# For brevity I decided to reuse this small image dataset that was available with tfds
# For an example of a more extensive raw data preprocessing I point to my recent project
# here: https://github.com/dsalaj/common-voice-tf
IMGDS = 'aflw2k3d'
assert IMGDS in tfds.list_builders(
), IMGDS + ' dataset not found in tfds! This was tested with tensorflow-datasets-2.1.0'
ds_builder = tfds.builder(IMGDS)
ds_builder.download_and_prepare()
ds_raw = ds_builder.as_dataset(split='train')

# import matplotlib.pyplot as plt
# # get_ipython().run_line_magic('matplotlib', 'inline')
_, ds_info = tfds.load(IMGDS, with_info=True)
num_examples = ds_info.splits['train'].num_examples
# # Plot samples from the dataset
# fig = tfds.show_examples(ds_info, ds_raw)


def extract_images(features):
    return features['image']  # (450, 450, 3)


ds = ds_raw.map(extract_images).shuffle(num_examples)

Example 29
def refactor_datasets() -> None:
  """Refactoring all dataset into one folder."""
  for ds_name in tfds.list_builders():
    refactor_dataset(ds_name)
Example 30
def in_tfds(dataset_name: str):
    all_datasets = tfds.list_builders()
    name_without_params = dataset_name.split("/")[0]
    return name_without_params in all_datasets
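
A quick usage sketch for `in_tfds` (assuming a standard TFDS install, where `mnist` is a registered builder):

assert in_tfds("mnist")
assert in_tfds("mnist/3.0.1")        # anything after "/" is stripped before the lookup
assert not in_tfds("not_a_real_dataset")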