def _get_process_split(split, process_index, process_count, drop_remainder):
    """Select one process's share of `split` in a multi-process setup.

    Args:
      split: split specification forwarded to `tfds.even_splits`.
      process_index: index of the calling process; used to pick one entry
        from the list returned by `tfds.even_splits`.
      process_count: number of sub-splits requested (passed as `n`).
      drop_remainder: forwarded unchanged to `tfds.even_splits`.

    Returns:
      The sub-split found at position `process_index`.
    """
    return tfds.even_splits(
        split, n=process_count, drop_remainder=drop_remainder)[process_index]
Esempio n. 2
0
    def make_source_dataset(self, current_host_index, num_hosts):
        """Loads the un-shuffled source dataset of image/label dictionaries.

        When the model is in TRAIN mode and `self.shard_per_host` is set, only
        the even sub-split belonging to `current_host_index` is loaded;
        otherwise the full `self.split` is used.

        Args:
          current_host_index: current host index.
          num_hosts: total number of hosts.

        Returns:
          A `tf.data.Dataset` object where each dataset element is a
          dictionary. For image classification datasets, the dictionary will
          contain an 'image' key with a decoded uint8 image (of shape
          [height, width, channels]) and a 'label' key with an int64 label.
          For video classification datasets, the dictionary will contain a
          'video' key with a decoded uint8 video (of shape
          [frames, height, width, channels]) and a 'label' key with an int64
          label.
        """
        requested_split = self.split
        shard_for_host = (self.mode == enums.ModelMode.TRAIN
                          and self.shard_per_host)
        if shard_for_host:
            per_host_splits = tfds.even_splits(requested_split, n=num_hosts)
            requested_split = per_host_splits[current_host_index]

        # File shuffling stays off here: shuffling must happen only after
        # sharding, otherwise each core shards a differently shuffled view of
        # the data and samples can be dropped.
        load_kwargs = dict(name=self.dataset_name,
                           split=requested_split,
                           data_dir=self.data_dir,
                           shuffle_files=False)
        return tfds.load(**load_kwargs)
Esempio n. 3
0
def main(_):
    """Runs a Beam pipeline that encodes COCO captions into TFRecords.

    For every split named in `FLAGS.splits`, the matching tfds sub-splits are
    read, captions are encoded with the configured TF-Hub BERT encoder,
    converted to `tf.train.Example` protos, reshuffled, and written under
    `FLAGS.output_dir` with the split name as the file prefix.

    Args:
      _: unused positional argument.
    """
    log('Loading "coco_captions" from tfds. This will build a copy of '
        'the dataset under TFDS_DATA_DIR if one doesn\'t already exist.')
    tfds.load('coco_captions')

    # Milan split name => constituent tfds subsplits. Sub-splitting lets the
    # tfds data be processed in parallel. 'train' combines the tfds 'train'
    # and 'restval' splits.
    train_subsplits = (tfds.even_splits('train', 100) +
                       tfds.even_splits('restval', 100))
    subsplits_by_split = {
        'train': train_subsplits,
        'dev': tfds.even_splits('val', 50),
        'test': tfds.even_splits('test', 50),
    }

    encoder_params = TfHubBertEncoder.Params().Set(
        preprocessor=FLAGS.bert_tfhub_preprocessor,
        model=FLAGS.bert_tfhub_model,
        max_seq_len=FLAGS.bert_max_length)

    def pipeline(root):
        # One labelled composite transform per requested split; the label is
        # the title-cased split name.
        for split in FLAGS.splits:
            output_prefix_for = os.path.join
            stages = (
                'Create' >> beam.Create(subsplits_by_split[split])
                | 'Read' >> beam.ParDo(ReadCocoFromTfdsFn())
                | 'Encode' >> beam.ParDo(EncodeCaptionsFn(encoder_params))
                | 'ToProto' >> beam.Map(dict_to_example_proto)
                | 'Shuffle' >> beam.Reshuffle()
                | 'Write' >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=output_prefix_for(FLAGS.output_dir,
                                                       split),
                    coder=beam.coders.ProtoCoder(tf.train.Example),
                    num_shards=FLAGS.num_output_shards))
            _ = root | split.title() >> stages

    options_module = beam.options.pipeline_options
    run_options = options_module.PipelineOptions()
    direct_view = run_options.view_as(options_module.DirectOptions)
    direct_view.direct_num_workers = FLAGS.num_workers
    with beam.Pipeline(options=run_options) as root:
        pipeline(root)
Esempio n. 4
0

"""### MAIN CELL: Experimenting a range of hyperparameters"""

# Smaller alternative grid (disabled):
# at = [0.2, 0.9]
# ll1 = [1e-3]
# ll2 = [1e-3, 1e-4]
# all = []

# Value grids iterated by the nested loops that follow this setup.
at = [0.0, 0.2, 0.4, 0.5, 0.7, 0.9]
ll1 = [1e-1, 1e-2, 1e-3]
ll2 = [1e-3, 1e-4, 1e-5]
all = []  # NOTE: shadows the builtin `all` for the rest of the module.

# One endless, batched MNIST iterator per environment, each backed by its own
# even sub-split of the 'train' split.
n_envs = 2
ds_train_envs = []
batch_size = 10000
splits = tfds.even_splits('train', n=n_envs)
for m in range(n_envs):
    ds = (
        tfds.load("mnist:3.*.*", split=splits[m])
        .cache()
        .repeat()
        .shuffle(10 * batch_size, seed=0)
        .batch(batch_size)
    )
    ds_train_envs.append(iter(tfds.as_numpy(ds)))

# Endless, batched iterator over the MNIST 'test' split.
test_ds = (
    tfds.load("mnist:3.*.*", split='test')
    .cache()
    .repeat()
    .shuffle(10 * 10000, seed=0)
    .batch(10000)
)
test_ds = iter(tfds.as_numpy(test_ds))

round = 0
for idx, thresh in enumerate(at):
  for l1 in ll1:
    for l2 in ll2:
    def _lazy_init(self) -> None:
        """Lazily initialize the dataset.

        This is necessary to init the Tensorflow dataset pipeline in the (dataloader) process that
        will be using the dataset instance. The __init__ method is called on the main process,
        this will be called in a dataloader worker process.

        The method works out this worker's global id, decides whether the data is divided with
        tfds sub-splits or with a `tf.distribute.InputContext`, builds the `tf.data` pipeline
        (threading options, optional repeat / shuffle, prefetch) and stores the result, wrapped
        with `tfds.as_numpy`, in `self.ds`.

        NOTE: There will be problems if you try to re-use this dataset across different loader/worker
        instances once it has been initialized. Do not call any dataset methods that can call _lazy_init
        before it is passed to dataloader.
        """
        # None when called outside a DataLoader worker process.
        worker_info = torch.utils.data.get_worker_info()

        # setup input context to split dataset across distributed processes
        num_workers = 1
        global_worker_id = 0
        if worker_info is not None:
            self.worker_info = worker_info
            self.worker_seed = worker_info.seed
            num_workers = worker_info.num_workers
            # Total worker count across all replicas, and this worker's position in it.
            self.global_num_workers = self.dist_num_replicas * num_workers
            global_worker_id = self.dist_rank * num_workers + worker_info.id
            """ Data sharding
            InputContext will assign subset of underlying TFRecord files to each 'pipeline' if used.
            My understanding is that using split, the underling TFRecord files will shuffle (shuffle_files=True)
            between the splits each iteration, but that understanding could be wrong.

            I am currently using a mix of InputContext shard assignment and fine-grained sub-splits for distributing
            the data across workers. For training InputContext is used to assign shards to nodes unless num_shards
            in dataset < total number of workers. Otherwise sub-split API is used for datasets without enough shards or
            for validation where we can't drop examples and need to avoid minimize uneven splits to avoid padding.
            """
            # Sub-split only when there is more than one worker overall AND either the split has
            # fewer shards than workers or we are not training.
            should_subsplit = self.global_num_workers > 1 and (
                self.split_info.num_shards < self.global_num_workers
                or not self.is_training)
            if should_subsplit:
                # split the dataset w/o using sharding for more even examples / worker, can result in less optimal
                # read patterns for distributed training (overlap across shards) so better to use InputContext there
                if has_buggy_even_splits:
                    # my even_split workaround doesn't work on subsplits, upgrade tfds!
                    # NOTE(review): when split_info IS a SubSplitInfo, self.subsplit is not assigned
                    # in this branch, so the InputContext path below is taken if it is still None.
                    if not isinstance(self.split_info,
                                      tfds.core.splits.SubSplitInfo):
                        subsplits = even_split_indices(self.split,
                                                       self.global_num_workers,
                                                       self.num_examples)
                        self.subsplit = subsplits[global_worker_id]
                else:
                    subsplits = tfds.even_splits(self.split,
                                                 self.global_num_workers)
                    self.subsplit = subsplits[global_worker_id]

        # NOTE(review): self.global_num_workers and self.subsplit are only assigned above when
        # worker_info is not None; presumably they get defaults elsewhere (e.g. __init__) -- confirm.
        # With no worker info, global_worker_id stays 0 regardless of self.dist_rank.
        input_context = None
        if self.global_num_workers > 1 and self.subsplit is None:
            # set input context to divide shards among distributed replicas
            input_context = tf.distribute.InputContext(
                num_input_pipelines=self.global_num_workers,
                input_pipeline_id=global_worker_id,
                num_replicas_in_sync=self.
                dist_num_replicas  # FIXME does this arg have any impact?
            )
        read_config = tfds.ReadConfig(shuffle_seed=self.common_seed,
                                      shuffle_reshuffle_each_iteration=True,
                                      input_context=input_context)
        # Read this worker's sub-split when one was chosen, otherwise the whole split; files are
        # shuffled only for training.
        ds = self.builder.as_dataset(split=self.subsplit or self.split,
                                     shuffle_files=self.is_training,
                                     read_config=read_config)
        # avoid overloading threading w/ combo of TF ds threads + PyTorch workers
        options = tf.data.Options()
        # The threading options live under `threading` or `experimental_threading` depending on
        # what this tf.data.Options object exposes.
        thread_member = 'threading' if hasattr(
            options, 'threading') else 'experimental_threading'
        # Divide the thread budget between the local workers, never going below one thread.
        getattr(options, thread_member).private_threadpool_size = max(
            1, self.max_threadpool_size // num_workers)
        getattr(options, thread_member).max_intra_op_parallelism = 1
        ds = ds.with_options(options)
        if self.is_training or self.repeats > 1:
            # to prevent excessive drop_last batch behaviour w/ IterableDatasets
            # see warnings at https://pytorch.org/docs/stable/data.html#multi-process-data-loading
            ds = ds.repeat()  # allow wrap around and break iteration manually
        if self.is_training:
            # Shuffle buffer: min(num_examples, shuffle_size) divided by the global worker count,
            # seeded with this worker's seed.
            ds = ds.shuffle(min(self.num_examples, self.shuffle_size) //
                            self.global_num_workers,
                            seed=self.worker_seed)
        # Prefetch at most prefetch_size elements, capped at the per-worker example count.
        ds = ds.prefetch(
            min(self.num_examples // self.global_num_workers,
                self.prefetch_size))
        self.ds = tfds.as_numpy(ds)
https://www.tensorflow.org/datasets/catalog/overview#image_classification

Hacked together by / Copyright 2020 Ross Wightman
"""
import math
import torch
import torch.distributed as dist
from PIL import Image

try:
    import tensorflow as tf
    tf.config.set_visible_devices(
        [], 'GPU')  # Hands off my GPU! (or pip install tensorflow-cpu)
    import tensorflow_datasets as tfds
    try:
        tfds.even_splits('', 1, drop_remainder=False
                         )  # non-buggy even_splits has drop_remainder arg
        has_buggy_even_splits = False
    except TypeError:
        print(
            "Warning: This version of tfds doesn't have the latest even_splits impl. "
            "Please update or use tfds-nightly for better fine-grained split behaviour."
        )
        has_buggy_even_splits = True
    # NOTE uncomment below if having file limit issues on dataset build (or alter your OS defaults)
    # import resource
    # low, high = resource.getrlimit(resource.RLIMIT_NOFILE)
    # resource.setrlimit(resource.RLIMIT_NOFILE, (high, high))
except ImportError as e:
    print(e)
    print(
        "Please install tensorflow_datasets package `pip install tensorflow-datasets`."
Esempio n. 7
0
# Plotting / voting settings.
make_plot, vote_threshold = True, 0.5

# Image geometry and number of label classes.
width, height = 32, 32
n_classes = 10

# Training settings. `optimizer` holds the Adam class itself, not an instance.
optimizer = tf.keras.optimizers.Adam
l_rate = 0.001
batch_size = 64
loss = "sparse_categorical_crossentropy"
vote_batches = 2

# Load CIFAR-10 'train' as `n_learners` even sub-splits; `with_info=True` also
# returns the dataset metadata as `info`.
train_datasets, info = tfds.load(
    'cifar10',
    split=tfds.even_splits('train', n=n_learners),
    as_supervised=True,
    with_info=True,
)
# Example count of the full (un-split) 'train' split.
n_datapoints = info.splits['train'].num_examples

# Same even partitioning for the 'test' split (no metadata requested).
test_datasets = tfds.load(
    'cifar10',
    split=tfds.even_splits('test', n=n_learners),
    as_supervised=True,
)

# Build each learner's training input pipeline, replacing the raw sub-split
# stored in `train_datasets[i]` with the finished pipeline.
for i in range(n_learners):
    # `normalize_img` is defined outside this excerpt.
    ds_train = train_datasets[i].map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Cache after the map step, before shuffling and batching.
    ds_train = ds_train.cache()
    # Shuffle buffer size is the train example count divided by the learner count.
    ds_train = ds_train.shuffle(n_datapoints // n_learners)
    ds_train = ds_train.batch(batch_size)
    train_datasets[i] = ds_train.prefetch(tf.data.experimental.AUTOTUNE)