def _get_process_split(split, process_index, process_count, drop_remainder):
    """Picks the sub-split that belongs to one process of a multi-process job.

    The split is divided into `process_count` even parts with
    `tfds.even_splits`, and the part at position `process_index` is returned.

    Args:
        split: Split specification forwarded to `tfds.even_splits`.
        process_index: Position of the calling process; used to index the
            list of sub-splits.
        process_count: Number of even sub-splits to create (`n`).
        drop_remainder: Forwarded unchanged to `tfds.even_splits`.

    Returns:
        The element of the `tfds.even_splits` result at `process_index`.
    """
    return tfds.even_splits(
        split,
        n=process_count,
        drop_remainder=drop_remainder,
    )[process_index]
def make_source_dataset(self, current_host_index, num_hosts):
    """Loads the (optionally host-sharded) TFDS dataset of example dicts.

    When the model is in TRAIN mode and `self.shard_per_host` is set, the
    configured split is divided evenly into `num_hosts` parts and only the
    part for `current_host_index` is loaded. Otherwise the full split is used.

    Args:
        current_host_index: current host index.
        num_hosts: total number of hosts.

    Returns:
        A `tf.data.Dataset` object where each dataset element is a dictionary.
        For image classification datasets, the dictionary will contain an
        'image' key with a decoded uint8 image (of shape
        [height, width, channels]) and a 'label' key with an int64 label.
        For video classification datasets, the dictionary will contain a
        'video' key with a decoded uint8 video (of shape
        [frames, height, width, channels]) and a 'label' key with an int64
        label.
    """
    split = self.split
    shard_this_host = (
        self.mode == enums.ModelMode.TRAIN and self.shard_per_host)
    if shard_this_host:
        per_host_splits = tfds.even_splits(split, n=num_hosts)
        split = per_host_splits[current_host_index]

    # File shuffling stays off here: shuffling before sharding risks dropping
    # samples, because each core would shard a differently shuffled copy of
    # the data.
    load_kwargs = dict(
        name=self.dataset_name,
        split=split,
        data_dir=self.data_dir,
        shuffle_files=False,
    )
    return tfds.load(**load_kwargs)
def main(_):
    """Exports BERT-encoded COCO captions to TFRecord files with Beam.

    Loads `coco_captions` through tfds, then for every split named in
    `FLAGS.splits` runs a Beam branch that reads the matching tfds sub-splits,
    encodes the captions, converts them to `tf.train.Example` protos,
    reshuffles, and writes sharded TFRecords under `FLAGS.output_dir`.

    Args:
        _: Unused positional argument.
    """
    log('Loading "coco_captions" from tfds. This will build a copy of '
        'the dataset under TFDS_DATA_DIR if one doesn\'t already exist.')
    tfds.load('coco_captions')

    # Milan split name => constituent tfds subsplits. Splitting each tfds
    # split into many subsplits lets the data be processed in parallel.
    train_subsplits = (
        tfds.even_splits('train', 100) + tfds.even_splits('restval', 100))
    split_to_coco_subsplits = {
        'train': train_subsplits,
        'dev': tfds.even_splits('val', 50),
        'test': tfds.even_splits('test', 50),
    }

    encoder_params = TfHubBertEncoder.Params().Set(
        preprocessor=FLAGS.bert_tfhub_preprocessor,
        model=FLAGS.bert_tfhub_model,
        max_seq_len=FLAGS.bert_max_length)

    def build_pipeline(root):
        """Attaches one labelled export branch per requested split."""
        for split in FLAGS.splits:
            writer = beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=os.path.join(FLAGS.output_dir, split),
                coder=beam.coders.ProtoCoder(tf.train.Example),
                num_shards=FLAGS.num_output_shards)
            # `>>` binds tighter than `|`, so each stage is labelled first and
            # the labelled stages are then chained into one composite.
            export_split = (
                'Create' >> beam.Create(split_to_coco_subsplits[split])
                | 'Read' >> beam.ParDo(ReadCocoFromTfdsFn())
                | 'Encode' >> beam.ParDo(EncodeCaptionsFn(encoder_params))
                | 'ToProto' >> beam.Map(dict_to_example_proto)
                | 'Shuffle' >> beam.Reshuffle()
                | 'Write' >> writer)
            _ = root | split.title() >> export_split

    options_module = beam.options.pipeline_options
    pipeline_options = options_module.PipelineOptions()
    direct_options = pipeline_options.view_as(options_module.DirectOptions)
    direct_options.direct_num_workers = FLAGS.num_workers

    with beam.Pipeline(options=pipeline_options) as root:
        build_pipeline(root)
"""### MAIN CELL: Experimenting a range of hyperparameters""" # at = [0.2, 0.9] # ll1 = [1e-3] # ll2 = [1e-3, 1e-4] # all = [] at = [0.0, 0.2, 0.4, 0.5, 0.7, 0.9] ll1 = [1e-1, 1e-2, 1e-3] ll2 = [1e-3, 1e-4, 1e-5] all = [] n_envs = 2 ds_train_envs = [] batch_size = 10000 splits = tfds.even_splits('train', n=n_envs) for m in range(n_envs): ds = tfds.load("mnist:3.*.*", split=splits[m]).cache().repeat() ds = ds.shuffle(10 * batch_size, seed=0) ds = ds.batch(batch_size) ds_train_envs.append(iter(tfds.as_numpy(ds))) test_ds = tfds.load("mnist:3.*.*", split='test').cache().repeat() test_ds = test_ds.shuffle(10 * 10000, seed=0) test_ds = test_ds.batch(10000) test_ds= iter(tfds.as_numpy(test_ds)) round = 0 for idx, thresh in enumerate(at): for l1 in ll1: for l2 in ll2:
def _lazy_init(self):
    """ Lazily initialize the dataset.

    This is necessary to init the Tensorflow dataset pipeline in the (dataloader) process that
    will be using the dataset instance. The __init__ method is called on the main process,
    this will be called in a dataloader worker process.

    NOTE: There will be problems if you try to re-use this dataset across different loader/worker
    instances once it has been initialized. Do not call any dataset methods that can call _lazy_init
    before it is passed to dataloader.

    Side effects: may set `self.worker_info`, `self.worker_seed`, `self.global_num_workers` and
    `self.subsplit`; always sets `self.ds` to the numpy-iterable tf.data pipeline.

    NOTE(review): `self.global_num_workers`, `self.subsplit` and `self.worker_seed` are also read
    when no worker info is available, so they are presumably given defaults in __init__ -- confirm.
    """
    # None when called outside a DataLoader worker process.
    worker_info = torch.utils.data.get_worker_info()

    # setup input context to split dataset across distributed processes
    num_workers = 1
    global_worker_id = 0
    if worker_info is not None:
        self.worker_info = worker_info
        self.worker_seed = worker_info.seed
        num_workers = worker_info.num_workers
        # Total pipelines = distributed replicas x dataloader workers per replica.
        self.global_num_workers = self.dist_num_replicas * num_workers
        # Unique id of this worker across all replicas (rank-major ordering).
        global_worker_id = self.dist_rank * num_workers + worker_info.id

        """ Data sharding
        InputContext will assign subset of underlying TFRecord files to each 'pipeline' if used.
        My understanding is that using split, the underling TFRecord files will shuffle (shuffle_files=True)
        between the splits each iteration, but that understanding could be wrong.

        I am currently using a mix of InputContext shard assignment and fine-grained sub-splits for distributing
        the data across workers. For training InputContext is used to assign shards to nodes unless num_shards
        in dataset < total number of workers. Otherwise sub-split API is used for datasets without enough shards or
        for validation where we can't drop examples and need to avoid minimize uneven splits to avoid padding.
        """
        # Sub-split when there are several workers AND either the dataset has
        # fewer shards than workers or we are not training.
        should_subsplit = self.global_num_workers > 1 and (
            self.split_info.num_shards < self.global_num_workers or not self.is_training)
        if should_subsplit:
            # split the dataset w/o using sharding for more even examples / worker, can result in less optimal
            # read patterns for distributed training (overlap across shards) so better to use InputContext there
            if has_buggy_even_splits:
                # my even_split workaround doesn't work on subsplits, upgrade tfds!
                # When split_info IS a SubSplitInfo nothing is assigned here, so
                # self.subsplit keeps whatever value it had before this call.
                if not isinstance(self.split_info, tfds.core.splits.SubSplitInfo):
                    subsplits = even_split_indices(self.split, self.global_num_workers, self.num_examples)
                    self.subsplit = subsplits[global_worker_id]
            else:
                subsplits = tfds.even_splits(self.split, self.global_num_workers)
                self.subsplit = subsplits[global_worker_id]

    # Shard-level distribution is only used when no sub-split was selected above.
    input_context = None
    if self.global_num_workers > 1 and self.subsplit is None:
        # set input context to divide shards among distributed replicas
        input_context = tf.distribute.InputContext(
            num_input_pipelines=self.global_num_workers,
            input_pipeline_id=global_worker_id,
            num_replicas_in_sync=self.dist_num_replicas  # FIXME does this arg have any impact?
        )
    read_config = tfds.ReadConfig(
        shuffle_seed=self.common_seed,
        shuffle_reshuffle_each_iteration=True,
        input_context=input_context)
    # Read the per-worker sub-split if one was chosen, otherwise the full split;
    # file-level shuffling is enabled only for training.
    ds = self.builder.as_dataset(
        split=self.subsplit or self.split, shuffle_files=self.is_training, read_config=read_config)
    # avoid overloading threading w/ combo of TF ds threads + PyTorch workers
    options = tf.data.Options()
    # Use whichever threading-options attribute this TF version exposes.
    thread_member = 'threading' if hasattr(options, 'threading') else 'experimental_threading'
    # Divide the thread budget between the workers of this process, minimum 1.
    getattr(options, thread_member).private_threadpool_size = max(1, self.max_threadpool_size // num_workers)
    getattr(options, thread_member).max_intra_op_parallelism = 1
    ds = ds.with_options(options)
    if self.is_training or self.repeats > 1:
        # to prevent excessive drop_last batch behaviour w/ IterableDatasets
        # see warnings at https://pytorch.org/docs/stable/data.html#multi-process-data-loading
        ds = ds.repeat()  # allow wrap around and break iteration manually
    if self.is_training:
        # Shuffle buffer is capped at the dataset size, then divided by the
        # global worker count; seeded with the per-worker seed.
        ds = ds.shuffle(min(self.num_examples, self.shuffle_size) // self.global_num_workers, seed=self.worker_seed)
    # Prefetch at most this worker's share of the examples.
    ds = ds.prefetch(min(self.num_examples // self.global_num_workers, self.prefetch_size))
    # Expose the pipeline as an iterable of numpy structures.
    self.ds = tfds.as_numpy(ds)
https://www.tensorflow.org/datasets/catalog/overview#image_classification Hacked together by / Copyright 2020 Ross Wightman """ import math import torch import torch.distributed as dist from PIL import Image try: import tensorflow as tf tf.config.set_visible_devices( [], 'GPU') # Hands off my GPU! (or pip install tensorflow-cpu) import tensorflow_datasets as tfds try: tfds.even_splits('', 1, drop_remainder=False ) # non-buggy even_splits has drop_remainder arg has_buggy_even_splits = False except TypeError: print( "Warning: This version of tfds doesn't have the latest even_splits impl. " "Please update or use tfds-nightly for better fine-grained split behaviour." ) has_buggy_even_splits = True # NOTE uncomment below if having file limit issues on dataset build (or alter your OS defaults) # import resource # low, high = resource.getrlimit(resource.RLIMIT_NOFILE) # resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) except ImportError as e: print(e) print( "Please install tensorflow_datasets package `pip install tensorflow-datasets`."
# Experiment switches and voting parameters.
make_plot = True
vote_threshold = 0.5
vote_batches = 2

# Input size and label count used with CIFAR-10 (32x32 images, 10 classes).
width, height = 32, 32
n_classes = 10

# Training hyperparameters.
optimizer = tf.keras.optimizers.Adam
l_rate = 0.001
batch_size = 64
loss = "sparse_categorical_crossentropy"

# One even share of the 'train' split per learner; `info` supplies the total
# number of training examples.
train_splits = tfds.even_splits('train', n=n_learners)
train_datasets, info = tfds.load(
    'cifar10', split=train_splits, as_supervised=True, with_info=True)
n_datapoints = info.splits['train'].num_examples

# Matching even shares of the 'test' split.
test_splits = tfds.even_splits('test', n=n_learners)
test_datasets = tfds.load('cifar10', split=test_splits, as_supervised=True)

# Replace each learner's raw training dataset with its input pipeline:
# normalize -> cache -> shuffle -> batch -> prefetch.
for i in range(n_learners):
    ds_train = (
        train_datasets[i]
        .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .cache()
        .shuffle(n_datapoints // n_learners)
        .batch(batch_size)
    )
    train_datasets[i] = ds_train.prefetch(tf.data.experimental.AUTOTUNE)