def make_one_source_batch_pipeline(dataset_spec,
                                   split,
                                   batch_size,
                                   pool=None,
                                   shuffle_buffer_size=None,
                                   read_buffer_size_bytes=None,
                                   image_size=None):
  """Builds a pipeline that yields decoded Batches from a single source.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int, the maximum number of examples per batch.
    pool: String (optional), for example-split datasets, which example split
      to use ('valid', or 'test'); used at meta-test time only.
    shuffle_buffer_size: int or None, size of the single buffer used to
      shuffle examples from different classes after they are mixed together
      (there is no per-class shuffling).
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
      split.
  """
  source_reader = reader.BatchReader(dataset_spec, split, shuffle_buffer_size,
                                     read_buffer_size_bytes)
  pipeline = source_reader.create_dataset_input_pipeline(
      batch_size=batch_size, pool=pool)
  # Decode and resize the raw examples.
  pipeline = pipeline.map(
      functools.partial(process_batch, image_size=image_size))
  # Prefetch a single batch so processing overlaps with training.
  return pipeline.prefetch(1)
def make_multisource_batch_pipeline(dataset_spec_list,
                                    split,
                                    batch_size,
                                    add_dataset_offset,
                                    pool=None,
                                    shuffle_buffer_size=None,
                                    read_buffer_size_bytes=None,
                                    num_prefetch=0,
                                    image_size=None):
  """Builds a pipeline that yields decoded Batches mixed from several sources.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one per source.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int, the maximum number of examples per batch.
    add_dataset_offset: A Boolean; when True, each dataset's targets are
      shifted so that every target is unique across all datasets.
    pool: String (optional), for example-split datasets, which example split
      to use ('valid', or 'test'); used at meta-test time only.
    shuffle_buffer_size: int or None, size of the single buffer used to
      shuffle examples from different classes after they are mixed together
      (there is no per-class shuffling).
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, number of examples to prefetch for each class of each
      dataset, applied right after the class-specific Dataset is built. No
      prefetching occurs if < 1.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
      split.
  """
  per_source_datasets = []
  label_offset = 0
  for dataset_spec in dataset_spec_list:
    source_reader = reader.BatchReader(dataset_spec, split,
                                       shuffle_buffer_size,
                                       read_buffer_size_bytes, num_prefetch)
    per_source_datasets.append(
        source_reader.create_dataset_input_pipeline(
            batch_size=batch_size, pool=pool, offset=label_offset))
    if add_dataset_offset:
      # Shift later datasets' targets past this dataset's class range.
      label_offset += len(dataset_spec.get_classes(split))
  # Draw each batch uniformly at random among the sources.
  mixed = tf.data.experimental.sample_from_datasets(per_source_datasets)
  mixed = mixed.map(functools.partial(process_batch, image_size=image_size))
  # Prefetch a single batch so processing overlaps with training.
  return mixed.prefetch(1)
def make_one_source_batch_pipeline(dataset_spec,
                                   split,
                                   batch_size,
                                   pool=None,
                                   shuffle_buffer_size=None,
                                   read_buffer_size_bytes=None,
                                   num_prefetch=0,
                                   image_size=None,
                                   num_to_take=None):
  """Builds a pipeline that yields decoded Batches from a single source.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int, the maximum number of examples per batch.
    pool: String (optional), for example-split datasets, which example split
      to use ('valid', or 'test'); used at meta-test time only.
    shuffle_buffer_size: int or None, size of the single buffer used to
      shuffle examples from different classes after they are mixed together
      (there is no per-class shuffling).
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, number of examples to prefetch for each class of each
      dataset, applied right after the class-specific Dataset is built. No
      prefetching occurs if < 1.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional int; if given, at most this many elements are read
      from each class' tfrecord. By default (None) all data is used.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
      split.
  """
  # -1 signals the reader to use every available example per class.
  take_per_class = -1 if num_to_take is None else num_to_take
  source_reader = reader.BatchReader(dataset_spec, split, shuffle_buffer_size,
                                     read_buffer_size_bytes, num_prefetch,
                                     take_per_class)
  pipeline = source_reader.create_dataset_input_pipeline(
      batch_size=batch_size, pool=pool)
  pipeline = pipeline.map(
      functools.partial(process_batch, image_size=image_size))
  # With one source every batch trivially belongs to source 0, but zip in a
  # constant source id anyway so the output matches the multi-source
  # pipeline's (batch, source_id) interface.
  constant_source_id = tf.data.Dataset.from_tensors(0).repeat()
  pipeline = tf.data.Dataset.zip((pipeline, constant_source_id))
  # Prefetch a single batch so processing overlaps with training.
  return pipeline.prefetch(1)
def make_multisource_batch_pipeline(dataset_spec_list,
                                    split,
                                    batch_size,
                                    add_dataset_offset,
                                    pool=None,
                                    shuffle_buffer_size=None,
                                    read_buffer_size_bytes=None,
                                    num_prefetch=0,
                                    image_size=None,
                                    num_to_take=None):
  """Builds a pipeline that yields decoded Batches mixed from several sources.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one per source.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int, the maximum number of examples per batch.
    add_dataset_offset: A Boolean; when True, each dataset's targets are
      shifted so that every target is unique across all datasets.
    pool: String (optional), for example-split datasets, which example split
      to use ('valid', or 'test'); used at meta-test time only.
    shuffle_buffer_size: int or None, size of the single buffer used to
      shuffle examples from different classes after they are mixed together
      (there is no per-class shuffling).
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, number of examples to prefetch for each class of each
      dataset, applied right after the class-specific Dataset is built. No
      prefetching occurs if < 1.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional list giving, per dataset, the number of examples
      per class to keep for this split. Its length must equal
      len(dataset_spec_list) when provided. If None, no restriction is
      applied and all data per class is used.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
      split.

  Raises:
    ValueError: if num_to_take is provided with a length different from
      len(dataset_spec_list).
  """
  if num_to_take is None:
    # -1 signals the reader to use every available example per class.
    num_to_take = [-1] * len(dataset_spec_list)
  elif len(num_to_take) != len(dataset_spec_list):
    raise ValueError('num_to_take does not have the same length as '
                     'dataset_spec_list.')
  per_source_datasets = []
  label_offset = 0
  for source_id, (dataset_spec, take_for_source) in enumerate(
      zip(dataset_spec_list, num_to_take)):
    source_reader = reader.BatchReader(dataset_spec, split,
                                       shuffle_buffer_size,
                                       read_buffer_size_bytes, num_prefetch,
                                       take_for_source)
    batches = source_reader.create_dataset_input_pipeline(
        batch_size=batch_size, pool=pool, offset=label_offset)
    # Pair each batch with the id of the source it came from.
    source_ids = tf.data.Dataset.from_tensors(source_id).repeat()
    per_source_datasets.append(tf.data.Dataset.zip((batches, source_ids)))
    if add_dataset_offset:
      # Shift later datasets' targets past this dataset's class range.
      label_offset += len(dataset_spec.get_classes(split))
  # Draw each batch uniformly at random among the sources.
  mixed = tf.data.experimental.sample_from_datasets(per_source_datasets)

  def decode_fn(batch, source_id):
    # Decode the batch while carrying the source id through unchanged.
    return process_batch(*batch, image_size=image_size), source_id

  mixed = mixed.map(decode_fn)
  # Prefetch a single batch so processing overlaps with training.
  return mixed.prefetch(1)