Code Example #1
import functools

import tensorflow as tf

# Assumed import path for these snippets: `reader` provides BatchReader.
# `process_batch` is assumed to be defined elsewhere in the same module
# as these pipeline builders.
from meta_dataset.data import reader


def make_one_source_batch_pipeline(dataset_spec,
                                   split,
                                   batch_size,
                                   pool=None,
                                   shuffle_buffer_size=None,
                                   read_buffer_size_bytes=None,
                                   image_size=None):
    """Returns a pipeline emitting data from one single source as Batches.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int representing the max number of examples in each batch.
    pool: String (optional), for example-split datasets, which example split to
      use ('valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, number of examples in the buffer used for
      shuffling the examples from different classes, while they are mixed
      together. There is only one shuffling operation, not one per class.
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
    split.
  """
    batch_reader = reader.BatchReader(dataset_spec, split, shuffle_buffer_size,
                                      read_buffer_size_bytes)
    dataset = batch_reader.create_dataset_input_pipeline(batch_size=batch_size,
                                                         pool=pool)
    map_fn = functools.partial(process_batch, image_size=image_size)
    dataset = dataset.map(map_fn)

    # Overlap batch processing and training.
    dataset = dataset.prefetch(1)
    return dataset
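
A minimal usage sketch for this first version. The names below are illustrative, not part of the snippet above: dataset_spec would be a DatasetSpecification loaded elsewhere, and learning_spec is assumed to be importable from the surrounding project; iterating the pipeline directly assumes eager execution.

# Hypothetical usage (illustrative names, loaded/imported elsewhere).
dataset = make_one_source_batch_pipeline(
    dataset_spec,                     # a DatasetSpecification, loaded elsewhere
    split=learning_spec.Split.TRAIN,  # assumes learning_spec is imported
    batch_size=256,
    shuffle_buffer_size=1000,
    image_size=84)

# Each element is one decoded batch of up to batch_size examples.
for batch in dataset.take(2):
    pass
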
Code Example #2
def make_multisource_batch_pipeline(dataset_spec_list,
                                    split,
                                    batch_size,
                                    add_dataset_offset,
                                    pool=None,
                                    shuffle_buffer_size=None,
                                    read_buffer_size_bytes=None,
                                    num_prefetch=0,
                                    image_size=None):
    """Returns a pipeline emitting data from multiple source as Batches.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one for each source.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int representing the max number of examples in each batch.
    add_dataset_offset: A Boolean, whether to add an offset to each dataset's
      targets, so that each target is unique across all datasets.
    pool: String (optional), for example-split datasets, which example split to
      use ('valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, number of examples in the buffer used for
      shuffling the examples from different classes, while they are mixed
      together. There is only one shuffling operation, not one per class.
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of
      each dataset. Prefetching occurs just after the class-specific Dataset
      object is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
    split.
  """
    sources = []
    offset = 0
    for dataset_spec in dataset_spec_list:
        batch_reader = reader.BatchReader(dataset_spec, split,
                                          shuffle_buffer_size,
                                          read_buffer_size_bytes, num_prefetch)
        dataset = batch_reader.create_dataset_input_pipeline(
            batch_size=batch_size, pool=pool, offset=offset)
        sources.append(dataset)
        if add_dataset_offset:
            offset += len(dataset_spec.get_classes(split))

    # Sample uniformly among sources.
    dataset = tf.data.experimental.sample_from_datasets(sources)

    map_fn = functools.partial(process_batch, image_size=image_size)
    dataset = dataset.map(map_fn)

    # Overlap batch processing and training.
    dataset = dataset.prefetch(1)
    return dataset
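
A sketch of calling the multi-source version with two hypothetical specs (dataset_spec_a and dataset_spec_b are illustrative names). With add_dataset_offset=True, the second source's targets are shifted by the first source's class count, so labels stay unique across all sources.

# Hypothetical usage: mix batches from two sources with non-overlapping labels.
dataset = make_multisource_batch_pipeline(
    [dataset_spec_a, dataset_spec_b],  # assumed DatasetSpecification objects
    split=learning_spec.Split.TRAIN,
    batch_size=256,
    add_dataset_offset=True,  # spec_b's targets start after spec_a's classes
    shuffle_buffer_size=1000,
    image_size=84)
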
Code Example #3
def make_one_source_batch_pipeline(dataset_spec,
                                   split,
                                   batch_size,
                                   pool=None,
                                   shuffle_buffer_size=None,
                                   read_buffer_size_bytes=None,
                                   num_prefetch=0,
                                   image_size=None,
                                   num_to_take=None):
    """Returns a pipeline emitting data from one single source as Batches.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int representing the max number of examples in each batch.
    pool: String (optional), for example-split datasets, which example split to
      use ('valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, number of examples in the buffer used for
      shuffling the examples from different classes, while they are mixed
      together. There is only one shuffling operation, not one per class.
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of each
      dataset. Prefetching occurs just after the class-specific Dataset object
      is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional, an int specifying a number of elements to pick from
      each class' tfrecord. If specified, the available images of each class
      will be restricted to that int. By default no restriction is applied and
      all data is used.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
    split.
  """
    if num_to_take is None:
        num_to_take = -1
    batch_reader = reader.BatchReader(dataset_spec, split, shuffle_buffer_size,
                                      read_buffer_size_bytes, num_prefetch,
                                      num_to_take)
    dataset = batch_reader.create_dataset_input_pipeline(batch_size=batch_size,
                                                         pool=pool)
    map_fn = functools.partial(process_batch, image_size=image_size)
    dataset = dataset.map(map_fn)

    # There is only one data source, so we know that all batches belong to it,
    # but for interface consistency, zip with a dataset identifying the source.
    source_id_dataset = tf.data.Dataset.from_tensors(0).repeat()
    dataset = tf.data.Dataset.zip((dataset, source_id_dataset))

    # Overlap batch processing and training.
    dataset = dataset.prefetch(1)
    return dataset
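
Unlike the first version, this one zips a constant source id with every batch, so each element is a (batch, source_id) pair. A sketch under the same assumptions as before, restricting each class to 100 examples:

# Hypothetical usage: cap each class at 100 examples via num_to_take.
dataset = make_one_source_batch_pipeline(
    dataset_spec,
    split=learning_spec.Split.TRAIN,
    batch_size=256,
    image_size=84,
    num_to_take=100)

for batch, source_id in dataset.take(1):
    pass  # source_id is always 0 here; it exists for interface consistency
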
Code Example #4
def make_multisource_batch_pipeline(dataset_spec_list,
                                    split,
                                    batch_size,
                                    add_dataset_offset,
                                    pool=None,
                                    shuffle_buffer_size=None,
                                    read_buffer_size_bytes=None,
                                    num_prefetch=0,
                                    image_size=None,
                                    num_to_take=None):
    """Returns a pipeline emitting data from multiple source as Batches.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one for each source.
    split: A learning_spec.Split object identifying the source split.
    batch_size: An int representing the max number of examples in each batch.
    add_dataset_offset: A Boolean, whether to add an offset to each dataset's
      targets, so that each target is unique across all datasets.
    pool: String (optional), for example-split datasets, which example split to
      use ('valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, number of examples in the buffer used for
      shuffling the examples from different classes, while they are mixed
      together. There is only one shuffling operation, not one per class.
    read_buffer_size_bytes: int or None, buffer size for each TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of each
      dataset. Prefetching occurs just after the class-specific Dataset object
      is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional, a list specifying for each dataset the number of
      examples per class to restrict to (for this given split). If provided, its
      length must be the same as len(dataset_spec). If None, no restrictions are
      applied to any dataset and all data per class is used.

  Returns:
    A Dataset instance that outputs decoded batches from all classes in the
    split.
  """
    if num_to_take is not None and len(num_to_take) != len(dataset_spec_list):
        raise ValueError('num_to_take does not have the same length as '
                         'dataset_spec_list.')
    if num_to_take is None:
        num_to_take = [-1] * len(dataset_spec_list)
    sources = []
    offset = 0
    for source_id, (dataset_spec, num_to_take_for_dataset) in enumerate(
            zip(dataset_spec_list, num_to_take)):
        batch_reader = reader.BatchReader(dataset_spec, split,
                                          shuffle_buffer_size,
                                          read_buffer_size_bytes, num_prefetch,
                                          num_to_take_for_dataset)
        dataset = batch_reader.create_dataset_input_pipeline(
            batch_size=batch_size, pool=pool, offset=offset)
        # Create a dataset to zip with the above for identifying the source.
        source_id_dataset = tf.data.Dataset.from_tensors(source_id).repeat()
        sources.append(tf.data.Dataset.zip((dataset, source_id_dataset)))
        if add_dataset_offset:
            offset += len(dataset_spec.get_classes(split))

    # Sample uniformly among sources.
    dataset = tf.data.experimental.sample_from_datasets(sources)

    def map_fn(batch, source_id):
        return process_batch(*batch, image_size=image_size), source_id

    dataset = dataset.map(map_fn)

    # Overlap batch processing and training.
    dataset = dataset.prefetch(1)
    return dataset
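
In this multi-source variant, num_to_take is a per-source list and every element carries the index of the source that produced it. A sketch with the same illustrative spec names; -1 follows the function's own "no restriction" convention:

# Hypothetical usage: restrict the first source to 100 examples per class,
# leave the second unrestricted.
dataset = make_multisource_batch_pipeline(
    [dataset_spec_a, dataset_spec_b],
    split=learning_spec.Split.TRAIN,
    batch_size=256,
    add_dataset_offset=True,
    image_size=84,
    num_to_take=[100, -1])

for batch, source_id in dataset.take(1):
    pass  # source_id tells which DatasetSpecification produced this batch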