Example #1
    def __call__(self, params):
        bs = params["batch_size"]

        dataset = get_split("train" if self._is_training else "validation",
                            dataset_dir=self._data_dir)

        if self._is_training:
            dataset = dataset.shuffle(buffer_size=1024)
            dataset = dataset.repeat()

        def _load_records(filename):
            return tf.data.TFRecordDataset(filename,
                                           buffer_size=32 * 1000 * 1000)

        dataset = dataset.apply(
            contrib_data.parallel_interleave(_load_records,
                                             sloppy=True,
                                             cycle_length=64))

        dataset = dataset.prefetch(bs * 4)
        dataset = dataset.map(self._parse_record, num_parallel_calls=32)
        dataset = dataset.batch(bs, drop_remainder=True)
        dataset = dataset.prefetch(4)

        features, labels = dataset.make_one_shot_iterator().get_next()
        labels = tf.cast(labels, tf.int32)
        features.set_shape([bs, 224, 224, 3])
        labels.set_shape([bs])
        return features, labels
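The contrib_data.parallel_interleave transformation used above has since been deprecated in favor of Dataset.interleave with num_parallel_calls. A minimal sketch (not the author's code) of the equivalent file-reading step, assuming TF 2.2+ (where interleave accepts deterministic) and a hypothetical file pattern:

# Sketch: non-contrib equivalent of the parallel_interleave step above.
import tensorflow as tf

def _load_records(filename):
    return tf.data.TFRecordDataset(filename, buffer_size=32 * 1000 * 1000)

filenames = tf.data.Dataset.list_files("/path/to/train-*", shuffle=True)  # hypothetical pattern
records = filenames.interleave(
    _load_records,
    cycle_length=64,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False)  # deterministic=False plays the role of sloppy=True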
Example #2
def parallel_dataset(dataset_fn, num_shards, seed):
    """ Builds the dataset to pull in parallel by using parallel interleaved datasets.

    Parameters
    ----------
    dataset_fn: a function which creates a dataset from the given seed.
    num_shards: the number of seeded dataset shards to build and interleave.
    seed: the base seed; shard i uses seed + i.

    Returns
    -------
    dataset: a dataset pulling in parallel from the given number of shards.
    """
    # from tensorflow.data.experimental import parallel_interleave
    from tensorflow.contrib.data import parallel_interleave

    if num_shards is None or num_shards == 1:
        return dataset_fn(seed)

    print('parallel dataset seems ok...')
    print(num_shards)
    seed_offset_dataset = tf.data.Dataset.range(num_shards)
    seed_offset_dataset = seed_offset_dataset.repeat()
    seed_offset_dataset = seed_offset_dataset.shuffle(num_shards)
    return seed_offset_dataset.apply(
        parallel_interleave(lambda input: dataset_fn(seed + input),
                            cycle_length=num_shards))
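A hypothetical usage sketch for parallel_dataset, assuming a TF 1.x installation where tensorflow.contrib.data is still available; toy_dataset_fn is a made-up stand-in for the real dataset factory:

# Hypothetical usage of parallel_dataset (TF 1.x, graph mode).
import tensorflow as tf

def toy_dataset_fn(seed):
    # Any callable mapping a seed to a tf.data.Dataset fits the contract;
    # tf.data accepts a scalar int64 tensor as the shuffle seed.
    return tf.data.Dataset.range(100).shuffle(buffer_size=100, seed=seed)

sharded = parallel_dataset(toy_dataset_fn, num_shards=4, seed=42)
next_element = sharded.make_one_shot_iterator().get_next()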
Example #3
 def _get_dataset_from_path(self):
   dataset = tf.data.Dataset.list_files(self._train_data_path)
   dataset = dataset.apply(contrib_data.shuffle_and_repeat(buffer_size=1000))
   dataset = dataset.apply(
       contrib_data.parallel_interleave(
           tf.data.TFRecordDataset, cycle_length=20, sloppy=True))
   return dataset
Example #4
  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    d = tf.data.Dataset.list_files(input_file, shuffle=False)
    d = d.apply(
        contrib_data.parallel_interleave(
            functools.partial(
                tf.data.TFRecordDataset,
                compression_type=FLAGS.compression_type),
            cycle_length=32,
            sloppy=is_training))
    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.apply(
        contrib_data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder))

    return d
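_decode_record is not shown in this snippet. A sketch of a typical helper for this kind of input_fn (an assumption; the project's actual helper may differ) parses the serialized tf.Example and casts int64 features to int32, which TPUs prefer:

# Sketch of a _decode_record helper (assumed, not taken from this project).
def _decode_record(record, name_to_features):
    """Decodes a serialized tf.Example into a dict of tensors."""
    example = tf.parse_single_example(record, name_to_features)
    # tf.Example only stores int64, so cast integer features down to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.cast(t, tf.int32)
        example[name] = t
    return example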
Example #5
  def make_source_dataset(self, index, num_hosts):
    """See base class."""
    if not self.data_dir:
      tf.logging.info('Undefined data_dir implies null input')
      return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

    # Shuffle the filenames to ensure better randomization.
    file_pattern = os.path.join(
        self.data_dir, 'train-*' if self.is_training else 'validation-*')

    # For multi-host training, we want each host to always process the same
    # subset of files.  Each host only sees a subset of the entire dataset,
    # allowing us to cache larger datasets in memory.
    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    dataset = dataset.shard(num_hosts, index)

    if self.is_training and not self.cache:
      dataset = dataset.repeat()

    def fetch_dataset(filename):
      buffer_size = 8 * 1024 * 1024  # 8 MiB per file
      dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
      return dataset

    # Read the data from disk in parallel
    dataset = dataset.apply(
        contrib_data.parallel_interleave(
            fetch_dataset, cycle_length=64, sloppy=True))

    if self.cache:
      dataset = dataset.cache().apply(
          contrib_data.shuffle_and_repeat(1024 * 16))
    else:
      dataset = dataset.shuffle(1024)
    return dataset
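contrib_data.shuffle_and_repeat is just a fused shuffle followed by an unbounded repeat; its deprecation notice recommends writing the two calls separately and letting tf.data's static optimizations fuse them. A contrib-free sketch of the cache branch above, assuming only the core tf.data API:

# Sketch: contrib-free equivalent of cache() + shuffle_and_repeat(1024 * 16).
import tensorflow as tf

def cache_shuffle_repeat(dataset, use_cache, shuffle_buffer=1024 * 16):
    if use_cache:
        return dataset.cache().shuffle(shuffle_buffer).repeat()
    return dataset.shuffle(1024)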
Example #6
    def input_fn():
        """Supplies input to our model.

    This function supplies input to our model, where this input is a
    function of the mode. For example, we supply different data if
    we're performing training versus evaluation.

    Returns:
      A tuple consisting of 1) a dictionary of tensors whose keys are
      the feature names, and 2) a tensor of target labels if the mode
      is not INFER (and None, otherwise).
    """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        num_epochs = None if is_training else 1

        with tf.name_scope('read_batch'):
            file_names = input_files
            files = tf.data.Dataset.list_files(file_names)
            if shuffle:
                files = files.shuffle(buffer_size=len(file_names))
            dataset = (files.apply(
                contrib_data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    cycle_length=10)).repeat(num_epochs))
            if shuffle:
                dataset = dataset.shuffle(buffer_size=100)
            parse_fn = _make_parsing_fn(mode, label_name, include_age,
                                        categorical_context_features,
                                        sequence_features,
                                        time_crossed_features)
            feature_engineering_fn = _make_feature_engineering_fn(
                dedup, time_windows, include_age, sequence_features,
                time_crossed_features)
            if tf.__version__ < '1.12.0':
                dataset = dataset.map(parse_fn, num_parallel_calls=8)
                feature_map = (dataset.prefetch(buffer_size=batch_size).
                               make_one_shot_iterator().get_next())
                # Batch with padding.
                feature_map = tf.train.batch(feature_map,
                                             batch_size,
                                             num_threads=8,
                                             capacity=2,
                                             enqueue_many=False,
                                             dynamic_pad=True)
                feature_map = feature_engineering_fn(feature_map)
            else:
                feature_map = (
                    dataset.batch(batch_size)
                    # Parallelize the input processing and put it behind a
                    # queue to increase performance by removing it from the
                    # critical path of per-step-computation.
                    .map(parse_fn, num_parallel_calls=8).map(
                        feature_engineering_fn, num_parallel_calls=8).prefetch(
                            buffer_size=1).make_one_shot_iterator().get_next())
            label = None
            if mode != tf.estimator.ModeKeys.PREDICT:
                label = feature_map.pop(CONTEXT_KEY_PREFIX + label_name)
            return feature_map, label
Example #7
    def make_source_dataset(self, index=0, num_hosts=1):
        """See base class."""

        if not self.data_dir:
            tf.logging.info("Undefined data_dir implies null input")
            return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

        get_filenames = get_filenames_func()
        filenames, _ = get_filenames(self.dataset_split)
        dataset = tf.data.Dataset.from_tensor_slices(filenames)

        if self.is_training and not self.cache:
            if filenames is not None:
                dataset = dataset.shuffle(len(filenames))
            dataset = dataset.repeat()

        def fetch_dataset(filename):
            buffer_size = 8 * 1024 * 1024  # 8 MB per file
            dataset = tf.data.TFRecordDataset(filename,
                                              buffer_size=buffer_size)
            return dataset

        cycle_length = 64
        shuffle_size = 1024

        # Read the data from disk in parallel
        if self.is_training:
            dataset = dataset.apply(
                contrib_data.parallel_interleave(fetch_dataset,
                                                 cycle_length=cycle_length,
                                                 sloppy=True))
        else:
            dataset = dataset.apply(
                contrib_data.parallel_interleave(fetch_dataset,
                                                 cycle_length=1,
                                                 sloppy=False))

        if self.cache:
            dataset = dataset.cache().apply(
                contrib_data.shuffle_and_repeat(shuffle_size))
        else:
            if self.is_training:
                dataset = dataset.shuffle(shuffle_size)
        return dataset
Example #8
  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    name_to_features = {
        "input_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels":
            tf.FixedLenFeature([1], tf.int64),
    }

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
      d = d.repeat()
      d = d.shuffle(buffer_size=len(input_files))

      # `cycle_length` is the number of parallel files that get read.
      cycle_length = min(num_cpu_threads, len(input_files))

      # `sloppy` mode means that the interleaving is not exact. This adds
      # even more randomness to the training pipeline.
      d = d.apply(
          contrib_data.parallel_interleave(
              tf.data.TFRecordDataset,
              sloppy=is_training,
              cycle_length=cycle_length))
      d = d.shuffle(buffer_size=100)
    else:
      d = tf.data.TFRecordDataset(input_files)
      # Since we evaluate for a fixed number of steps we don't want to encounter
      # out-of-range exceptions.
      d = d.repeat()

    # We must `drop_remainder` on training because the TPU requires fixed
    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
    # and we *don't* want to drop the remainder, otherwise we won't cover
    # every sample.
    d = d.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True))
    return d
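tf.data.experimental.map_and_batch is deprecated because a map followed by a batch is fused automatically by the tf.data runtime. A sketch of the same final step without it, reusing the snippet's own names (d, batch_size, num_cpu_threads, _decode_record):

    # Sketch: deprecated map_and_batch rewritten as map + batch
    # (assumes a TF release with automatic map/batch fusion, >= 1.14).
    d = d.map(
        lambda record: _decode_record(record, name_to_features),
        num_parallel_calls=num_cpu_threads)
    d = d.batch(batch_size, drop_remainder=True)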
Example #9
  def input_fn(self, params):
    """Input function which provides a single batch for train or eval.

    Args:
      params: `dict` of parameters passed from the `TPUEstimator`.
          `params['batch_size']` is always provided and should be used as the
          effective batch size.

    Returns:
      A (images, labels) tuple of `Tensor`s for a batch of samples.
    """
    batch_size = params['batch_size']

    if FLAGS.use_data == 'real':
      file_pattern = os.path.join(
          self.data_dir, 'train-*' if self.is_training else 'validation-*')
      dataset = tf.data.Dataset.list_files(file_pattern,
                                           shuffle=self.is_training)

      if self.is_training:
        dataset = dataset.repeat()

      def prefetch_dataset(filename):
        dataset = tf.data.TFRecordDataset(
            filename, buffer_size=FLAGS.prefetch_dataset_buffer_size)
        return dataset

      dataset = dataset.apply(
          contrib_data.parallel_interleave(
              prefetch_dataset,
              cycle_length=FLAGS.num_files_infeed,
              sloppy=True))

      if FLAGS.followup_shuffle_buffer_size > 0:
        dataset = dataset.shuffle(
            buffer_size=FLAGS.followup_shuffle_buffer_size)

      dataset = dataset.map(
          self.dataset_parser,
          num_parallel_calls=FLAGS.num_parallel_calls)

      dataset = dataset.prefetch(batch_size)

      dataset = dataset.batch(batch_size, drop_remainder=True)

      dataset = dataset.prefetch(2)  # Prefetch overlaps in-feed with training

      images, labels = dataset.make_one_shot_iterator().get_next()
    else:
      images = tf.random_uniform(
          [batch_size, FLAGS.height, FLAGS.width, 3], minval=-1, maxval=1)
      labels = tf.random_uniform(
          [batch_size], minval=0, maxval=999, dtype=tf.int32)

    images = tensor_transform_fn(images, params['pipeline_transpose_dims'])
    return images, labels
Example #10
def input_clouds_fn(filelist,
                    batch_size=32,
                    copy_size=4,
                    prefetch=1,
                    read_threads=4,
                    distribute=(1, 0)):
    """
      INPUT:
        prefetch: tf.int64. How many minibatches to prepare asynchronously on the CPU ahead of the GPU.
    """
    def parser(ser):
        """
        Decode & Pass datast in tf.record
        *Cuation*
        floating point: tfrecord data ==> tf.float64 
        """
        features = {
            "shape": tf.FixedLenFeature([3], tf.int64),
            "patch": tf.FixedLenFeature([], tf.string),
            "filename": tf.FixedLenFeature([], tf.string),
            "coordinate": tf.FixedLenFeature([2], tf.int64),
        }
        decoded = tf.parse_single_example(ser, features)
        patch = tf.reshape(
            tf.decode_raw(decoded["patch"], tf.float64), decoded["shape"]
            #tf.decode_raw(decoded["patch"], tf.float32), decoded["shape"]
        )
        print("shape check in pipeline {}".format(patch.shape), flush=True)
        #patch = tf.random_crop(patch, shape)
        #return decoded["filename"], decoded["coordinate"], patch

        # conversion of tensor
        patch = tf.cast(patch, tf.float32)
        return patch

    # check batch/copy ratio: batch_size must be an integer multiple of copy_size
    if batch_size % copy_size != 0:
        raise ValueError(
            "\n Division of batch size and copy size is not Integer \n")
    print("\n Number of actual original images == {} ".format(
        int(batch_size)))

    dataset = (tf.data.Dataset.list_files(
        filelist, shuffle=True).shard(*distribute).apply(
            parallel_interleave(
                lambda f: tf.data.TFRecordDataset(f).map(parser),
                cycle_length=read_threads,
                sloppy=True,
            )))
    dataset = dataset.shuffle(1000).repeat().batch(
        int(batch_size)).prefetch(prefetch)
    return dataset
Example #11
def read_dataset(file_read_func, input_files, config,
                 filename_shard_fn=None):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf_data.parallel_interleave, to
      read every individual file into a tf.data.Dataset.
    input_files: A list of file paths to read.
    config: An input_reader_builder.InputReader object.
    filename_shard_fn: optional, A function used to shard filenames across
      replicas. This function takes as input a TF dataset of filenames and
      is expected to return its sharded version. It is useful when the
      dataset is being loaded on one of possibly many replicas and we want
      to evenly shard the files between the replicas.

  Returns:
    A tf.data.Dataset of (undecoded) tf-records based on config.

  Raises:
    RuntimeError: If no files are found at the supplied path(s).
  """
  # Shard, shuffle, and read files.
  filenames = tf.gfile.Glob(input_files)
  if not filenames:
    raise RuntimeError('Did not find any input files matching the glob pattern '
                       '{}'.format(input_files))
  num_readers = config.num_readers
  if num_readers > len(filenames):
    num_readers = len(filenames)
    tf.logging.warning('num_readers has been reduced to %d to match input file '
                       'shards.' % num_readers)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  elif num_readers > 1:
    tf.logging.warning('`shuffle` is false, but the input data stream is '
                       'still slightly shuffled since `num_readers` > 1.')
  if filename_shard_fn:
    filename_dataset = filename_shard_fn(filename_dataset)

  filename_dataset = filename_dataset.repeat(config.num_epochs or None)
  records_dataset = filename_dataset.apply(
      tf_data.parallel_interleave(
          file_read_func,
          cycle_length=num_readers,
          block_length=config.read_block_length,
          sloppy=config.shuffle))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  return records_dataset
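A hypothetical usage sketch for read_dataset; config below is a plain namespace standing in for the input_reader_builder.InputReader proto the function expects, and the glob pattern is a placeholder:

# Hypothetical usage of read_dataset (field values are illustrative only).
import functools
import types
import tensorflow as tf

config = types.SimpleNamespace(
    num_readers=4,
    read_block_length=32,
    shuffle=True,
    filenames_shuffle_buffer_size=100,
    shuffle_buffer_size=2048,
    num_epochs=0)  # 0 (falsy) means repeat indefinitely in read_dataset

records = read_dataset(
    functools.partial(tf.data.TFRecordDataset, buffer_size=8 * 1000 * 1000),
    input_files=['/path/to/records-*'],  # placeholder glob
    config=config)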
Example #12
def input_fn(params, sequence_schema, context_schema, part_files):
    dataset = Dataset.from_tensor_slices(part_files).shuffle(len(part_files))
    dataset = dataset.apply(
        parallel_interleave(
            lambda file: TFRecordDataset(file, compression_type='GZIP'),
            cycle_length=params['cycle_length'],
            sloppy=True))
    dataset = dataset.map(partial(parse_example, context_schema,
                                  sequence_schema),
                          num_parallel_calls=cpu_count())
    dataset = dataset.apply(
        shuffle_and_repeat(params['buffer_size'], count=params['epochs']))
    dataset = dataset.batch(params['batch_size'])
    return dataset
Example #13
    def __call__(self, params):
        """Input function which provides a single batch for train or eval."""
        if self.data_dir is None:
            tf.logging.info('Using fake input.')
            return self._input_fn_null(params)

        # Retrieves the batch size for the current shard. The # of shards is
        # computed according to the input pipeline deployment. See
        # `tf.contrib.tpu.RunConfig` for details.
        batch_size = params["batch_size"]

        # Shuffle the filenames to ensure better randomization
        file_pattern = os.path.join(
            self.data_dir, "train-*" if self.is_training else "validation-*")
        dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
        if self.is_training:
            dataset = dataset.shuffle(
                buffer_size=1024)  # 1024 files in dataset

        if self.is_training:
            dataset = dataset.repeat()

        def prefetch_dataset(filename):
            buffer_size = FLAGS.prefetch_dataset_buffer_size
            dataset = tf.data.TFRecordDataset(filename,
                                              buffer_size=buffer_size)
            return dataset

        dataset = dataset.apply(
            contrib_data.parallel_interleave(
                prefetch_dataset,
                cycle_length=FLAGS.num_files_infeed,
                sloppy=True))
        dataset = dataset.shuffle(FLAGS.shuffle_buffer_size)

        dataset = dataset.map(self.dataset_parser, num_parallel_calls=128)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(
            2)  # Prefetch overlaps in-feed with training
        return dataset
Example #14
    def input_fn(params):
        def _dataset_fn(s):
            walk_length = int(params.num_edges / params.window_size)

            return RandomWalkDataset(
                walk_length,
                _constant_hidden_value(adjacency_list.neighbours,
                                       'create_neighbours'),
                _constant_hidden_value(adjacency_list.lengths,
                                       'create_lengths'),
                _constant_hidden_value(adjacency_list.offsets,
                                       'create_offsets'),
                seed=s).prefetch(params.batch_size * 2)

        if dataset_shards is None:
            dataset = _dataset_fn(params.seed)
        else:
            from tensorflow.contrib.data import parallel_interleave

            dataset = tf.data.Dataset.range(dataset_shards).apply(
                parallel_interleave(_dataset_fn,
                                    cycle_length=dataset_shards,
                                    sloppy=True))

        window = adapters.adapt_random_walk_window(params.window_size)
        add_negative_samples = relational_erm.sampling.negative_sampling.add_negative_sample(
            num_vertex, params.num_negative, seed=params.seed)

        processing_fn = adapters.compose(
            window, add_negative_samples, adapters.relabel_subgraph(),
            adapters.append_packed_vertex_labels(packed_labels.labels,
                                                 packed_labels.lengths,
                                                 packed_labels.offsets),
            adapters.add_sample_size_info(), adapters.format_features_labels())

        dataset = dataset.map(processing_fn, num_parallel_calls=12)
        dataset = dataset.prefetch(params.batch_size * 2)
        dataset = dataset.apply(
            adapters.padded_batch_samples(params.batch_size))
        return dataset.apply(tf.contrib.data.prefetch_to_device('/gpu:0'))
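tf.contrib.data.prefetch_to_device has a non-contrib counterpart in newer releases; a sketch of the last line above using tf.data.experimental instead (like the contrib version, it must remain the final transformation in the pipeline):

        # Sketch: non-contrib spelling of the final prefetch-to-GPU step.
        return dataset.apply(
            tf.data.experimental.prefetch_to_device('/gpu:0'))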
Example #15
def load_tfrecord(serialized_data,
                  shape,
                  batch_size=1,
                  read_threads=4,
                  shuffle_buffer_size=1000,
                  prefetch=1,
                  distribute=(1, 0)):
    def parser(serialized_data):
        features = {
            "shape": tf.FixedLenFeature([3], tf.int64),
            "patch": tf.FixedLenFeature([], tf.string),
            "filename": tf.FixedLenFeature([], tf.string),
            "coordinate": tf.FixedLenFeature([2], tf.int64),
        }
        decoded = tf.parse_single_example(serialized_data, features)
        # &&&&&& My output id tf.float64 !!!! &&&&&&
        patch = tf.reshape(tf.decode_raw(decoded["patch"], tf.float64),
                           decoded["shape"])
        # randomly crop mini-patches from data
        patch = tf.random_crop(patch, shape)
        print(decoded["filename"], decoded["coordinate"], patch)
        return decoded["filename"], decoded["coordinate"], patch

    # TODO: understand this code
    dataset = (tf.data.Dataset.list_files(
        serialized_data, shuffle=True).shard(*distribute).apply(
            parallel_interleave(
                lambda f: tf.data.TFRecordDataset(f).map(parser),
                cycle_length=read_threads,
                sloppy=True,
            )))

    # TODO: understand the code
    print(dataset)
    dataset = dataset.apply(
        batch_and_drop_remainder(batch_size)).prefetch(prefetch)
    return dataset
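Regarding the TODOs above: batch_and_drop_remainder(n) simply batches the dataset and discards the final partial batch so every batch has a static shape. On TF 1.10+ the same line can be written with the core API (a sketch reusing the snippet's names):

    # Sketch: core-API equivalent of contrib's batch_and_drop_remainder.
    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(prefetch)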
Example #16
    def input_fn(self, params):
        """Input function which provides a single batch for train or eval.

    Args:
      params: `dict` of parameters passed from the `TPUEstimator`.
          `params['batch_size']` is always provided and should be used as the
          effective batch size.

    Returns:
      A `tf.data.Dataset` object.
    """
        batch_size = params['batch_size']

        if FLAGS.use_data == 'real':
            assert self.data_dir, 'data_dir is required'
            file_pattern = os.path.join(
                self.data_dir,
                'train-*' if self.is_training else 'validation-*')
            dataset = tf.data.Dataset.list_files(file_pattern,
                                                 shuffle=self.is_training)

            if self.is_training:
                dataset = dataset.repeat()

            def prefetch_dataset(filename):
                dataset = tf.data.TFRecordDataset(
                    filename, buffer_size=FLAGS.prefetch_dataset_buffer_size)
                return dataset

            dataset = dataset.apply(
                contrib_data.parallel_interleave(
                    prefetch_dataset,
                    cycle_length=FLAGS.num_files_infeed,
                    sloppy=True))

            if FLAGS.followup_shuffle_buffer_size > 0:
                dataset = dataset.shuffle(
                    buffer_size=FLAGS.followup_shuffle_buffer_size)

            dataset = dataset.map(self.dataset_parser,
                                  num_parallel_calls=FLAGS.num_parallel_calls)
        else:
            random_image = tf.random.uniform([FLAGS.height, FLAGS.width, 3],
                                             minval=-1,
                                             maxval=1)
            random_label = tf.random.uniform([],
                                             minval=0,
                                             maxval=999,
                                             dtype=tf.int32)
            dataset = tf.data.Dataset.range(1).repeat().map(
                lambda data: (random_image, random_label))

        dataset = dataset.prefetch(batch_size)

        dataset = dataset.batch(batch_size, drop_remainder=True)

        dataset = dataset.prefetch(
            2)  # Prefetch overlaps in-feed with training

        def transpose_images(images):
            return tensor_transform_fn(images, params['output_perm'])

        dataset = dataset.map(lambda images, labels:
                              (transpose_images(images), labels),
                              num_parallel_calls=FLAGS.num_parallel_calls)

        return dataset
Example #17
def get_pretraining_dataset(opts, data_type, is_training=True, num_cpu_threads=4, use_static_mask=False):
    if is_training:
        input_file = opts['train_file']
    else:
        input_file = opts['test_file']
    micro_batch_size = opts['micro_batch_size']
    max_seq_length = opts['seq_length']
    max_predictions_per_seq = opts['max_predictions_per_seq']

    input_files = []
    for input_pattern in input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    if use_static_mask:
        # The masked tokens have been re-arranged to always be at the first
        # 'max_predictions_per_seq' positions.
        name_to_features = {
            "input_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_position":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "mask_padding_index":
            tf.FixedLenFeature([1], tf.int64),
            "seq_padding_index":
            tf.FixedLenFeature([1], tf.int64),
            "masked_labels":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
            tf.FixedLenFeature([1], tf.int64),
        }
    else:
        # By default, the tokens have not been re-arranged.
        name_to_features = {
            "input_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
            tf.FixedLenFeature([1], tf.int64),
        }

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
        d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
        d = d.repeat()

        # `cycle_length` is the number of parallel files that get read.
        cycle_length = min(num_cpu_threads, len(input_files))

        # `sloppy` mode means that the interleaving is not exact. This adds
        # even more randomness to the training pipeline.
        d = d.apply(parallel_interleave(tf.data.TFRecordDataset, sloppy=is_training, cycle_length=cycle_length))

        # `buffer_size` should be set large enough to shuffle the data sufficiently.
        if opts['distributed_worker_count'] > 1:
            d = d.shard(num_shards=opts['distributed_worker_count'], index=opts['distributed_worker_index'])
            d = d.shuffle(buffer_size=1000, seed=opts['seed'])
        else:
            d = d.shuffle(buffer_size=1000)
    else:
        d = tf.data.TFRecordDataset(input_files)
        d = d.repeat()

    d = d.apply(map_and_batch(
            lambda record: _decode_record(record, name_to_features, data_type),
            batch_size=micro_batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True))
    return d
Example #18
    def input_fn(params):
        """The actual input function."""
        batch_size = params['batch_size']

        name_to_features = {
            'input_ids': tf.FixedLenFeature([max_seq_length], tf.int64),
            'input_mask': tf.FixedLenFeature([max_seq_length], tf.int64),
            'segment_ids': tf.FixedLenFeature([max_seq_length], tf.int64),
            # Note: We keep this feature name `next_sentence_labels` to be
            # compatible with the original data created by lanzhzh@. However, in
            # the ALBERT case it does represent sentence_order_labels.
            'next_sentence_labels': tf.FixedLenFeature([1], tf.int64),
        }

        if FLAGS.masked_lm_budget:
            name_to_features.update(
                {
                    'token_boundary': tf.FixedLenFeature(
                        [max_seq_length], tf.int64
                    )
                }
            )
        else:
            name_to_features.update(
                {
                    'masked_lm_positions': tf.FixedLenFeature(
                        [max_predictions_per_seq], tf.int64
                    ),
                    'masked_lm_ids': tf.FixedLenFeature(
                        [max_predictions_per_seq], tf.int64
                    ),
                    'masked_lm_weights': tf.FixedLenFeature(
                        [max_predictions_per_seq], tf.float32
                    ),
                }
            )

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            d = d.repeat()
            d = d.shuffle(buffer_size = len(input_files))

            # `cycle_length` is the number of parallel files that get read.
            cycle_length = min(num_cpu_threads, len(input_files))

            # `sloppy` mode means that the interleaving is not exact. This adds
            # even more randomness to the training pipeline.
            d = d.apply(
                contrib_data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy = is_training,
                    cycle_length = cycle_length,
                )
            )
            d = d.shuffle(buffer_size = 100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to encounter
            # out-of-range exceptions.
            d = d.repeat()

        # We must `drop_remainder` on training because the TPU requires fixed
        # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
        # and we *don't* want to drop the remainder, otherwise we won't cover
        # every sample.
        d = d.apply(
            tf.data.experimental.map_and_batch_with_legacy_function(
                lambda record: _decode_record(record, name_to_features),
                batch_size = batch_size,
                num_parallel_batches = num_cpu_threads,
                drop_remainder = True,
            )
        )
        tf.logging.info(d)
        return d
Example #19
def get_batch(datasets,
              preprocess_name,
              is_training,
              batch_size,
              num_gpu=1,
              seed=None):
    with tf.device('/cpu:0'):

        num_class = datasets.num_class
        file_name = datasets.source
        feature = datasets.feature
        decoder = datasets.decoder
        name = datasets.description['name']

        image_preprocessing_fn = get_preprocess_fn(preprocess_name)

        dataset = tf.data.Dataset.from_tensor_slices(file_name)

        if is_training:
            # Shuffle the input files
            dataset = dataset.shuffle(len(file_name),
                                      seed=seed,
                                      reshuffle_each_iteration=True)
        '''  
    Convert to individual records.
    cycle_length = min(10, len(file_name)) means up to 10 files will be read and deserialized in parallel.
    This number is low enough to not cause too much contention on small systems
    but high enough to provide the benefits of parallelization. You may want
    to increase this number if you have a large number of CPU cores.
    '''

        cycle_length = min(10, len(file_name))
        dataset = dataset.apply(
            data.parallel_interleave(tf.data.TFRecordDataset,
                                     cycle_length=cycle_length))

        # We prefetch a batch at a time. This can help smooth out the time taken to
        # load input files as we go through shuffling and processing.
        dataset = dataset.prefetch(buffer_size=batch_size)

        if is_training:
            dataset = dataset.apply(
                data.shuffle_and_repeat(buffer_size=10000, seed=seed))
        else:
            dataset = dataset.repeat()

        def map_func(record):

            parsed = tf.parse_single_example(record, feature)
            image = decoder(parsed['image/encoded'])
            # Perform additional preprocessing on the parsed data.
            image = image_preprocessing_fn(image,
                                           datasets,
                                           is_training=is_training)
            label = parsed['image/class/label']

            label = tf.one_hot(label, num_class)
            return image, label

        '''
    Parse the raw records into images and labels. Testing has shown that setting
    num_parallel_batches > 1 produces no improvement in throughput, since
    batch_size is almost always much greater than the number of CPU cores.    
    '''
        dataset = dataset.apply(
            data.map_and_batch(map_func=map_func,
                               batch_size=batch_size,
                               num_parallel_batches=1))
        '''
    Operations between the final prefetch and the get_next call to the iterator
    will happen synchronously during run time. We prefetch here again to
    background all of the above processing work and keep it out of the
    critical training path.    
    '''

        dataset = dataset.prefetch(buffer_size=32)
        iterator = dataset.make_one_shot_iterator()
        return iterator
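get_batch returns a one-shot iterator rather than a dataset, so the caller pulls tensors from it directly. A hypothetical usage sketch in TF 1.x graph mode (the `datasets` object and the preprocess name 'inception' are placeholder values):

# Hypothetical usage of get_batch (TF 1.x graph mode).
iterator = get_batch(datasets, 'inception', is_training=True, batch_size=64)
images, labels = iterator.get_next()

with tf.Session() as sess:
    batch_images, batch_labels = sess.run([images, labels])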
Example #20
    def __call__(self, params):
        input_anchors = anchors.Anchors(params['min_level'],
                                        params['max_level'],
                                        params['num_scales'],
                                        params['aspect_ratios'],
                                        params['anchor_scale'],
                                        params['image_size'])
        anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                               params['num_classes'])
        example_decoder = tf_example_decoder.TfExampleDecoder()

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        image: Image tensor that is preprocessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        cls_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of class logits at l-th level.
        box_targets_dict: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensors with
          shape [height_l, width_l, num_anchors * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        num_positives: Number of positive anchors in the image.
        source_id: Source image id. Default value -1 if the source id is empty
          in the groundtruth annotation.
        image_scale: Scale of the processed image to the original image.
        image_info: image information that includes the original height and
            width, the scale of the processed image to the original image, and
            the scaled height and width.
        boxes: Groundtruth bounding box annotations. The box is represented in
          [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
          dimension [self._max_num_instances, 4].
        is_crowds: Groundtruth annotations to indicate if an annotation
          represents a group of instances by value {0, 1}. The tensor is
          padded with 0 to the fixed dimension [self._max_num_instances].
        areas: Groundtruth areas annotations. The tensor is padded with -1
          to the fixed dimension [self._max_num_instances].
        classes: Groundtruth classes annotations. The tensor is padded with -1
          to the fixed dimension [self._max_num_instances].
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                data['groundtruth_is_crowd'] = tf.cond(
                    tf.greater(tf.size(data['groundtruth_is_crowd']),
                               0), lambda: data['groundtruth_is_crowd'],
                    lambda: tf.zeros_like(data['groundtruth_classes'],
                                          dtype=tf.bool))
                source_id = data['source_id']
                image = data['image']
                boxes = data['groundtruth_boxes']
                classes = data['groundtruth_classes']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                areas = data['groundtruth_area']
                is_crowds = data['groundtruth_is_crowd']
                classes = tf.reshape(tf.cast(classes, dtype=tf.float32),
                                     [-1, 1])
                input_height = tf.shape(image)[0]
                input_width = tf.shape(image)[1]

                if params['skip_crowd_during_training'] and self._is_training:
                    indices = tf.where(
                        tf.logical_not(data['groundtruth_is_crowd']))
                    classes = tf.gather_nd(classes, indices)
                    boxes = tf.gather_nd(boxes, indices)

                input_processor = DetectionInputProcessor(
                    image, params['image_size'], boxes, classes)
                input_processor.normalize_image()
                if self._is_training and params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()
                if self._is_training:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                else:
                    input_processor.set_scale_factors_to_output_size()
                image = input_processor.resize_and_crop_image()
                boxes, classes = input_processor.resize_and_crop_boxes()

                # Assign anchors.
                (cls_targets, box_targets,
                 num_positives) = anchor_labeler.label_anchors(boxes, classes)

                source_id = tf.where(tf.equal(source_id, tf.constant('')),
                                     '-1', source_id)
                source_id = tf.string_to_number(source_id)

                # Pad groundtruth data for evaluation.
                image_scale = input_processor.image_scale_to_original
                scaled_height = tf.to_float(
                    input_height) * input_processor.image_scale
                scaled_width = tf.to_float(
                    input_width) * input_processor.image_scale
                image_info = tf.stack([
                    tf.cast(scaled_height, dtype=tf.float32),
                    tf.cast(scaled_width, dtype=tf.float32),
                    image_scale,
                    tf.cast(input_height, dtype=tf.float32),
                    tf.cast(input_width, dtype=tf.float32),
                ])
                boxes *= image_scale
                is_crowds = tf.cast(is_crowds, dtype=tf.float32)
                boxes = pad_to_fixed_size(boxes, -1,
                                          [self._max_num_instances, 4])
                is_crowds = pad_to_fixed_size(is_crowds, 0,
                                              [self._max_num_instances, 1])
                areas = pad_to_fixed_size(areas, -1,
                                          [self._max_num_instances, 1])
                classes = pad_to_fixed_size(classes, -1,
                                            [self._max_num_instances, 1])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return (image, cls_targets, box_targets, num_positives,
                        source_id, image_scale, image_info, boxes, is_crowds,
                        areas, classes)

        batch_size = params['batch_size']
        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training,
                                             seed=tf.random.set_random_seed(
                                                 int(time.time() * 1e9)))
        if self._is_training:
            dataset = dataset.repeat()

        # Prefetch data from files.
        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(
                filename, buffer_size=BUFFER_SIZE).prefetch(1)
            return dataset

        dataset = dataset.apply(
            contrib_data.parallel_interleave(_prefetch_dataset,
                                             cycle_length=32,
                                             sloppy=self._is_training))

        if params.get('dataset_private_threadpool_size', None):
            options = tf.data.Options()
            options.experimental_threading.private_threadpool_size = params[
                'dataset_private_threadpool_size']
            dataset = dataset.with_options(options)

        if params.get('dataset_max_intra_op_parallelism', None):
            options = tf.data.Options()
            options.experimental_threading.max_intra_op_parallelism = params[
                'dataset_max_intra_op_parallelism']
            dataset = dataset.with_options(options)

        if self._is_training:
            dataset = dataset.shuffle(64)

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        dataset = dataset.batch(batch_size, drop_remainder=True)

        def _process_example(images, cls_targets, box_targets, num_positives,
                             source_ids, image_scales, image_info, boxes,
                             is_crowds, areas, classes):
            """Processes one batch of data."""
            labels = {}
            # Count num_positives in a batch.
            num_positives_batch = tf.reduce_mean(num_positives)
            labels['mean_num_positives'] = tf.reshape(
                tf.tile(tf.expand_dims(num_positives_batch, 0), [
                    batch_size,
                ]), [batch_size, 1])

            for level in range(params['min_level'], params['max_level'] + 1):
                labels['cls_targets_%d' % level] = cls_targets[level]
                labels['box_targets_%d' % level] = box_targets[level]
            # Concatenate groundtruth annotations to a tensor.
            groundtruth_data = tf.concat([boxes, is_crowds, areas, classes],
                                         axis=2)
            labels['source_ids'] = source_ids
            labels['groundtruth_data'] = groundtruth_data
            labels['image_scales'] = image_scales
            labels['image_info'] = image_info
            if not self._is_training:
                return {
                    'inputs': images,
                    'image_info': image_info,
                    'labels': labels
                }
            return images, labels

        dataset = dataset.map(_process_example)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        if self._num_examples > 0:
            dataset = dataset.take(self._num_examples)
        return dataset
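The two with_options calls above each build a separate tf.data.Options object; tf.data merges options applied at different points as long as they do not conflict, so both threading knobs can also be set on a single object. A sketch under that assumption (the values are hypothetical; on recent TF 2.x releases the same fields live under options.threading rather than options.experimental_threading):

        # Sketch: both threading options set on one tf.data.Options object.
        options = tf.data.Options()
        options.experimental_threading.private_threadpool_size = 16
        options.experimental_threading.max_intra_op_parallelism = 1
        dataset = dataset.with_options(options)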
Example #21
    def __call__(self, params):
        example_decoder = tf_example_decoder.TfExampleSegmentationDecoder()

        def _dataset_parser(value):
            """Parse data to a fixed dimension input image and learning targets.

      Args:
        value: A dictionary contains an image and groundtruth annotations.

      Returns:
        A list of the following elements in order:
        image: Image tensor that is preprocessed to have normalized value and
          fixed dimension [image_size, image_size, 3]
        label: label tensor of the same spatial dimension as the image.
      """
            with tf.name_scope('parser'):
                data = example_decoder.decode(value)
                image = data['image']
                label = data['labels_class']
                label = tf.to_int32(label)
                input_processor = SegmentationInputProcessor(
                    image, params['image_size'], label)
                # The image normalization is identical to Cloud TPU ResNet.
                input_processor.normalize_image()
                if self._is_training and params['input_rand_hflip']:
                    input_processor.random_horizontal_flip()
                if self._is_training:
                    input_processor.set_training_random_scale_factors(
                        params['train_scale_min'], params['train_scale_max'])
                image = input_processor.resize_and_crop_image()

                # Set padding to background (class=0) during training.
                if self._is_training:
                    label = input_processor.resize_and_crop_label(0)
                else:
                    label = input_processor.resize_and_crop_label(
                        params['ignore_label'])
                if params['use_bfloat16']:
                    image = tf.cast(image, dtype=tf.bfloat16)
                return image, label

        batch_size = params['batch_size']

        dataset = tf.data.Dataset.list_files(self._file_pattern,
                                             shuffle=self._is_training)
        if self._is_training:
            dataset = dataset.repeat()

        def _prefetch_dataset(filename):
            dataset = tf.data.TFRecordDataset(filename).prefetch(1)
            return dataset

        dataset = dataset.apply(
            contrib_data.parallel_interleave(_prefetch_dataset,
                                             cycle_length=32,
                                             sloppy=self._is_training))
        if self._is_training:
            dataset = dataset.shuffle(64)

        dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
        dataset = dataset.prefetch(batch_size)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        return dataset