def testBucketBySeqLength(self):
        """Bucketed batches are correctly padded, complete, and vary in size."""

        def example_len(ex):
            # An example's length is the leading dimension of its "inputs".
            return tf.shape(ex["inputs"])[0]

        bucket_boundaries = [10, 20, 30]
        bucket_batch_sizes = [10, 8, 4, 2]
        window = 40

        examples = data_reader.read_examples(
            self.problem,
            self.filepatterns[0],
            32,
            mode=tf.contrib.learn.ModeKeys.EVAL)
        bucketed = data_reader.bucket_by_sequence_length(
            examples, example_len,
            bucket_boundaries, bucket_batch_sizes, window)
        next_batch = bucketed.make_one_shot_iterator().get_next()

        seen_values = []
        seen_batch_sizes = []
        with tf.train.MonitoredSession() as sess:
            # Keep pulling batches until OutOfRangeError ends the loop.
            while True:
                padded_inputs = sess.run(next_batch)["inputs"]
                num_rows, padded_len = padded_inputs.shape
                seen_batch_sizes.append(num_rows)
                for row in padded_inputs:
                    value = row[0]
                    seen_values.append(value)
                    # Each example was built as its value repeated value+1
                    # times (e.g. value 7 appears 8 times); everything after
                    # that in the row must be zero padding.
                    run_len = value + 1
                    expected_row = (
                        [value] * run_len + [0] * (padded_len - run_len))
                    self.assertAllEqual(expected_row, row)

        # Every one of the 30 input values must have been seen exactly once.
        self.assertEqual(list(range(30)), sorted(seen_values))
        # More than one distinct batch size must have been observed.
        distinct_sizes = set(seen_batch_sizes)
        self.assertTrue(len(distinct_sizes) > 1)
Esempio n. 2
0
  def testBucketBySeqLength(self):
    """Checks padding, coverage and batch-size variety of length bucketing."""

    def example_len(ex):
      # An example's length is the leading dimension of its "inputs".
      return tf.shape(ex["inputs"])[0]

    bucket_boundaries = [10, 20, 30]
    bucket_batch_sizes = [10, 8, 4, 2]

    examples = self.problem.dataset(
        tf.estimator.ModeKeys.TRAIN,
        data_dir=self.data_dir,
        shuffle_files=False)
    bucketed = data_reader.bucket_by_sequence_length(
        examples, example_len, bucket_boundaries, bucket_batch_sizes)
    next_batch = bucketed.make_one_shot_iterator().get_next()

    seen_values = []
    seen_batch_sizes = []
    with tf.train.MonitoredSession() as sess:
      # Keep pulling batches until OutOfRangeError ends the loop.
      while True:
        padded_inputs = sess.run(next_batch)["inputs"]
        num_rows, padded_len = padded_inputs.shape
        seen_batch_sizes.append(num_rows)
        for row in padded_inputs:
          value = row[0]
          seen_values.append(value)
          # Each example was built as its value repeated value+1 times
          # (e.g. value 7 appears 8 times); the rest of the row must be
          # zero padding.
          run_len = value + 1
          expected_row = [value] * run_len + [0] * (padded_len - run_len)
          self.assertAllEqual(expected_row, row)

    # Every one of the 30 input values must have been seen exactly once.
    self.assertEqual(list(range(30)), sorted(seen_values))
    # More than one distinct batch size must have been observed.
    distinct_sizes = set(seen_batch_sizes)
    self.assertTrue(len(distinct_sizes) > 1)
Esempio n. 3
0
    def input_fn(self,  # noqa: C901
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hparams
            data_dir: str, data directory; if None, will use hparams.data_dir
            params: dict, may include "batch_size"; only read when
                config.use_tpu is set
            config: RunConfig; should have the data_parallelism attribute if not using
                TPU
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called. The dict is copied first, so the caller's
                object is never modified.

        Returns:
            (features_dict<str name, Tensor feature>, Tensor targets). In PREDICT
            mode the targets tensor is moved to features["infer_targets"] and
            the returned targets are None.
        """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = 4 if is_training else 1

        max_length = self.max_length(hparams)

        def tpu_valid_size(example):
            # TPU path: always enforce both the min and max length bounds.
            return data_reader.example_valid_size(
                example,
                hparams.min_length,
                max_length
            )

        def gpu_valid_size(example):
            # Non-TPU path: the upper bound is effectively lifted (10**9)
            # unless training or hparams.eval_drop_long_sequences is set.
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example,
                hparams.min_length,
                max_length
                if drop_long_sequences else 10**9
            )

        def define_shapes(example):
            # Short-circuits to a falsy value unless running on TPU, in which
            # case params["batch_size"] is forwarded.
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        # Copy before updating so the caller's dict is not mutated.
        dataset_kwargs = dict(dataset_kwargs or {})
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams,
            "partition_id": partition_id,
            "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if is_training:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                self.filepattern(data_dir, mode))
            # In continuous_train_and_eval when switching between train and
            # eval, this input_fn method gets called multiple times and it
            # would give you the exact same samples from the last call
            # (because the Graph seed is set). So this skip gives you some
            # shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(
            data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

        # Decide whether hparams.batch_size counts tokens or examples.
        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        elif _are_shapes_fully_defined(dataset.output_shapes):
            batch_size_means_tokens = False
        else:
            tf.logging.warning(
                "Shapes are not fully defined. Assuming batch_size means tokens. "
                "Override batch_size_means_tokens() "
                "in your problem subclass if this is undesired behavior.")
            batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                    tf.contrib.data.batch_and_drop_remainder(batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                    tf.contrib.data.padded_batch_and_drop_remainder(
                        batch_size, padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=(config and config.data_parallelism.n) or 1,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                    dataset, data_reader.example_length,
                    batching_scheme["boundaries"],
                    batching_scheme["batch_sizes"])

                if not is_training:

                    def _pad_batch(features):
                        # Nothing to pad with zero or one datashard.
                        if not config or config.data_parallelism.n <= 1:
                            return features
                        tf.logging.warning(
                            "Padding the batch to ensure that remainder eval batches have "
                            "a batch size divisible by the number of data shards. This may "
                            "lead to incorrect metrics for non-zero-padded features, e.g. "
                            "images. Use a single datashard (i.e. 1 GPU) in that case.")
                        return pad_batch(features, config.data_parallelism.n)

                    dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features, (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, data_reader.DummyQueueRunner())

        return features, features["targets"]
Esempio n. 4
0
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
          mode: tf.estimator.ModeKeys
          hparams: HParams, model hparams
          data_dir: str, data directory; if None, will use hparams.data_dir
          params: dict, may include "batch_size"; only read when
            config.use_tpu is set
          config: RunConfig; should have the data_parallelism attribute if not
            using TPU
          dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
            method when called. The dict is copied first, so the caller's
            object is never modified.

        Returns:
          (features_dict<str name, Tensor feature>, Tensor targets). In PREDICT
          mode the targets tensor is moved to features["infer_targets"] and the
          returned targets are None.
        """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        num_threads = 4 if is_training else 1

        def tpu_valid_size(example):
            # TPU path: always enforce both the min and max length bounds.
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  hparams.max_length)

        def gpu_valid_size(example):
            # Non-TPU path: the upper bound is effectively lifted (10**9)
            # unless training or hparams.eval_drop_long_sequences is set.
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                hparams.max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            # Short-circuits to a falsy value unless running on TPU, in which
            # case params["batch_size"] is forwarded.
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        # Copy before updating so the caller's dict is not mutated.
        dataset_kwargs = dict(dataset_kwargs or {})
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams
        })

        dataset = self.dataset(**dataset_kwargs)
        dataset = dataset.map(data_reader.cast_int64_to_int32,
                              num_parallel_calls=num_threads)
        if is_training:
            dataset = dataset.repeat(None)

        # Batching
        if _are_shapes_fully_defined(dataset.output_shapes):
            # Static shape features (e.g. images)
            if config and config.use_tpu:
                tpu_batch_size = params["batch_size"]
                dataset = dataset.apply(
                    tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                dataset = dataset.batch(hparams.batch_size * num_shards)
        else:
            # Variable length features
            if config and config.use_tpu:
                # On TPU, pad to hparams.max_length
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = _fill_shape_nones(
                    dataset.output_shapes, none_filler=hparams.max_length)
                dataset = dataset.apply(
                    tf.contrib.data.padded_batch_and_drop_remainder(
                        params["batch_size"], padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=(config and config.data_parallelism.n)
                    or 1,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # A single bucket with one fixed batch size.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                    dataset, data_reader.example_length,
                    batching_scheme["boundaries"],
                    batching_scheme["batch_sizes"])

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(1)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features,
                                (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return features, features["targets"]
Esempio n. 5
0
  def input_fn(self,
               mode,
               hparams,
               data_dir=None,
               params=None,
               config=None,
               dataset_kwargs=None):
    """Builds input pipeline for problem.

    Args:
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams
      data_dir: str, data directory; if None, will use hparams.data_dir when
        hparams has that attribute
      params: dict, may include "batch_size"; only read when config.use_tpu
        is set
      config: RunConfig; should have the data_parallelism attribute if not using
        TPU
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called. The dict is copied first, so the caller's object
        is never modified.

    Returns:
      The batched, prefetched dataset (not an iterator or tensors). Each
      element is (features_dict, features_dict["targets"]); in PREDICT mode
      each element is only the features dict, with "targets" renamed to
      "infer_targets".
    """
    partition_id, num_partitions = self._dataset_partition(mode, config)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if config and config.use_tpu:
      num_threads = 64
    else:
      num_threads = 4 if is_training else 1

    max_length = self.max_length(hparams)

    def tpu_valid_size(example):
      # TPU path: always enforce both the min and max length bounds.
      return data_reader.example_valid_size(example, hparams.min_length,
                                            max_length)

    def gpu_valid_size(example):
      # Non-TPU path: the upper bound is effectively lifted (10**9) unless
      # training or hparams.eval_drop_long_sequences is set.
      drop_long_sequences = is_training or hparams.eval_drop_long_sequences
      return data_reader.example_valid_size(example, hparams.min_length,
                                            max_length
                                            if drop_long_sequences else 10**9)

    def define_shapes(example):
      # Short-circuits to a falsy value unless running on TPU, in which case
      # params["batch_size"] is forwarded.
      batch_size = config and config.use_tpu and params["batch_size"]
      return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

    # Copy before updating so the caller's dict is not mutated.
    dataset_kwargs = dict(dataset_kwargs or {})
    dataset_kwargs.update({
        "mode": mode,
        "data_dir": data_dir,
        "num_threads": num_threads,
        "hparams": hparams,
        "partition_id": partition_id,
        "num_partitions": num_partitions,
    })

    dataset = self.dataset(**dataset_kwargs)
    if is_training:
      # Repeat and skip a random number of records
      dataset = dataset.repeat()
      data_files = tf.contrib.slim.parallel_reader.get_data_files(
          self.filepattern(data_dir, mode))
      #  In continuous_train_and_eval when switching between train and
      #  eval, this input_fn method gets called multiple times and it
      #  would give you the exact same samples from the last call
      #  (because the Graph seed is set). So this skip gives you some
      #  shuffling.
      dataset = skip_random_fraction(dataset, data_files[0])

    dataset = dataset.map(
        data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

    # Decide whether hparams.batch_size counts tokens or examples.
    if self.batch_size_means_tokens:
      batch_size_means_tokens = True
    elif _are_shapes_fully_defined(dataset.output_shapes):
      batch_size_means_tokens = False
    else:
      tf.logging.warning(
          "Shapes are not fully defined. Assuming batch_size means tokens.")
      batch_size_means_tokens = True

    # Batching
    if not batch_size_means_tokens:
      # Batch size means examples per datashard.
      if config and config.use_tpu:
        # on TPU, we use params["batch_size"], which specifies the number of
        # examples across all datashards
        batch_size = params["batch_size"]
        dataset = dataset.batch(batch_size, drop_remainder=True)
      else:
        num_shards = config.data_parallelism.n if config else 1
        batch_size = hparams.batch_size * num_shards
        dataset = dataset.batch(batch_size)
    else:
      # batch_size means tokens per datashard
      if config and config.use_tpu:
        dataset = dataset.filter(tpu_valid_size)
        padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
        # on TPU, we use params["batch_size"], which specifies the number of
        # examples across all datashards
        batch_size = params["batch_size"]
        dataset = dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                batch_size, padded_shapes))
      else:
        # On GPU, bucket by length
        dataset = dataset.filter(gpu_valid_size)
        shard_multiplier = config.data_parallelism.n if config else 1
        batching_scheme = data_reader.hparams_to_batching_scheme(
            hparams,
            shard_multiplier=shard_multiplier,
            length_multiplier=self.get_hparams().batch_size_multiplier)
        if hparams.use_fixed_batch_size:
          # Here batch_size really means examples per datashard.
          batching_scheme["batch_sizes"] = [hparams.batch_size]
          batching_scheme["boundaries"] = []
        dataset = data_reader.bucket_by_sequence_length(
            dataset, data_reader.example_length, batching_scheme["boundaries"],
            batching_scheme["batch_sizes"])

        if not is_training:
          batch_multiple = shard_multiplier
          if hparams.use_fixed_batch_size:
            # Make sure the last batch has the same fixed size as the rest.
            batch_multiple *= hparams.batch_size
          if batch_multiple > 1:
            tf.logging.warning(
                "Padding the batch to ensure that remainder eval batches have "
                "a batch size divisible by the number of data shards. This may "
                "lead to incorrect metrics for non-zero-padded features, e.g. "
                "images. Use a single datashard (i.e. 1 GPU) in that case.")
            dataset = dataset.map(
                functools.partial(pad_batch, batch_multiple=batch_multiple),
                num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

    def prepare_for_output(example):
      # Shapes each element into what this method's dataset yields:
      # features only in PREDICT mode, (features, targets) otherwise.
      if not config or not config.use_tpu:
        _summarize_features(example,
                            (config and config.data_parallelism.n) or 1)
      if mode == tf.estimator.ModeKeys.PREDICT:
        example["infer_targets"] = example.pop("targets")
        return example
      else:
        return example, example["targets"]

    dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(2)

    if mode == tf.estimator.ModeKeys.PREDICT:
      # This is because of a bug in the Estimator that short-circuits prediction
      # if it doesn't see a QueueRunner. DummyQueueRunner implements the
      # minimal expected interface but does nothing.
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           data_reader.DummyQueueRunner())

    return dataset
Esempio n. 6
0
  def input_fn(self, mode, hparams, data_dir=None, params=None, config=None,
               dataset_kwargs=None):
    """Builds input pipeline for problem.

    Args:
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams
      data_dir: str, data directory; if None, will use hparams.data_dir
      params: dict, may include "batch_size"; only read when config.use_tpu
        is set
      config: RunConfig; should have the data_parallelism attribute if not using
        TPU
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called. The dict is copied first, so the caller's object
        is never modified.

    Returns:
      (features_dict<str name, Tensor feature>, Tensor targets). In PREDICT
      mode the targets tensor is moved to features["infer_targets"] and the
      returned targets are None.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    num_threads = 4 if is_training else 1

    def tpu_valid_size(example):
      # TPU path: always enforce both the min and max length bounds.
      return data_reader.example_valid_size(example, hparams.min_length,
                                            hparams.max_length)

    def gpu_valid_size(example):
      # Non-TPU path: the upper bound is effectively lifted (10**9) unless
      # training or hparams.eval_drop_long_sequences is set.
      drop_long_sequences = is_training or hparams.eval_drop_long_sequences
      return data_reader.example_valid_size(
          example,
          hparams.min_length,
          hparams.max_length if drop_long_sequences else 10**9)

    def define_shapes(example):
      # Short-circuits to a falsy value unless running on TPU, in which case
      # params["batch_size"] is forwarded.
      batch_size = config and config.use_tpu and params["batch_size"]
      return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or hparams.data_dir

    # Copy before updating so the caller's dict is not mutated.
    dataset_kwargs = dict(dataset_kwargs or {})
    dataset_kwargs.update({
        "mode": mode,
        "data_dir": data_dir,
        "num_threads": num_threads,
        "hparams": hparams})

    dataset = self.dataset(**dataset_kwargs)
    dataset = dataset.map(
        data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
    if is_training:
      dataset = dataset.repeat(None)

    # Batching
    if _are_shapes_fully_defined(dataset.output_shapes):
      # Static shape features (e.g. images)
      if config and config.use_tpu:
        tpu_batch_size = params["batch_size"]
        dataset = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
      else:
        num_shards = (config and config.data_parallelism.n) or 1
        dataset = dataset.batch(hparams.batch_size * num_shards)
    else:
      # Variable length features
      if config and config.use_tpu:
        # On TPU, pad to hparams.max_length
        dataset = dataset.filter(tpu_valid_size)
        padded_shapes = _fill_shape_nones(
            dataset.output_shapes, none_filler=hparams.max_length)
        dataset = dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                params["batch_size"], padded_shapes))
      else:
        # On GPU, bucket by length
        dataset = dataset.filter(gpu_valid_size)
        batching_scheme = data_reader.hparams_to_batching_scheme(
            hparams,
            shard_multiplier=(config and config.data_parallelism.n) or 1,
            length_multiplier=self.get_hparams().batch_size_multiplier)
        if hparams.use_fixed_batch_size:
          # A single bucket with one fixed batch size.
          batching_scheme["batch_sizes"] = [hparams.batch_size]
          batching_scheme["boundaries"] = []
        dataset = data_reader.bucket_by_sequence_length(
            dataset,
            data_reader.example_length,
            batching_scheme["boundaries"],
            batching_scheme["batch_sizes"])

        if not is_training:
          def _pad_batch(features):
            # Nothing to pad with zero or one datashard.
            if not config or config.data_parallelism.n <= 1:
              return features
            tf.logging.warn(
                "Padding the batch to ensure that remainder eval batches have "
                "a batch size divisible by the number of data shards. This may "
                "lead to incorrect metrics for non-zero-padded features, e.g. "
                "images. Use a single datashard (i.e. 1 GPU) in that case.")
            return pad_batch(features, config.data_parallelism.n)

          dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(1)
    features = dataset.make_one_shot_iterator().get_next()
    if not config or not config.use_tpu:
      _summarize_features(features, (config and config.data_parallelism.n) or 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
      features["infer_targets"] = features["targets"]
      features["targets"] = None
      # This is because of a bug in the Estimator that short-circuits prediction
      # if it doesn't see a QueueRunner. DummyQueueRunner implements the
      # minimal expected interface but does nothing.
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           data_reader.DummyQueueRunner())

    return features, features["targets"]