Example #1
    def input_fn(self,  # noqa: C901
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hparams
            data_dir: str, data directory; if None, will use hparams.data_dir
            params: dict, may include "batch_size"
            config: RunConfig; should have the data_parallelism attribute if not using
                TPU
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called

        Returns:
            (features_dict<str name, Tensor feature>, Tensor targets)
        """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = 4 if is_training else 1

        max_length = self.max_length(hparams)

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
                "mode": mode,
                "data_dir": data_dir,
                "num_threads": num_threads,
                "hparams": hparams,
                "partition_id": partition_id,
                "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if is_training:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                    self.filepattern(data_dir, mode))
            # In continuous_train_and_eval when switching between train and
            # eval, this input_fn method gets called multiple times and it
            # would give you the exact same samples from the last call
            # (because the Graph seed is set). So this skip gives you some
            # shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(
                data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        else:
            if _are_shapes_fully_defined(dataset.output_shapes):
                batch_size_means_tokens = False
            else:
                tf.logging.warning(
                        "Shapes are not fully defined. Assuming batch_size means tokens. "
                        "Override batch_size_means_tokens() "
                        "in your problem subclass if this is undesired behavior.")
                batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                        tf.contrib.data.batch_and_drop_remainder(batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                        tf.contrib.data.padded_batch_and_drop_remainder(
                                batch_size, padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                        hparams,
                        shard_multiplier=(config and config.data_parallelism.n) or 1,
                        length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                        dataset, data_reader.example_length, batching_scheme["boundaries"],
                        batching_scheme["batch_sizes"])

                if not is_training:

                    def _pad_batch(features):
                        if not config or config.data_parallelism.n <= 1:
                            return features
                        tf.logging.warn(
                                "Padding the batch to ensure that remainder eval batches have "
                                "a batch size divisible by the number of data shards. This may "
                                "lead to incorrect metrics for non-zero-padded features, e.g. "
                                "images. Use a single datashard (i.e. 1 GPU) in that case.")
                        return pad_batch(features, config.data_parallelism.n)

                    dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features, (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, data_reader.DummyQueueRunner())

        return features, features["targets"]
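
This first variant returns a (features, targets) tuple, which is what a TF 1.x
tf.estimator.Estimator expects from an input_fn. As a rough usage sketch (the
my_problem, my_model_fn, run_config, and my_hparams names are hypothetical and
not defined in the snippet above):

import functools
import tensorflow as tf

# Bind everything except what Estimator supplies; Estimator invokes the
# input_fn itself, so we hand it a zero-argument callable.
train_input_fn = functools.partial(
    my_problem.input_fn,          # hypothetical Problem instance
    tf.estimator.ModeKeys.TRAIN,
    my_hparams,                   # hypothetical HParams
    data_dir="/tmp/t2t_data",     # hypothetical path
    config=run_config)            # hypothetical RunConfig

estimator = tf.estimator.Estimator(model_fn=my_model_fn, config=run_config)
estimator.train(input_fn=lambda: train_input_fn(params={}), max_steps=1000)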
Example #2
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hparams
            data_dir: str, data directory; if None, will use hparams.data_dir
            params: dict, may include "batch_size"
            config: RunConfig; should have the data_parallelism attribute if not using
                TPU
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called

        Returns:
            (features_dict<str name, Tensor feature>, Tensor targets)
        """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        num_threads = 4 if is_training else 1

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  hparams.max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                hparams.max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams
        })

        dataset = self.dataset(**dataset_kwargs)
        dataset = dataset.map(data_reader.cast_int64_to_int32,
                              num_parallel_calls=num_threads)
        if is_training:
            dataset = dataset.repeat(None)

        # Batching
        if _are_shapes_fully_defined(dataset.output_shapes):
            # Static shape features (e.g. images)
            if config and config.use_tpu:
                tpu_batch_size = params["batch_size"]
                dataset = dataset.apply(
                    tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                dataset = dataset.batch(hparams.batch_size * num_shards)
        else:
            # Variable length features
            if config and config.use_tpu:
                # On TPU, pad to hparams.max_length
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = _fill_shape_nones(
                    dataset.output_shapes, none_filler=hparams.max_length)
                dataset = dataset.apply(
                    tf.contrib.data.padded_batch_and_drop_remainder(
                        params["batch_size"], padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=(config and config.data_parallelism.n)
                    or 1,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                    dataset, data_reader.example_length,
                    batching_scheme["boundaries"],
                    batching_scheme["batch_sizes"])

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(1)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features,
                                (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return features, features["targets"]
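
Both variants above lean on module-level helpers that are not included in these
snippets. Minimal sketches of what they could look like, inferred from the call
sites (assumptions, not necessarily the library's exact code):

def _are_shapes_fully_defined(shapes_dict):
    # True iff every tf.TensorShape in the dict has no unknown dimensions.
    return all(shape.is_fully_defined() for shape in shapes_dict.values())


def _fill_shape_nones(shapes_dict, none_filler=None):
    # Replace unknown (None) dimensions with none_filler (e.g. max_length)
    # so the result can serve as padded_shapes for a padded batch.
    return {
        name: [dim if dim is not None else none_filler
               for dim in shape.as_list()]
        for name, shape in shapes_dict.items()
    }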
Example #3
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 force_repeat=False,
                 prevent_repeat=False,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hparams
            data_dir: str, data directory; if None, will use hparams.data_dir
            params: dict, may include "batch_size"
            config: RunConfig; should have the data_parallelism attribute if not using
                TPU
            force_repeat: bool, whether to repeat the data even if not training
            prevent_repeat: bool, whether to skip repeating even in training mode.
                Overrides force_repeat.
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called

        Returns:
            A tf.data.Dataset yielding (features_dict<str name, Tensor feature>,
            Tensor targets) tuples (features_dict only in PREDICT mode).
        """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = cpu_count() if is_training else 1

        if config and hasattr(config,
                              "data_parallelism") and config.data_parallelism:
            num_shards = config.data_parallelism.n
        else:
            num_shards = 1

        max_length = self.max_length(hparams)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                     value=max_length)

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or (hasattr(hparams, "data_dir")
                                and hparams.data_dir)

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams,
            "partition_id": partition_id,
            "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if (force_repeat or is_training) and not prevent_repeat:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()

        if is_training and self.skip_random_fraction_when_training:
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                self.filepattern(data_dir, mode))
            #  In continuous_train_and_eval when switching between train and
            #  eval, this input_fn method gets called multiple times and it
            #  would give you the exact same samples from the last call
            #  (because the Graph seed is set). So this skip gives you some
            #  shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(data_reader.cast_ints_to_int32,
                              num_parallel_calls=num_threads)

        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        else:
            if _are_shapes_fully_defined(dataset.output_shapes):
                batch_size_means_tokens = False
            else:
                tf.logging.warning(
                    "Shapes are not fully defined. Assuming batch_size means tokens."
                )
                batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.batch(batch_size, drop_remainder=True)
            else:
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes,
                                                  hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                if hparams.pad_batch:
                    tf.logging.warn(
                        "Padding the batch to ensure that remainder eval batches are "
                        "processed. This may lead to incorrect metrics for "
                        "non-zero-padded features, e.g. images. Use a smaller batch "
                        "size that has no remainder in that case.")
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=False)
                    dataset = dataset.map(
                        functools.partial(pad_batch, batch_multiple=batch_size),
                        num_parallel_calls=num_threads)
                else:
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=True)
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=num_shards,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = dataset.apply(
                    tf.contrib.data.bucket_by_sequence_length(
                        data_reader.example_length,
                        batching_scheme["boundaries"],
                        batching_scheme["batch_sizes"]))

                if not is_training:
                    batch_multiple = num_shards
                    if hparams.use_fixed_batch_size:
                        # Make sure the last batch has the same fixed size as the rest.
                        batch_multiple *= hparams.batch_size
                    if batch_multiple > 1:
                        tf.logging.warn(
                            "Padding the batch to ensure that remainder eval batches have "
                            "a batch size divisible by the number of data shards. This may "
                            "lead to incorrect metrics for non-zero-padded features, e.g. "
                            "images. Use a single datashard (i.e. 1 GPU) in that case."
                        )
                        dataset = dataset.map(
                            functools.partial(pad_batch,
                                              batch_multiple=batch_multiple),
                            num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

        # Add shuffling for training batches. This is necessary along with record
        # level shuffling in the dataset generation. Record shuffling will shuffle
        # the examples. However, in some cases, it's possible that the shuffle
        # buffer size for record shuffling is smaller than the batch size. In such
        # cases, adding batch shuffling ensures that the data is in random order
        # during training.
        if (is_training and hasattr(hparams, "batch_shuffle_size")
                and hparams.batch_shuffle_size):
            dataset = dataset.shuffle(hparams.batch_shuffle_size)

        def prepare_for_output(example):
            if not config or not config.use_tpu:
                _summarize_features(example, num_shards)
            if mode == tf.estimator.ModeKeys.PREDICT:
                example["infer_targets"] = example.pop("targets")
                return example
            else:
                return example, example["targets"]

        dataset = dataset.map(prepare_for_output,
                              num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return dataset
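
Examples #1 and #3 also call a pad_batch helper that is not shown. A plausible
sketch given its call sites, zero-padding the batch dimension up to the next
multiple of batch_multiple (an assumption, not the library's exact
implementation):

import tensorflow as tf

def pad_batch(features, batch_multiple):
    # Zero-pad the leading (batch) dimension of every feature so the
    # batch size becomes divisible by batch_multiple.
    any_feature = next(iter(features.values()))
    batch_size = tf.shape(any_feature)[0]
    pad = (batch_multiple - batch_size % batch_multiple) % batch_multiple
    padded = {}
    for name, tensor in features.items():
        paddings = [[0, pad]] + [[0, 0]] * (tensor.shape.ndims - 1)
        padded[name] = tf.pad(tensor, paddings)
    return padded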
Example #4
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 force_repeat=False,
                 prevent_repeat=False,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.
        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hp
            data_dir: str, data directory; if None, will use hp.data_dir
            params: dict, may include "batch_size"
            force_repeat: bool, whether to repeat the data even if not training
            prevent_repeat: bool, whether to not repeat when in training mode.
                Overrides force_repeat.
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called
        Returns:
            (features_dict<str name, Tensor feature>, Tensor targets)
        """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        num_threads = problem.cpu_count() if is_training else 1

        # def gpu_valid_size(example):
        #     return data_utils.example_valid_size(example, hparams.min_length, hparams.max_length)

        # Read and preprocess
        data_dir = data_dir or (hasattr(hparams, "data_dir")
                                and hparams.data_dir)

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams
        })

        dataset = self.dataset(**dataset_kwargs)
        if (force_repeat or is_training) and not prevent_repeat:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()

        dataset = dataset.map(data_reader.cast_ints_to_int32,
                              num_parallel_calls=num_threads)

        # dataset = dataset.filter(gpu_valid_size)

        dataset = dataset.apply(
            tf.contrib.data.bucket_by_sequence_length(
                data_utils.example_length, [], [hparams.batch_size]))

        def prepare_for_output(example):
            problem._summarize_features(example, 1)
            return example

        dataset = dataset.map(prepare_for_output,
                              num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return dataset
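
The bucketing primitive these variants rely on can be tried in isolation. A
self-contained toy run of tf.contrib.data.bucket_by_sequence_length under
TF 1.x (the lengths, boundaries, and batch sizes are made up for illustration).
Note that with empty boundaries and a single batch size, as in the last example
above, it degenerates to plain fixed-size padded batching:

import tensorflow as tf  # TF 1.x, where tf.contrib is available

# Toy dataset of variable-length int32 sequences.
lengths = [3, 5, 2, 7, 4, 6]
dataset = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in lengths),
    output_types=tf.int32,
    output_shapes=tf.TensorShape([None]))

# Boundaries [4, 6] define three buckets: len < 4, 4 <= len < 6, len >= 6.
# Each bucket pads its members to a common length and emits batches of 2.
dataset = dataset.apply(
    tf.contrib.data.bucket_by_sequence_length(
        element_length_func=lambda seq: tf.shape(seq)[0],
        bucket_boundaries=[4, 6],
        bucket_batch_sizes=[2, 2, 2]))

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    while True:
        try:
            print(sess.run(batch))
        except tf.errors.OutOfRangeError:
            break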