Example #1
        def _input_fn():
            context_feature_columns, example_feature_columns, _ = (
                _get_feature_columns())
            context_feature_spec = tf.feature_column.make_parse_example_spec(
                list(context_feature_columns.values()))

            label_column = tf.feature_column.numeric_column(
                _LABEL_FEATURE, dtype=tf.float32, default_value=_PADDING_LABEL)
            weight_column = (_get_example_weight_feature_column()
                             if weights_feature_name == _EXAMPLE_WEIGHT_FEATURE
                             else None)
            example_fc_list = (list(example_feature_columns.values()) +
                               [label_column] +
                               ([weight_column] if weight_column else []))
            example_feature_spec = tf.feature_column.make_parse_example_spec(
                example_fc_list)

            dataset = data.build_ranking_dataset(
                file_pattern=self._data_file,
                data_format=data.ELWC,
                batch_size=10,
                context_feature_spec=context_feature_spec,
                example_feature_spec=example_feature_spec,
                list_size=2,
                reader=tf.data.TFRecordDataset,
                size_feature_name=_SIZE)
            # Pull one batch of parsed features, then split out the label,
            # squeezing it from [batch, list_size, 1] to [batch, list_size].
            features = tf.compat.v1.data.make_one_shot_iterator(
                dataset).get_next()
            label = tf.squeeze(features.pop(_LABEL_FEATURE), axis=2)
            return features, label
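For context, an input_fn like this is typically handed to a `tf.estimator` ranker. A minimal usage sketch, where `model_fn` and `_NUM_TRAIN_STEPS` are hypothetical placeholders not defined in the example above:

# Hedged usage sketch: `model_fn` and `_NUM_TRAIN_STEPS` are assumed
# placeholders, not part of the example above.
ranker = tf.estimator.Estimator(model_fn=model_fn)
ranker.train(input_fn=_input_fn, max_steps=_NUM_TRAIN_STEPS)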
Example #2
    def _make_dataset(self,
                      batch_size,
                      list_size,
                      input_pattern,
                      randomize_input=True,
                      num_epochs=None):
        """Builds a dataset for the TF-Ranking model.

    Args:
      batch_size: (int) The number of input examples to process per batch. Use
        params['batch_size'] for TPUEstimator, and `batch_size` for Estimator.
      list_size: (int) The list size for an ELWC example.
      input_pattern: (str) File pattern for the input data.
      randomize_input: (bool) If true, randomize input example order. It should
        almost always be true except for unittest/debug purposes.
      num_epochs: (int) The number of times the input dataset must be repeated.
        None to repeat the data indefinitely.

    Returns:
      A tuple of (feature tensors, label tensor).
    """
        context_feature_spec = tf.feature_column.make_parse_example_spec(
            self._context_feature_columns.values())

        # Parse the label as just another per-example feature; entries added
        # by list padding default to _PADDING_LABEL.
        label_column = tf.feature_column.numeric_column(
            self._label_feature_name,
            dtype=self._label_feature_type,
            default_value=_PADDING_LABEL)
        example_feature_spec = tf.feature_column.make_parse_example_spec(
            list(self._example_feature_columns.values()) + [label_column])

        dataset = tfr_data.build_ranking_dataset(
            file_pattern=input_pattern,
            data_format=tfr_data.ELWC,
            batch_size=batch_size,
            list_size=list_size,
            context_feature_spec=context_feature_spec,
            example_feature_spec=example_feature_spec,
            reader=self._dataset_reader,
            reader_args=None,
            num_epochs=num_epochs,
            shuffle=randomize_input,
            shuffle_buffer_size=1000,
            shuffle_seed=None,
            prefetch_buffer_size=10000,
            reader_num_threads=64,
            sloppy_ordering=True,
            drop_final_batch=False,
            num_parser_threads=None,
            size_feature_name=self._size_feature_name)

        return dataset.map(self._features_and_labels)
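The `_features_and_labels` mapper is defined elsewhere in the class. A plausible sketch of it, assuming the label was parsed into the feature dict under `self._label_feature_name` per the spec above:

    def _features_and_labels(self, features):
        # Hypothetical sketch: pop the label from the parsed feature dict,
        # cast it to float32, and squeeze it from [batch, list_size, 1] to
        # [batch, list_size].
        label = tf.cast(features.pop(self._label_feature_name), tf.float32)
        return features, tf.squeeze(label, axis=2)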
Example #3
    def test_build_ranking_dataset_reader_num_threads(self,
                                                      reader_num_threads):
        with tf.Graph().as_default():
            # Write serialized EIE protos to a TFRecord file in a temp folder.
            serialized_example_in_examples = [
                _example_in_example(CONTEXT_1, EXAMPLES_1).SerializeToString(),
                _example_in_example(CONTEXT_2, EXAMPLES_2).SerializeToString(),
            ] * 5
            data_dir = tf.compat.v1.test.get_temp_dir()
            data_file = os.path.join(data_dir, "test_ranking_data.tfrecord")
            if tf.io.gfile.exists(data_file):
                tf.io.gfile.remove(data_file)

            with tf.io.TFRecordWriter(data_file) as writer:
                for serialized_eie in serialized_example_in_examples:
                    writer.write(serialized_eie)

            batched_dataset = data_lib.build_ranking_dataset(
                file_pattern=data_file,
                data_format=data_lib.EIE,
                batch_size=2,
                list_size=2,
                context_feature_spec=CONTEXT_FEATURE_SPEC,
                example_feature_spec=EXAMPLE_FEATURE_SPEC,
                reader=tf.data.TFRecordDataset,
                shuffle=False,
                reader_num_threads=reader_num_threads)
            features = tf.compat.v1.data.make_one_shot_iterator(
                batched_dataset).get_next()
            self.assertAllEqual([2, 1],
                                features["query_length"].get_shape().as_list())
            self.assertAllEqual([2, 2, 1],
                                features["utility"].get_shape().as_list())

            self.assertAllEqual(sorted(features.keys()),
                                ["query_length", "unigrams", "utility"])

            with tf.compat.v1.Session() as sess:
                sess.run(tf.compat.v1.local_variables_initializer())
                features = sess.run(features)
                self.assertAllEqual(features["unigrams"].dense_shape,
                                    [2, 2, 3])
                self.assertAllEqual(
                    features["unigrams"].indices,
                    [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 1, 2], [1, 0, 0]])
                self.assertAllEqual(
                    features["unigrams"].values,
                    [b"tensorflow", b"learning", b"to", b"rank", b"gbdt"])
                # For Tensors with dense values, values can be directly checked.
                self.assertAllEqual(features["query_length"], [[3], [2]])
                self.assertAllEqual(features["utility"],
                                    [[[0.], [1.0]], [[0.], [-1.]]])
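The `reader_num_threads` argument suggests this test is parameterized; a sketch of how the decorator might look with `absl.testing.parameterized` (the class name and parameter values here are assumptions, not taken from the source):

from absl.testing import parameterized

class RankingDatasetTest(tf.test.TestCase, parameterized.TestCase):

    # Run the test once per assumed thread count.
    @parameterized.parameters(1, 8)
    def test_build_ranking_dataset_reader_num_threads(self, reader_num_threads):
        # Body as shown in Example #3 above.
        ...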
Example #4
  def _build_dataset(self,
                     file_pattern: str,
                     batch_size: int,
                     list_size: Optional[int] = None,
                     randomize_input: bool = True,
                     num_epochs: Optional[int] = None) -> tf.data.Dataset:
    """Returns `tf.data.Dataset` for training or validating the model.

    Args:
      file_pattern: File pattern for input data.
      batch_size: Number of input examples to process per batch.
      list_size: The list size for an ELWC example.
      randomize_input: If true, randomize input example order. It should almost
        always be true except for unittest/debug purposes.
      num_epochs: Number of times the input dataset must be repeated. None to
        repeat the data indefinitely.

    Returns:
      A `tf.data.Dataset`.
    """
    # TODO: Remove defaults shared between the Estimator pipeline and this one.
    dataset = data.build_ranking_dataset(
        file_pattern=file_pattern,
        data_format=data.ELWC,
        batch_size=batch_size,
        list_size=list_size,
        context_feature_spec=dict(
            list(self._context_feature_spec.items()) +
            list(self._training_only_context_spec.items())),
        example_feature_spec=dict(
            list(self._example_feature_spec.items()) +
            list(self._training_only_example_spec.items())),
        mask_feature_name=self._mask_feature_name,
        reader=self._hparams.dataset_reader,
        reader_args=None,
        num_epochs=num_epochs,
        shuffle=randomize_input,
        shuffle_buffer_size=1000,
        shuffle_seed=None,
        prefetch_buffer_size=10000,
        reader_num_threads=64,
        sloppy_ordering=True,
        drop_final_batch=False,
        shuffle_examples=False)

    return dataset.map(
        self._features_and_labels,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
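This snippet assumes several names are already in scope. A minimal import header that would satisfy the identifiers used above (the `data` module path is an assumption based on TF-Ranking's package layout):

from typing import Optional

import tensorflow as tf
from tensorflow_ranking.python import data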
Example #5
def _inner_input_fn():
    context_feature_spec = tf.feature_column.make_parse_example_spec(
        list(context_feature_columns().values()))
    label_column = tf.feature_column.numeric_column(
        _LABEL_FEATURE, default_value=_PADDING_LABEL)
    example_feature_spec = tf.feature_column.make_parse_example_spec(
        list(example_feature_columns().values()) + [label_column])
    dataset = data.build_ranking_dataset(
        file_pattern=DATA_FILE,
        data_format=data.ELWC,
        batch_size=10,
        context_feature_spec=context_feature_spec,
        example_feature_spec=example_feature_spec,
        list_size=2,
        reader=tf.data.TFRecordDataset)
    features = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
    label = tf.squeeze(features.pop(_LABEL_FEATURE), axis=2)
    return features, label
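The one-shot-iterator pattern above is TF1-style. Under TF2 the same split can be expressed as a `Dataset.map`, returning the dataset itself for use with `tf.keras.Model.fit`. A hedged sketch reusing the specs from the example; `_inner_input_fn_v2` is a hypothetical variant, not part of the source:

def _inner_input_fn_v2():
    # Hypothetical TF2-style variant: same parsing specs as above, but
    # returns a mapped tf.data.Dataset instead of iterator tensors.
    context_feature_spec = tf.feature_column.make_parse_example_spec(
        list(context_feature_columns().values()))
    label_column = tf.feature_column.numeric_column(
        _LABEL_FEATURE, default_value=_PADDING_LABEL)
    example_feature_spec = tf.feature_column.make_parse_example_spec(
        list(example_feature_columns().values()) + [label_column])
    dataset = data.build_ranking_dataset(
        file_pattern=DATA_FILE,
        data_format=data.ELWC,
        batch_size=10,
        context_feature_spec=context_feature_spec,
        example_feature_spec=example_feature_spec,
        list_size=2,
        reader=tf.data.TFRecordDataset)

    def _split(features):
        # Pop the label and squeeze [batch, list_size, 1] -> [batch, list_size].
        label = tf.squeeze(features.pop(_LABEL_FEATURE), axis=2)
        return features, label

    return dataset.map(_split)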