def _build_task(self, task_name: str, data_dir: str,
                    vocabulary: seqio.Vocabulary) -> seqio.Task:
        """Build a `seqio.Task` that reads record files from `data_dir`.

        Train and validation splits are always declared; a test split is added
        only when `_has_test_split(task_name)` is truthy. Records are parsed
        into two scalar string features, 'inputs' and 'targets' (default ''),
        which are tokenized with `vocabulary` and have EOS appended.

        Args:
          task_name: name given to the task; also passed to `_has_test_split`
            and `_get_num_examples`.
          data_dir: directory containing the `train.tfr*`, `valid.tfr*` and,
            when a test split exists, `test.tfr*` files.
          vocabulary: vocabulary used for both the 'inputs' and 'targets'
            output features.

        Returns:
          The constructed `seqio.Task`.
        """

        def pattern_for(shard_glob):
            # All splits live side by side in `data_dir`.
            return os.path.join(data_dir, shard_glob)

        file_patterns = {
            tfds.Split.TRAIN: pattern_for('train.tfr*'),
            tfds.Split.VALIDATION: pattern_for('valid.tfr*'),
        }
        if _has_test_split(task_name):
            file_patterns[tfds.Split.TEST] = pattern_for('test.tfr*')

        # Both raw features are scalar strings that fall back to ''.
        raw_features = {
            key: tf.io.FixedLenFeature([], tf.string, '')
            for key in ('inputs', 'targets')
        }
        example_count = _get_num_examples(task_name)
        example_source = seqio.TFExampleDataSource(
            split_to_filepattern=file_patterns,
            feature_description=raw_features,
            num_input_examples=example_count)

        # 'inputs' is marked as not required; 'targets' keeps the default.
        inputs_feature = seqio.Feature(
            vocabulary=vocabulary, add_eos=True, required=False)
        targets_feature = seqio.Feature(vocabulary=vocabulary, add_eos=True)

        return seqio.Task(
            name=task_name,
            source=example_source,
            output_features={
                'inputs': inputs_feature,
                'targets': targets_feature,
            },
            preprocessors=[
                seqio.preprocessors.tokenize,
                seqio.preprocessors.append_eos,
            ],
            # A buffer size of None disables shuffling for this task.
            shuffle_buffer_size=None)
  def __init__(self,
               name,
               split_to_filepattern,
               feature_description,
               text_preprocessor,
               metric_fns,
               reader=tf.data.TFRecordDataset,
               **task_kwargs):
    """Initializes a task whose data comes from a `seqio.TFExampleDataSource`.

    Args:
      name: string, a unique name for the Task. A ValueError will be raised if
        another task with this name is already registered.
      split_to_filepattern: dict mapping a split name (string) to a filename
        or filepattern (string).
      feature_description: dict mapping string feature keys to
        `tf.io.FixedLenFeature` or `tf.io.VarLenFeature` values.
      text_preprocessor: a function, or a list of functions applied in order,
        each taking a tf.data.Dataset of string features and returning a
        tf.data.Dataset of string features. May be None for a no-op.
      metric_fns: list(callable), metric functions with the signature
        metric_fn(targets, predictions) used during evaluation.
      reader: `tf.data.Dataset` class used to read the input files; forwarded
        to the data source as `reader_cls`.
      **task_kwargs: dict, extra keyword arguments for the parent class
        constructor. A `num_input_examples` entry, if present, is removed and
        given to the data source instead.
    """
    # Pop `num_input_examples` before the remaining kwargs are forwarded, so
    # it reaches the data source and is not also passed to the parent.
    example_count = task_kwargs.pop("num_input_examples", None)
    example_source = seqio.TFExampleDataSource(
        split_to_filepattern=split_to_filepattern,
        feature_description=feature_description,
        reader_cls=reader,
        num_input_examples=example_count)

    super().__init__(
        name,
        source=example_source,
        text_preprocessor=text_preprocessor,
        metric_fns=metric_fns,
        dataset_fn=None,
        splits=None,
        **task_kwargs)
Beispiel #3
0
def _register_w_defaults(
    name,
    split_to_filepattern,
    task,
    delimiter_type,
):
    """Register a WikiDiff task under `name` using the module-level defaults.

    The task reads from a `seqio.TFExampleDataSource` and uses
    `DEFAULT_FEATURE_DESCRIPTION`, `DEFAULT_PREPROCESSORS`,
    `DEFAULT_OUTPUT_FEATURES` and a copy of `DEFAULT_METRIC_FNS`. Its
    postprocessor is `postprocessors.postprocess_wikidiff`, bound to the
    default t5 vocabulary and to a normalizer configured for `task`.

    Args:
      name: name passed to `seqio.TaskRegistry.add`.
      split_to_filepattern: forwarded unchanged to the data source
        (presumably split name -> file pattern; confirm against callers).
      task: task identifier used to look up the default delimiter range pair
        and passed on to `rendering_utils.normalize`.
      delimiter_type: delimiter identifier used, together with `task`, to look
        up the default delimiter range pair.
    """
    range_pair = rendering_utils.get_default_delimiter_range_pair(
        task, delimiter_type)

    # Copy so the registered task gets its own list, not the module-level one.
    metrics = list(DEFAULT_METRIC_FNS)

    normalizer = functools.partial(
        rendering_utils.normalize,
        delimiter_range_pair=range_pair,
        task=task)
    postprocessor = functools.partial(
        postprocessors.postprocess_wikidiff,
        vocabulary=t5.data.get_default_vocabulary(),
        normalize_fn=normalizer)

    example_source = seqio.TFExampleDataSource(
        split_to_filepattern=split_to_filepattern,
        feature_description=DEFAULT_FEATURE_DESCRIPTION)

    seqio.TaskRegistry.add(
        name,
        source=example_source,
        preprocessors=DEFAULT_PREPROCESSORS,
        postprocess_fn=postprocessor,
        metric_fns=metrics,
        output_features=DEFAULT_OUTPUT_FEATURES)