Example No. 1
    def __call__(self,
                 dataset: tf.data.Dataset = None,
                 model_feature_lengths: Mapping[str, int] = None):
        # `predict_output`, `task_name`, and `target_feature_name` are not
        # parameters here; they are assumed to be instance attributes set in
        # `__init__` (see the sketch below).
        if self.predict_output is None:
            return []
        task = dataset_providers.get_mixture_or_task(self.task_name)
        return list(
            enumerate(task.output_features[self.target_feature_name]
                      .vocabulary.encode(s) for s in self.predict_output))
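As written, the method reads `predict_output`, `task_name`, and `target_feature_name`, none of which appear in its signature, so they must live on the instance. A minimal sketch of a plausible enclosing class, assuming the snippet is a mock predict function for evaluation tests; the class name and constructor are hypothetical, with the `__call__` above completing the class:

class MockPredictFn:
    """Hypothetical callable standing in for a model's predict function.

    Ignores the `dataset` and `model_feature_lengths` arguments and instead
    returns the pre-set `predict_output` strings encoded with the task's
    target vocabulary, as (index, token_ids) pairs.
    """

    def __init__(self, task_name, predict_output,
                 target_feature_name="targets"):
        self.task_name = task_name
        self.predict_output = predict_output
        self.target_feature_name = target_feature_name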
Example No. 2
def test_preprocessing(task_name, raw_data):
    """Test the preprocessing functionality of a given task.

  This function injects `raw_data` into `task` and runs the preprocessing
  routines from `task`, returning the output of
  `next(task.get_dataset().as_numpy_iterator())`.

  Args:
    task_name: A SeqIO task name.
    raw_data: A string-keyed dict of string-keyed dicts. The top-level dict
      should be keyed by dataset splits, and the second-level dict should hold
      the dataset data.

  Returns:
    The result of running the tasks' preprocessing code on `raw_data`.
  """
    with DataInjector(task_name, raw_data):
        task = dataset_providers.get_mixture_or_task(task_name)
        return next(task.get_dataset(sequence_length=None).as_numpy_iterator())
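A quick usage sketch, assuming a registered task named "my_task" whose source yields examples with a "text" field; both names are hypothetical, and the exact shape of `raw_data` depends on the task's source:

# Per-split raw data, keyed by split name as the docstring describes.
raw_data = {
    "validation": {
        "text": "A single injected example.",
    },
}

# Runs "my_task"'s preprocessors over the injected data and returns the
# first preprocessed example as a dict of numpy arrays.
first_example = test_preprocessing("my_task", raw_data)
print(first_example)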
Example No. 3
    def __init__(self,
                 mixture_or_task_name: str,
                 feature_converter: FeatureConverter = EncDecFeatureConverter(),
                 eval_split: str = "validation",
                 use_cached: bool = False,
                 sequence_length: Mapping[str, int] = None,
                 summary_dir: Optional[str] = None):
        """Evaluator constructor.

    Args:
      mixture_or_task_name: a registered task or mixture name.
      feature_converter: a feature converter object to use to convert the task
        features to model features. Must be a subclass of
        seqio.FeatureConverter.
      eval_split: evaluation split. Typically "validation" or "test".
      use_cached: whether to use the cached dataset instead of processing it on
        the fly.
      sequence_length: an optional length specification. If specified, these
        will be the hard-limit on the evaluation data used for prediction. If
        none of the preprocessors depend on the sequence length, it can be left
        unspecified and the maximum length for each feature will be used. These
        lengths are computed while caching the datasets.
      summary_dir: an optional directory to save the evaluation results in Event
        protocol buffer format.
    Raises:
      ValueError if `sequence_length` is None but a preprocessor depends on its
      value.
    """
        logging.info("Initializing Evaluator for '%s'", mixture_or_task_name)
        eval_tasks = dataset_providers.get_subtasks(
            dataset_providers.get_mixture_or_task(mixture_or_task_name))
        self._eval_tasks = get_valid_eval_tasks(eval_tasks, eval_split)

        if not self._eval_tasks:
            logging.warning(
                "No eval task with valid split and metric fn found. Skipping eval."
            )
            return

        # Determine if sequence_length arg is required. This occurs when any of the
        # task preprocessors have a `sequence_length` arg with no default value.
        sequence_length_required = False
        for task in eval_tasks:
            for prep in task.preprocessors:
                prep_params = inspect.signature(prep).parameters
                if ("sequence_length" in prep_params
                        and prep_params["sequence_length"].default
                        == inspect.Parameter.empty):
                    if sequence_length is None:
                        raise ValueError(
                            f"Preprocessor '{prep.__name__}' in task '{task.name}' has a "
                            "`sequence_length` argument, making it incompatible with "
                            "automatic sequence length detection. Pass a valid "
                            "`sequence_length` to `Evaluator` and try again.")
                    sequence_length_required = True
                    break

        def dataset_fn(task: Task) -> tf.data.Dataset:
            return task.get_dataset(sequence_length=sequence_length,
                                    split=eval_split,
                                    shuffle=False,
                                    use_cached=use_cached)

        # `task_datasets` have the output features from seqio.Task.get_dataset.
        # These features will be converted to "model features" by the feature
        # converter before being cached.
        cached_targets, cached_task_datasets, max_lengths = (
            get_targets_and_examples(tasks=self._eval_tasks,
                                     dataset_fn=dataset_fn))

        if sequence_length is None:
            logging.info("Setting sequence lengths to %s", max_lengths)
            sequence_length = max_lengths
        elif (sequence_length["inputs"] > max_lengths["inputs"]
              or sequence_length["targets"] > max_lengths["targets"]):
            logging.warning(
                "Given sequence lengths are longer than necessary for some "
                "evaluation inputs or targets, resulting in wasted computation. "
                "Consider passing `None` for `sequence_length` to have them be "
                "automatically computed.\n Got: %s,\n Max Lengths: %s",
                sequence_length, max_lengths)
        elif not sequence_length_required and (
                sequence_length["inputs"] == max_lengths["inputs"]
                or sequence_length["targets"] == max_lengths["targets"]):
            logging.warning(
                "Given sequence lengths *may be* insufficient for some evaluation "
                "inputs or targets. Such sequences will be truncated to fit, "
                "likely leading to sub-optimal results. Consider passing `None` "
                "for `sequence_length` to have them be automatically computed.\n"
                " Got: %s,\n Max Lengths: %s", sequence_length, max_lengths)

        self._cached_model_datasets = {}
        # Convert the task features to model features
        for task in self._eval_tasks:
            eval_ds = feature_converter(cached_task_datasets[task.name],
                                        sequence_length)

            # The eval dataset is enumerated to ensure that the order is preserved
            # throughout the entire evaluation process.
            self._cached_model_datasets[task.name] = eval_ds.enumerate()

        self._cached_targets = cached_targets
        self._cached_task_datasets = cached_task_datasets
        self._model_feature_lengths = feature_converter.get_model_feature_lengths(
            sequence_length)

        if summary_dir:
            with tf.compat.v1.Graph().as_default():
                self._summary_writer = tf.compat.v1.summary.FileWriter(
                    summary_dir)
        else:
            self._summary_writer = None
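A minimal construction sketch, assuming the enclosing class is the `Evaluator` this constructor belongs to; the task name and summary directory are hypothetical, and `pack=False` is simply one reasonable setting for evaluation:

# Evaluate the validation split of a hypothetical registered task, letting
# sequence lengths be computed automatically from the cached data.
evaluator = Evaluator(
    mixture_or_task_name="my_task",
    feature_converter=EncDecFeatureConverter(pack=False),
    eval_split="validation",
    use_cached=False,
    sequence_length=None,
    summary_dir="/tmp/eval_summaries",
)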
Example No. 4
def create_prediction(task_name, s, output_feature_name="targets"):
    task = dataset_providers.get_mixture_or_task(task_name)
    return [(0, task.output_features[output_feature_name].vocabulary.encode(s))]
Example No. 5
def encode_str(task_name, s, output_feature_name="targets"):
    task = dataset_providers.get_mixture_or_task(task_name)
    return task.output_features[output_feature_name].vocabulary.encode(s)
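These two helpers pair naturally in tests: `create_prediction` builds the `(index, token_ids)` list that an enumerated predict output has, and `encode_str` encodes a reference string with the same vocabulary. A usage sketch with a hypothetical task name:

# What a mock predict function would return for a single example.
predictions = create_prediction("my_task", "some output")

# The token ids the evaluator would compare against.
expected_ids = encode_str("my_task", "some output")

assert predictions == [(0, expected_ids)]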
Example No. 6
    def __init__(self, task_name, per_split_data):
        self._task = dataset_providers.get_mixture_or_task(task_name)

        self.per_split_data = per_split_data
        # Keep a handle on the task's real data source so it can be restored
        # when the injector exits.
        self._saved_source = self._task._source
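As Example No. 2 shows, `DataInjector` is used as a context manager: it swaps the task's source for the injected data on entry, and the `_saved_source` handle above lets it restore the original source on exit. A sketch with hypothetical names and data shape:

with DataInjector("my_task", {"validation": {"text": "injected example"}}):
    # Inside the block, the task reads from `per_split_data`.
    task = dataset_providers.get_mixture_or_task("my_task")
    ds = task.get_dataset(sequence_length=None, split="validation")
# Outside the block, the task reads from its original source again.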
Example No. 7
    def __init__(self,
                 mixture_or_task_name: str,
                 feature_converter: FeatureConverter = EncDecFeatureConverter(),
                 eval_split: str = "validation",
                 use_cached: bool = False,
                 sequence_lengths: Mapping[str, int] = None,
                 summary_dir: Optional[str] = None):
        """Evaluator constructor.

    Args:
      mixture_or_task_name: a registered task or mixture name.
      feature_converter: a feature converter object to use to convert the task
        features to model features. Must be a subclass of
        seqio.FeatureConverter.
      eval_split: evaluation split. Typically "validation" or "test".
      use_cached: whether to use the cached dataset instead of processing it on
        the fly.
      sequence_lengths: an optional length specification. If specified, these
        will be the hard-limit on the evaluation data used for prediction. If
        unspecified, the maximum length for each feature will be used. These
        lengths are computed while caching the datasets.
      summary_dir: an optional directory to save the evaluation results in Event
        protocol buffer format.
    """
        eval_tasks = dataset_providers.get_subtasks(
            dataset_providers.get_mixture_or_task(mixture_or_task_name))
        self._eval_tasks = get_valid_eval_tasks(eval_tasks, eval_split)

        if not self._eval_tasks:
            logging.warning(
                "No eval task with valid split and metric fn found. Skipping eval."
            )
            return

        def dataset_fn(task: Task) -> tf.data.Dataset:
            return task.get_dataset(sequence_length=None,
                                    split=eval_split,
                                    shuffle=False,
                                    use_cached=use_cached)

        # `task_datasets` have the output features from seqio.Task.get_dataset.
        # These features will be converted to "model features" by the feature
        # converter before being cached.
        cached_targets, cached_task_datasets, max_lengths = (
            get_targets_and_examples(tasks=self._eval_tasks,
                                     dataset_fn=dataset_fn))

        if sequence_lengths is None:
            logging.info("Setting sequence lengths to %s", max_lengths)
            lengths = max_lengths
        elif (sequence_lengths["inputs"] < max_lengths["inputs"]
              or sequence_lengths["targets"] < max_lengths["targets"]):
            logging.warning(
                "Given sequence lengths are insufficient for some evaluation inputs "
                "or targets. These sequences will be truncated to fit, likely "
                "leading to sub-optimal results. Consider passing `None` for "
                "sequence_lengths to have them be automatically computed.\n Got: %s, "
                "\n Max Lengths:%s", sequence_lengths, max_lengths)
            lengths = sequence_lengths
        elif (sequence_lengths["inputs"] > max_lengths["inputs"]
              or sequence_lengths["targets"] > max_lengths["targets"]):
            logging.warning(
                "Given sequence lengths are longer than necessary for some "
                "evaluation inputs or targets, resulting in wasted computation. "
                "Consider passing `None` for sequence_lengths to have them be "
                "automatically computed.\n Got: %s,\n Max Lengths: %s",
                sequence_lengths, max_lengths)
            lengths = sequence_lengths
        else:
            # Lengths exactly match the computed maxima; use them as-is.
            # Without this branch, `lengths` would be unbound below.
            lengths = sequence_lengths

        self._cached_model_datasets = {}
        # Convert the task features to model features
        for task in self._eval_tasks:
            eval_ds = feature_converter(cached_task_datasets[task.name],
                                        lengths)

            # The eval dataset is enumerated to ensure that the order is preserved
            # throughout the entire evaluation process.
            self._cached_model_datasets[task.name] = eval_ds.enumerate()

        self._cached_targets = cached_targets
        self._cached_task_datasets = cached_task_datasets

        if summary_dir:
            with tf.compat.v1.Graph().as_default():
                self._summary_writer = tf.compat.v1.summary.FileWriter(
                    summary_dir)
        else:
            self._summary_writer = None