def test_optional_features(self):
        def _dummy_preprocessor(output):
            return lambda _: tf.data.Dataset.from_tensors(output)

        default_vocab = test_utils.sentencepiece_vocab()
        features = {
            "inputs": seqio.Feature(vocabulary=default_vocab, required=False),
            "targets": seqio.Feature(vocabulary=default_vocab, required=True),
        }

        task = self.add_t5_task("task_missing_optional_feature",
                                dataset_providers.TfdsTask,
                                tfds_name="fake:0.0.0",
                                output_features=features,
                                text_preprocessor=_dummy_preprocessor(
                                    {"targets": "a"}))
        task.get_dataset({"targets": 13}, "train", use_cached=False)

        task = self.add_t5_task("task_missing_required_feature",
                                dataset_providers.TfdsTask,
                                tfds_name="fake:0.0.0",
                                output_features=features,
                                text_preprocessor=_dummy_preprocessor(
                                    {"inputs": "a"}))
        with self.assertRaisesRegex(
                ValueError,
                "Task dataset is missing expected output feature after preprocessing: "
                "targets"):
            task.get_dataset({"inputs": 13}, "train", use_cached=False)
    def _build_task(self, task_name: str, data_dir: str,
                    vocabulary: seqio.Vocabulary) -> seqio.Task:
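        # Map each split to its sharded TFRecord file pattern under data_dir.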
        split_to_filepattern = {
            tfds.Split.TRAIN: os.path.join(data_dir, 'train.tfr*'),
            tfds.Split.VALIDATION: os.path.join(data_dir, 'valid.tfr*')
        }
        if _has_test_split(task_name):
            split_to_filepattern[tfds.Split.TEST] = os.path.join(
                data_dir, 'test.tfr*')

        source_features = {
            'inputs': tf.io.FixedLenFeature([], tf.string, ''),
            'targets': tf.io.FixedLenFeature([], tf.string, '')
        }
        data_source = seqio.TFExampleDataSource(
            split_to_filepattern=split_to_filepattern,
            feature_description=source_features,
            num_input_examples=_get_num_examples(task_name))

        output_features = {
            'inputs':
            seqio.Feature(vocabulary=vocabulary, add_eos=True, required=False),
            'targets':
            seqio.Feature(vocabulary=vocabulary, add_eos=True)
        }
        task = seqio.Task(
            name=task_name,
            source=data_source,
            output_features=output_features,
            preprocessors=[
                seqio.preprocessors.tokenize, seqio.preprocessors.append_eos
            ],
            shuffle_buffer_size=None  # disable shuffling.
        )
        return task
 def test_no_eos(self):
     default_vocab = test_utils.sentencepiece_vocab()
     features = {
         "inputs": seqio.Feature(add_eos=True, vocabulary=default_vocab),
         "targets": seqio.Feature(add_eos=False, vocabulary=default_vocab),
     }
     self.add_t5_task("task_no_eos",
                      dataset_providers.TfdsTask,
                      tfds_name="fake:0.0.0",
                      output_features=features)
     self.verify_task_matches_fake_datasets("task_no_eos", use_cached=False)
def get_output_features(
        vocab: seqio.Vocabulary,
        add_inputs_eos: bool = False,
        add_targets_eos: bool = False) -> Mapping[str, seqio.Feature]:
    """Returns BIG-bench output features."""

    return {
        "inputs":
        seqio.Feature(vocabulary=vocab, add_eos=add_inputs_eos,
                      required=False),
        "targets":
        seqio.Feature(vocabulary=vocab, add_eos=add_targets_eos)
    }
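
# A hedged usage sketch (the SentencePiece model path below is illustrative,
# not part of the original snippet):
#
#   vocab = seqio.SentencePieceVocabulary("/path/to/spm.model")
#   features = get_output_features(vocab, add_targets_eos=True)
#   # features["inputs"] is optional (required=False); features["targets"] adds EOS.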
Example #5
def get_custom_output_features(add_eos=True, extra_ids=DEFAULT_EXTRA_IDS):
    """Construct output features with custom vocabs."""
    sentence_piece_model_path = gin.query_parameter(
        "seqio.SentencePieceVocabulary.sentencepiece_model_file")

    custom_vocab = seqio.SentencePieceVocabulary(sentence_piece_model_path,
                                                 extra_ids)
    return {
        "inputs":
        seqio.Feature(vocabulary=custom_vocab, add_eos=add_eos,
                      required=False),
        "targets":
        seqio.Feature(vocabulary=custom_vocab, add_eos=add_eos)
    }
 def add_t5_task(self,
                 name,
                 cls,
                 text_preprocessor=(test_utils.test_text_preprocessor, ),
                 output_features=None,
                 **kwargs):
     output_features = output_features or {
         "inputs": seqio.Feature(test_utils.sentencepiece_vocab()),
         "targets": seqio.Feature(test_utils.sentencepiece_vocab())
     }
     return TaskRegistry.add(name,
                             cls,
                             text_preprocessor=text_preprocessor,
                             metric_fns=[],
                             output_features=output_features,
                             **kwargs)
  def __init__(self, checkpoint_path, model_type='mt3'):

    # Model Constants.
    if model_type == 'ismir2021':
      num_velocity_bins = 127
      self.encoding_spec = note_sequences.NoteEncodingSpec
      self.inputs_length = 512
    elif model_type == 'mt3':
      num_velocity_bins = 1
      self.encoding_spec = note_sequences.NoteEncodingWithTiesSpec
      self.inputs_length = 256
    else:
      raise ValueError('unknown model_type: %s' % model_type)

    gin_files = ['/content/mt3/gin/model.gin',
                 f'/content/mt3/gin/{model_type}.gin']

    self.batch_size = 8
    self.outputs_length = 1024
    self.sequence_length = {'inputs': self.inputs_length, 
                            'targets': self.outputs_length}

    self.partitioner = t5x.partitioning.ModelBasedPjitPartitioner(
        model_parallel_submesh=(1, 1, 1, 1), num_partitions=1)

    # Build Codecs and Vocabularies.
    self.spectrogram_config = spectrograms.SpectrogramConfig()
    self.codec = vocabularies.build_codec(
        vocab_config=vocabularies.VocabularyConfig(
            num_velocity_bins=num_velocity_bins))
    self.vocabulary = vocabularies.vocabulary_from_codec(self.codec)
    self.output_features = {
        'inputs': seqio.ContinuousFeature(dtype=tf.float32, rank=2),
        'targets': seqio.Feature(vocabulary=self.vocabulary),
    }

    # Create a T5X model.
    self._parse_gin(gin_files)
    self.model = self._load_model()

    # Restore from checkpoint.
    self.restore_from_checkpoint(checkpoint_path)
Example #8
def add_task(dataset_name, subset_name, template_name, task_name=None, split_mapping=None):

    template = all_templates.get_dataset(dataset_name, subset_name)[template_name]

    task_name = task_name or utils.get_task_name(dataset_name, subset_name, template_name)
    if task_name in CLEAN_EVAL_TASKS:
        metrics = EVAL_METRICS[task_name]
    else:
        metrics = [t5.evaluation.metrics.sequence_accuracy]

    dataset_splits = utils.get_dataset_splits(dataset_name, subset_name)
    split_mapping = split_mapping or {k: k for k in dataset_splits.keys()}

    dataset_fn = functools.partial(
        get_tf_dataset,
        seed=None,
        dataset_name=dataset_name,
        subset_name=subset_name,
        template=template,
        split_mapping=split_mapping,
    )
    data_source = seqio.FunctionDataSource(
        dataset_fn,
        splits=list(split_mapping.keys()),
        num_input_examples={s: dataset_splits[split_mapping[s]].num_examples for s in split_mapping.keys()},
    )
    output_features = {
        "inputs": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=False, dtype=tf.int32),
        "targets": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=True, dtype=tf.int32),
    }
    preprocessors = [
        seqio.preprocessors.tokenize,
        seqio.preprocessors.append_eos,
        seqio.CacheDatasetPlaceholder(required=False),
    ]

    # Add train and normal eval tasks
    seqio.TaskRegistry.add(
        task_name,
        data_source,
        preprocessors=preprocessors,
        output_features=output_features,
        metric_fns=metrics,
        postprocess_fn=maybe_get_class_id_postprocessor(template),
    )

    # Add rank classification eval task
    labels = get_label_strings(template)
    if labels:
        rank_classification_preprocessor = functools.partial(
            t5.data.preprocessors.rank_classification,
            inputs_fn=lambda ex: tf.fill((len(labels),), ex["inputs"]),
            targets_fn=lambda ex: labels,
            is_correct_fn=lambda ex: tf.equal(labels, tf.strings.strip(ex["targets"])),
            weight_fn=lambda ex: 1.0,
        )
        seqio.TaskRegistry.add(
            task_name + "_score_eval",
            data_source,
            preprocessors=[rank_classification_preprocessor] + preprocessors,
            output_features=output_features,
            metric_fns=[functools.partial(t5.evaluation.metrics.rank_classification, num_classes=len(labels))],
            postprocess_fn=t5.data.postprocessors.rank_classification,
        )
Example #9
t5.data.MixtureRegistry.add("ke_t5_all_proportional",
                            [(t, ke_t5.proc_utils.dedupe(t))
                             for t in _all_tasks])

t5.data.MixtureRegistry.add("ke_t5_all_equal", _all_tasks, default_rate=1.0)

from ke_t5.proc_utils import KLUE_META
from ke_t5.proc_utils import (base_preproc_for_classification,
                              base_preproc_for_regression,
                              re_preproc_for_classification, preprocess_quad,
                              string_label_to_class_id, string_to_float)

GENERATIVE_OUTPUT_FEATURES = {
    "inputs":
    seqio.Feature(vocabulary=DEFAULT_VOCAB,
                  add_eos=False,
                  required=False,
                  dtype=tf.int32),
    "targets":
    seqio.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True, dtype=tf.int32)
}

# CLASSIFICATION_OUTPUT_FEATURES = {
#     "inputs": seqio.Feature(
#         vocabulary=DEFAULT_VOCAB, add_eos=False, required=False, dtype=tf.int32)
# }

# ============ KLUE topic classification: Generative ============
seqio.TaskRegistry.add(
    "klue_tc_gen",
    seqio.TfdsDataSource(tfds_name="klue/tc:1.0.0"),
    preprocessors=[
Example #10
from t5.data import preprocessors as t5_preprocessors
from t5.evaluation import metrics as t5_metrics

MixtureRegistry = seqio.MixtureRegistry
TaskRegistry = seqio.TaskRegistry

DEFAULT_SPM_PATH = "gs://t5-data/vocabs/cc_all.32000/sentencepiece.model"  # GCS
DEFAULT_EXTRA_IDS = 100
NQ_TRAIN_SPLIT_START = 7830
NQ_TRAIN_SPLIT_END = 79168
NQO_TRAIN_SPLIT_END = 79168
WQ_TRAIN_SPLIT_END = 3417
TQA_TRAIN_SPLIT_END = 78785

DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(vocabulary=get_default_vocabulary(), add_eos=True),
    "targets": seqio.Feature(vocabulary=get_default_vocabulary(), add_eos=True)
}

# ========================== Natural Questions =================================

# Natural Questions open domain variant that most closely matches the official
# evaluation procedure.
# The model is trained to predict all ground-truth answers
# and is only considered correct if it predicts all answers for any one of the
# annotators. As in the official evaluation, we consider questions with fewer
# than two non-null annotations unanswerable (given the context) but because we
# cannot predict unanswerability without the context, we only compute the recall
# metric. Further, because our model does not have access to the oracle context,
# we also normalize predicted and ground-truth answers when comparing them.
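
# A minimal sketch of the answer normalization mentioned above; the helper name
# and exact rules (SQuAD-style cleanup) are illustrative, not this module's
# actual implementation:
#
#   import re
#
#   def _normalize_answer_sketch(text):
#     text = text.lower()                           # case-insensitive comparison
#     text = re.sub(r"[^\w\s]", " ", text)          # drop punctuation
#     text = re.sub(r"\b(a|an|the)\b", " ", text)   # drop articles
#     return " ".join(text.split())                 # collapse whitespace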
Example #11
import tensorflow_datasets as tfds
import t5_closed_book_qa.t5_cbqa.preprocessors as t5_cbqa_preprocessors

TaskRegistry = seqio.TaskRegistry

EN_VOCAB_SPM_PATH = "gs://t5-data/vocabs/cc_en.32000/sentencepiece.model"
WMT14_CUSTOM_SPM_PATH = "gs://t5-data/vocabs/wmt_ende.37000/spm.model"

WMT14_VOCAB_EXTRA_100 = seqio.SentencePieceVocabulary(WMT14_CUSTOM_SPM_PATH,
                                                      extra_ids=100)
EN_VOCAB_EXTRA_100 = seqio.SentencePieceVocabulary(EN_VOCAB_SPM_PATH,
                                                   extra_ids=100)

EN_VOCAB_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(vocabulary=EN_VOCAB_EXTRA_100, add_eos=True),
    "targets": seqio.Feature(vocabulary=EN_VOCAB_EXTRA_100, add_eos=True)
}

#================================ English only vocab ===========================
for version in ("2.2.0", "2.3.0", "2.3.1"):
    TaskRegistry.add(
        "c4_v{}_unsupervised_en32k".format(version.replace(".", "")),
        source=seqio.TfdsDataSource(tfds_name="c4/en:{}".format(version)),
        preprocessors=[
            functools.partial(t5_preprocessors.rekey,
                              key_map={
                                  "inputs": None,
                                  "targets": "text"
                              }),
            seqio.preprocessors.tokenize,

  def __init__(self,
               name,
               dataset_fn,
               splits,
               text_preprocessor,
               metric_fns=None,
               postprocess_fn=None,
               token_preprocessor=None,
               output_features=None,
               num_input_examples=None,
               supports_caching=True,
               shuffle_buffer_size=SHUFFLE_BUFFER_SIZE,
               source=None):

    if (dataset_fn, source).count(None) != 1:
      raise ValueError(
          "Exactly one of either `dataset_fn` or `source` must be provided.")

    if source and (splits or num_input_examples):
      raise ValueError(
          "If `source` is provided, `splits` and `num_input_examples` should "
          "not also be provided to the Task.")
    source = source or seqio.FunctionDataSource(
        dataset_fn=dataset_fn,
        splits=splits,
        num_input_examples=num_input_examples)

    if text_preprocessor and not hasattr(text_preprocessor, "__iter__"):
      text_preprocessor = [text_preprocessor]
    if token_preprocessor and not hasattr(token_preprocessor, "__iter__"):
      token_preprocessor = [token_preprocessor]

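    # Assemble the preprocessor chain: text preprocessors, tokenization, an
    # optional cache placeholder, token preprocessors, and EOS appended after
    # trimming.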
    preprocessors = list(text_preprocessor or [])
    preprocessors.append(seqio.preprocessors.tokenize)
    if supports_caching:
      preprocessors.append(seqio.CacheDatasetPlaceholder())
    preprocessors.extend(token_preprocessor or [])
    preprocessors.append(seqio.preprocessors.append_eos_after_trim)

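    # Normalize `output_features` into a dict mapping feature names to
    # seqio.Feature objects.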
    if hasattr(output_features, "__len__") and not output_features:
      raise ValueError("output_features must be non-empty.")
    if output_features is None:
      output_features = seqio.Feature(utils.get_default_vocabulary())
    if isinstance(output_features, dict):
      pass
    elif isinstance(output_features, seqio.Feature):
      output_features = {k: output_features for k in _DEFAULT_FEATURE_KEYS}
    elif isinstance(output_features, list) and all(
        isinstance(f, str) for f in output_features):
      output_features = {
          k: seqio.Feature(utils.get_default_vocabulary())
          for k in output_features
      }
    else:
      raise ValueError(
          "output_features must be a dict, Feature, list of str, or None")

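    # If a sequence of postprocess functions was given, compose them into a
    # single callable.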
    if hasattr(postprocess_fn, "__iter__"):
      postprocess_fns = postprocess_fn

      def postprocess_fn(x, **postprocess_kwargs):  # pylint:disable=function-redefined
        for post_fn in postprocess_fns:
          x = post_fn(x, **postprocess_kwargs)
        return x

    super().__init__(
        name=name,
        source=source,
        output_features=output_features,
        preprocessors=preprocessors,
        postprocess_fn=postprocess_fn,
        metric_fns=metric_fns,
        shuffle_buffer_size=shuffle_buffer_size)
Example #13
DEFAULT_PREPROCESSORS = [
    seqio.preprocessors.tokenize,
    seqio.CacheDatasetPlaceholder(),
    seqio.preprocessors.append_eos_after_trim,
]

# Only include edit_rouge for now; skip metrics.print_predictions,
# metrics.surface_recall, and metrics.exact_match.

DEFAULT_METRIC_FNS = [
    metrics.edit_rouge,
]
DEFAULT_OUTPUT_FEATURES = {
    "inputs":
    seqio.Feature(vocabulary=t5.data.get_default_vocabulary(),
                  add_eos=True,
                  required=True),
    "targets":
    seqio.Feature(vocabulary=t5.data.get_default_vocabulary(), add_eos=True)
}


def _register_w_defaults(
    name,
    split_to_filepattern,
    task,
    delimiter_type,
):
    """Register a WikiDiff task w/ default params."""

    delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(

from t5.data import preprocessors
from t5.data.glue_utils import get_glue_metric
from t5.data.glue_utils import get_glue_postprocess_fn
from t5.data.glue_utils import get_glue_text_preprocessor
from t5.data.glue_utils import get_super_glue_metric
from t5.evaluation import metrics
import tensorflow_datasets as tfds

TaskRegistry = seqio.TaskRegistry
TfdsTask = t5.data.TfdsTask



DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(), add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(), add_eos=True)
}

# ==================================== C4 ======================================
# Final pretraining task used in Raffel et al., 2019.
TaskRegistry.add(
    "c4_v220_span_corruption",
    source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"),
    preprocessors=[
        functools.partial(
            preprocessors.rekey, key_map={
                "inputs": None,
                "targets": "text"