def test_optional_features(self):
    """Check that only features marked `required=True` must survive preprocessing.

    Two tasks are registered with the same output features ("inputs" is
    optional, "targets" is required). The first preprocessor emits only
    "targets" and loading is expected to succeed; the second emits only
    "inputs" and loading is expected to raise a ValueError naming "targets".
    """

    def _dummy_preprocessor(output):
        # Ignores the incoming dataset and yields `output` as a single example.
        return lambda _: tf.data.Dataset.from_tensors(output)

    vocab = test_utils.sentencepiece_vocab()
    output_features = {
        "inputs": seqio.Feature(vocabulary=vocab, required=False),
        "targets": seqio.Feature(vocabulary=vocab, required=True),
    }
    # Arguments common to both task registrations below.
    shared_kwargs = dict(
        tfds_name="fake:0.0.0",
        output_features=output_features,
    )

    # Case 1: the optional feature ("inputs") is absent.
    optional_missing_task = self.add_t5_task(
        "task_missing_optional_feature",
        dataset_providers.TfdsTask,
        text_preprocessor=_dummy_preprocessor({"targets": "a"}),
        **shared_kwargs)
    optional_missing_task.get_dataset(
        {"targets": 13}, "train", use_cached=False)

    # Case 2: the required feature ("targets") is absent.
    required_missing_task = self.add_t5_task(
        "task_missing_required_feature",
        dataset_providers.TfdsTask,
        text_preprocessor=_dummy_preprocessor({"inputs": "a"}),
        **shared_kwargs)
    expected_error = (
        "Task dataset is missing expected output feature after preprocessing: "
        "targets")
    with self.assertRaisesRegex(ValueError, expected_error):
        required_missing_task.get_dataset(
            {"inputs": 13}, "train", use_cached=False)
def _build_task(self, task_name: str, data_dir: str,
                vocabulary: seqio.Vocabulary) -> seqio.Task:
    """Build an unshuffled `seqio.Task` that reads TFExample files in `data_dir`.

    Args:
        task_name: Name given to the task; also passed to `_has_test_split`
            and `_get_num_examples`.
        data_dir: Directory joined with the per-split file patterns.
        vocabulary: Vocabulary used for both "inputs" and "targets".

    Returns:
        The constructed `seqio.Task`.
    """
    # Train and validation patterns are always present; test is added only
    # when `_has_test_split` says so.
    split_to_filepattern = {
        split: os.path.join(data_dir, pattern)
        for split, pattern in (
            (tfds.Split.TRAIN, 'train.tfr*'),
            (tfds.Split.VALIDATION, 'valid.tfr*'),
        )
    }
    if _has_test_split(task_name):
        split_to_filepattern[tfds.Split.TEST] = os.path.join(
            data_dir, 'test.tfr*')

    # Both fields are scalar strings with an empty-string default.
    feature_description = {
        key: tf.io.FixedLenFeature([], tf.string, '')
        for key in ('inputs', 'targets')
    }
    num_examples = _get_num_examples(task_name)
    source = seqio.TFExampleDataSource(
        split_to_filepattern=split_to_filepattern,
        feature_description=feature_description,
        num_input_examples=num_examples)

    inputs_feature = seqio.Feature(
        vocabulary=vocabulary, add_eos=True, required=False)
    targets_feature = seqio.Feature(vocabulary=vocabulary, add_eos=True)

    return seqio.Task(
        name=task_name,
        source=source,
        output_features={'inputs': inputs_feature, 'targets': targets_feature},
        preprocessors=[
            seqio.preprocessors.tokenize,
            seqio.preprocessors.append_eos,
        ],
        shuffle_buffer_size=None,  # disable shuffling.
    )
def test_no_eos(self):
    """Register a task whose targets do not append EOS and verify it.

    "inputs" is configured with `add_eos=True` and "targets" with
    `add_eos=False`; the task is then checked through
    `verify_task_matches_fake_datasets` without using the cache.
    """
    task_name = "task_no_eos"
    vocab = test_utils.sentencepiece_vocab()
    eos_by_feature = (("inputs", True), ("targets", False))
    output_features = {
        key: seqio.Feature(add_eos=add_eos, vocabulary=vocab)
        for key, add_eos in eos_by_feature
    }
    self.add_t5_task(
        task_name,
        dataset_providers.TfdsTask,
        tfds_name="fake:0.0.0",
        output_features=output_features)
    self.verify_task_matches_fake_datasets(task_name, use_cached=False)
def get_output_features(
        vocab: seqio.Vocabulary,
        add_inputs_eos: bool = False,
        add_targets_eos: bool = False) -> Mapping[str, seqio.Feature]:
    """Returns BIG-bench output features.

    Args:
        vocab: Vocabulary shared by both features.
        add_inputs_eos: Value passed as `add_eos` for the "inputs" feature.
        add_targets_eos: Value passed as `add_eos` for the "targets" feature.

    Returns:
        A mapping with an optional "inputs" feature (`required=False`) and a
        "targets" feature.
    """
    inputs_feature = seqio.Feature(
        vocabulary=vocab, add_eos=add_inputs_eos, required=False)
    targets_feature = seqio.Feature(vocabulary=vocab, add_eos=add_targets_eos)
    return {"inputs": inputs_feature, "targets": targets_feature}
def get_custom_output_features(add_eos=True, extra_ids=DEFAULT_EXTRA_IDS):
    """Construct output features with custom vocabs.

    The SentencePiece model path is read from the gin parameter
    "seqio.SentencePieceVocabulary.sentencepiece_model_file".

    Args:
        add_eos: Value passed as `add_eos` for both features.
        extra_ids: Second argument passed to `seqio.SentencePieceVocabulary`.

    Returns:
        A dict with an optional "inputs" feature (`required=False`) and a
        "targets" feature, both using the same vocabulary object.
    """
    model_file = gin.query_parameter(
        "seqio.SentencePieceVocabulary.sentencepiece_model_file")
    vocab = seqio.SentencePieceVocabulary(model_file, extra_ids)

    features = {}
    features["inputs"] = seqio.Feature(
        vocabulary=vocab, add_eos=add_eos, required=False)
    features["targets"] = seqio.Feature(vocabulary=vocab, add_eos=add_eos)
    return features
def add_t5_task(self, name, cls,
                text_preprocessor=(test_utils.test_text_preprocessor, ),
                output_features=None, **kwargs):
    """Register a task with `TaskRegistry`, filling in test defaults.

    Args:
        name: Task name passed to `TaskRegistry.add`.
        cls: Second positional argument passed to `TaskRegistry.add`.
        text_preprocessor: Forwarded as `text_preprocessor`.
        output_features: Forwarded as `output_features`; when falsy, default
            "inputs"/"targets" features are built, each with its own
            `test_utils.sentencepiece_vocab()` call.
        **kwargs: Forwarded unchanged to `TaskRegistry.add`.

    Returns:
        Whatever `TaskRegistry.add` returns.
    """
    if not output_features:
        output_features = {
            key: seqio.Feature(test_utils.sentencepiece_vocab())
            for key in ("inputs", "targets")
        }
    return TaskRegistry.add(
        name,
        cls,
        text_preprocessor=text_preprocessor,
        metric_fns=[],
        output_features=output_features,
        **kwargs)
def __init__(self, checkpoint_path, model_type='mt3'):
    """Set up model constants, codec/vocabulary, the T5X model, and restore it.

    Args:
        checkpoint_path: Passed to `self.restore_from_checkpoint`.
        model_type: Either 'ismir2021' or 'mt3'; selects the velocity-bin
            count, the note encoding spec, the input length, and the
            model-specific gin file.

    Raises:
        ValueError: If `model_type` is not one of the two supported values.
    """
    # Model Constants.
    if model_type not in ('ismir2021', 'mt3'):
        raise ValueError('unknown model_type: %s' % model_type)

    if model_type == 'ismir2021':
        num_velocity_bins = 127
        encoding_spec = note_sequences.NoteEncodingSpec
        inputs_length = 512
    else:  # 'mt3'
        num_velocity_bins = 1
        encoding_spec = note_sequences.NoteEncodingWithTiesSpec
        inputs_length = 256
    self.encoding_spec = encoding_spec
    self.inputs_length = inputs_length

    gin_files = [
        '/content/mt3/gin/model.gin',
        f'/content/mt3/gin/{model_type}.gin',
    ]

    self.batch_size = 8
    self.outputs_length = 1024
    self.sequence_length = {
        'inputs': self.inputs_length,
        'targets': self.outputs_length,
    }

    self.partitioner = t5x.partitioning.ModelBasedPjitPartitioner(
        model_parallel_submesh=(1, 1, 1, 1), num_partitions=1)

    # Build Codecs and Vocabularies.
    self.spectrogram_config = spectrograms.SpectrogramConfig()
    vocab_config = vocabularies.VocabularyConfig(
        num_velocity_bins=num_velocity_bins)
    self.codec = vocabularies.build_codec(vocab_config=vocab_config)
    self.vocabulary = vocabularies.vocabulary_from_codec(self.codec)
    self.output_features = {
        'inputs': seqio.ContinuousFeature(dtype=tf.float32, rank=2),
        'targets': seqio.Feature(vocabulary=self.vocabulary),
    }

    # Create a T5X model.
    self._parse_gin(gin_files)
    self.model = self._load_model()

    # Restore from checkpoint.
    self.restore_from_checkpoint(checkpoint_path)
def add_task(dataset_name, subset_name, template_name, task_name=None,
             split_mapping=None):
    """Register seqio tasks for one prompt template of a dataset.

    Always registers a task named `task_name`. When the template has label
    strings (per `get_label_strings`), a second task named
    `task_name + "_score_eval"` is registered with a rank-classification
    preprocessor, metric, and postprocessor.

    Args:
        dataset_name: Dataset identifier used for template and split lookup.
        subset_name: Subset identifier used for template and split lookup.
        template_name: Key of the template within the dataset's templates.
        task_name: Optional task name; when falsy it is derived with
            `utils.get_task_name`.
        split_mapping: Optional mapping from task split name to dataset split
            name; when falsy, every dataset split maps to itself.
    """
    template = all_templates.get_dataset(dataset_name, subset_name)[template_name]
    if not task_name:
        task_name = utils.get_task_name(dataset_name, subset_name, template_name)

    metrics = (
        EVAL_METRICS[task_name]
        if task_name in CLEAN_EVAL_TASKS
        else [t5.evaluation.metrics.sequence_accuracy]
    )

    dataset_splits = utils.get_dataset_splits(dataset_name, subset_name)
    if not split_mapping:
        split_mapping = {split: split for split in dataset_splits.keys()}

    dataset_fn = functools.partial(
        get_tf_dataset,
        seed=None,
        dataset_name=dataset_name,
        subset_name=subset_name,
        template=template,
        split_mapping=split_mapping,
    )
    # Example counts are looked up under the mapped (dataset-side) split name.
    example_counts = {
        split: dataset_splits[split_mapping[split]].num_examples
        for split in split_mapping.keys()
    }
    data_source = seqio.FunctionDataSource(
        dataset_fn,
        splits=list(split_mapping.keys()),
        num_input_examples=example_counts,
    )

    inputs_feature = seqio.Feature(
        t5.data.get_default_vocabulary(), add_eos=False, dtype=tf.int32)
    targets_feature = seqio.Feature(
        t5.data.get_default_vocabulary(), add_eos=True, dtype=tf.int32)
    output_features = {"inputs": inputs_feature, "targets": targets_feature}

    preprocessors = [
        seqio.preprocessors.tokenize,
        seqio.preprocessors.append_eos,
        seqio.CacheDatasetPlaceholder(required=False),
    ]

    # Add train and normal eval tasks
    seqio.TaskRegistry.add(
        task_name,
        data_source,
        preprocessors=preprocessors,
        output_features=output_features,
        metric_fns=metrics,
        postprocess_fn=maybe_get_class_id_postprocessor(template),
    )

    # Add rank classification eval task
    labels = get_label_strings(template)
    if not labels:
        return

    def _repeat_inputs(ex):
        # One copy of the input per candidate label.
        return tf.fill((len(labels),), ex["inputs"])

    def _all_labels(ex):
        return labels

    def _matches_target(ex):
        return tf.equal(labels, tf.strings.strip(ex["targets"]))

    def _unit_weight(ex):
        return 1.0

    rank_preprocessor = functools.partial(
        t5.data.preprocessors.rank_classification,
        inputs_fn=_repeat_inputs,
        targets_fn=_all_labels,
        is_correct_fn=_matches_target,
        weight_fn=_unit_weight,
    )
    rank_metric = functools.partial(
        t5.evaluation.metrics.rank_classification, num_classes=len(labels))
    seqio.TaskRegistry.add(
        task_name + "_score_eval",
        data_source,
        preprocessors=[rank_preprocessor, *preprocessors],
        output_features=output_features,
        metric_fns=[rank_metric],
        postprocess_fn=t5.data.postprocessors.rank_classification,
    )
t5.data.MixtureRegistry.add("ke_t5_all_proportional", [(t, ke_t5.proc_utils.dedupe(t)) for t in _all_tasks]) t5.data.MixtureRegistry.add("ke_t5_all_equal", _all_tasks, default_rate=1.0) from ke_t5.proc_utils import KLUE_META from ke_t5.proc_utils import (base_preproc_for_classification, base_preproc_for_regression, re_preproc_for_classification, preprocess_quad, string_label_to_class_id, string_to_float) GENERATIVE_OUTPUT_FEATURES = { "inputs": seqio.Feature(vocabulary=DEFAULT_VOCAB, add_eos=False, required=False, dtype=tf.int32), "targets": seqio.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True, dtype=tf.int32) } # CLASSIFICATION_OUTPUT_FEATURES = { # "inputs": seqio.Feature( # vocabulary=DEFAULT_VOCAB, add_eos=False, required=False, dtype=tf.int32) # } # ============ KLUE topic classification: Generative ============ seqio.TaskRegistry.add( "klue_tc_gen", seqio.TfdsDataSource(tfds_name="klue/tc:1.0.0"), preprocessors=[
from t5.data import preprocessors as t5_preprocessors
from t5.evaluation import metrics as t5_metrics

# Short aliases for the seqio registries.
MixtureRegistry = seqio.MixtureRegistry
TaskRegistry = seqio.TaskRegistry

DEFAULT_SPM_PATH = "gs://t5-data/vocabs/cc_all.32000/sentencepiece.model"  # GCS
DEFAULT_EXTRA_IDS = 100

# Train-split boundary indices for the individual datasets.
NQ_TRAIN_SPLIT_START = 7830
NQ_TRAIN_SPLIT_END = 79168
NQO_TRAIN_SPLIT_END = 79168
WQ_TRAIN_SPLIT_END = 3417
TQA_TRAIN_SPLIT_END = 78785

# "inputs" and "targets" are configured identically; each gets its own
# `get_default_vocabulary()` call.
DEFAULT_OUTPUT_FEATURES = {
    feature_key: seqio.Feature(
        vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_key in ("inputs", "targets")
}

# ========================== Natural Questions =================================

# Natural Questions open domain variant that most closely matches the official
# evaluation procedure.
# The model is trained to predict all ground-truth answers
# and is only considered correct if it predicts all answers for any one of the
# annotators. As in the official evaluation, we consider questions with fewer
# than two non-null annotations unanswerable (given the context) but because we
# cannot predict unanswerability without the context, we only compute the recall
# metric. Further, because our model does not have access to the oracle context,
# we also normalize predicted and ground-truth answers when comparing them.
import tensorflow_datasets as tfds import t5_closed_book_qa.t5_cbqa.preprocessors as t5_cbqa_preprocessors TaskRegistry = seqio.TaskRegistry EN_VOCAB_SPM_PATH = "gs://t5-data/vocabs/cc_en.32000/sentencepiece.model" WMT14_CUSTOM_SPM_PATH = "gs://t5-data/vocabs/wmt_ende.37000/spm.model" WMT14_VOCAB_EXTRA_100 = seqio.SentencePieceVocabulary(WMT14_CUSTOM_SPM_PATH, extra_ids=100) EN_VOCAB_EXTRA_100 = seqio.SentencePieceVocabulary(EN_VOCAB_SPM_PATH, extra_ids=100) EN_VOCAB_OUTPUT_FEATURES = { "inputs": seqio.Feature(vocabulary=EN_VOCAB_EXTRA_100, add_eos=True), "targets": seqio.Feature(vocabulary=EN_VOCAB_EXTRA_100, add_eos=True) } #================================ English only vocab =========================== for version in ("2.2.0", "2.3.0", "2.3.1"): TaskRegistry.add( "c4_v{}_unsupervised_en32k".format(version.replace(".", "")), source=seqio.TfdsDataSource(tfds_name="c4/en:{}".format(version)), preprocessors=[ functools.partial(t5_preprocessors.rekey, key_map={ "inputs": None, "targets": "text" }), seqio.preprocessors.tokenize,
def __init__(self,
             name,
             dataset_fn,
             splits,
             text_preprocessor,
             metric_fns=None,
             postprocess_fn=None,
             token_preprocessor=None,
             output_features=None,
             num_input_examples=None,
             supports_caching=True,
             shuffle_buffer_size=SHUFFLE_BUFFER_SIZE,
             source=None):
    """Translate legacy-style task arguments into the parent constructor's form.

    Args:
        name: Task name, forwarded to the parent constructor.
        dataset_fn: Dataset function wrapped in a `seqio.FunctionDataSource`
            when `source` is not given.
        splits: Splits for the function data source.
        text_preprocessor: A preprocessor or iterable of preprocessors placed
            before tokenization.
        metric_fns: Forwarded to the parent constructor.
        postprocess_fn: A single function, or an iterable of functions that
            are applied in order.
        token_preprocessor: A preprocessor or iterable of preprocessors placed
            after tokenization (and after the cache placeholder, if any).
        output_features: A dict, a single `seqio.Feature`, a list of feature
            names, or None for the default feature on the default keys.
        num_input_examples: Example counts for the function data source.
        supports_caching: Whether to insert a `seqio.CacheDatasetPlaceholder`
            after tokenization.
        shuffle_buffer_size: Forwarded to the parent constructor.
        source: A ready-made data source; mutually exclusive with
            `dataset_fn`, `splits`, and `num_input_examples`.

    Raises:
        ValueError: If the `dataset_fn`/`source` combination is invalid, or if
            `output_features` is empty or of an unsupported type.
    """
    num_unset = (dataset_fn, source).count(None)
    if num_unset != 1:
        raise ValueError(
            "Exactly one of either `dataset_fn` or `source` must be provided.")

    if source and (splits or num_input_examples):
        raise ValueError(
            "If `source` is provided, `splits` and `num_input_examples` should "
            "not also be provided to the Task.")
    if not source:
        source = seqio.FunctionDataSource(
            dataset_fn=dataset_fn,
            splits=splits,
            num_input_examples=num_input_examples)

    def _wrap_single(step):
        # A truthy value without `__iter__` is a lone preprocessor; wrap it.
        if step and not hasattr(step, "__iter__"):
            return [step]
        return step

    text_preprocessor = _wrap_single(text_preprocessor)
    token_preprocessor = _wrap_single(token_preprocessor)

    # Pipeline order: text steps, tokenize, optional cache point, token steps,
    # then EOS handling.
    preprocessors = [*(text_preprocessor or []), seqio.preprocessors.tokenize]
    if supports_caching:
        preprocessors.append(seqio.CacheDatasetPlaceholder())
    preprocessors.extend(token_preprocessor or [])
    preprocessors.append(seqio.preprocessors.append_eos_after_trim)

    # An empty container is rejected before the None default is applied.
    if hasattr(output_features, "__len__") and not output_features:
        raise ValueError("output_features must be non-empty.")
    if output_features is None:
        output_features = seqio.Feature(utils.get_default_vocabulary())

    if not isinstance(output_features, dict):
        if isinstance(output_features, seqio.Feature):
            # The same Feature object is shared by every default key.
            shared_feature = output_features
            output_features = {
                key: shared_feature for key in _DEFAULT_FEATURE_KEYS
            }
        elif (isinstance(output_features, list)
              and all(isinstance(item, str) for item in output_features)):
            feature_names = output_features
            output_features = {
                key: seqio.Feature(utils.get_default_vocabulary())
                for key in feature_names
            }
        else:
            raise ValueError(
                "output_features must be a dict, Feature, list of str, or None")

    if hasattr(postprocess_fn, "__iter__"):
        postprocess_fns = postprocess_fn

        def postprocess_fn(x, **postprocess_kwargs):  # pylint:disable=function-redefined
            # Feed the output of each function into the next one.
            result = x
            for fn in postprocess_fns:
                result = fn(result, **postprocess_kwargs)
            return result

    super().__init__(
        name=name,
        source=source,
        output_features=output_features,
        preprocessors=preprocessors,
        postprocess_fn=postprocess_fn,
        metric_fns=metric_fns,
        shuffle_buffer_size=shuffle_buffer_size)
DEFAULT_PREPROCESSORS = [ seqio.preprocessors.tokenize, seqio.CacheDatasetPlaceholder(), seqio.preprocessors.append_eos_after_trim, ] # only include edit_rouge now; skip metrics.print_predictions, # metrics.surface_recall and metrics.exact_match for now. DEFAULT_METRIC_FNS = [ metrics.edit_rouge, ] DEFAULT_OUTPUT_FEATURES = { "inputs": seqio.Feature(vocabulary=t5.data.get_default_vocabulary(), add_eos=True, required=True), "targets": seqio.Feature(vocabulary=t5.data.get_default_vocabulary(), add_eos=True) } def _register_w_defaults( name, split_to_filepattern, task, delimiter_type, ): """Register a WikiDiff task w/ default params.""" delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(
from t5.data import preprocessors from t5.data.glue_utils import get_glue_metric from t5.data.glue_utils import get_glue_postprocess_fn from t5.data.glue_utils import get_glue_text_preprocessor from t5.data.glue_utils import get_super_glue_metric from t5.evaluation import metrics import tensorflow_datasets as tfds TaskRegistry = seqio.TaskRegistry TfdsTask = t5.data.TfdsTask DEFAULT_OUTPUT_FEATURES = { "inputs": seqio.Feature( vocabulary=t5.data.get_default_vocabulary(), add_eos=True, required=False), "targets": seqio.Feature( vocabulary=t5.data.get_default_vocabulary(), add_eos=True) } # ==================================== C4 ====================================== # Final pretraining task used in Raffel et al., 2019. TaskRegistry.add( "c4_v220_span_corruption", source=seqio.TfdsDataSource(tfds_name="c4/en:2.2.0"), preprocessors=[ functools.partial( preprocessors.rekey, key_map={ "inputs": None, "targets": "text"