Exemple #1
0
# Default feature spec shared by the tasks below: "inputs" and "targets" each
# get their own Feature built from a separate get_default_vocabulary() call,
# both with add_eos=True.
DEFAULT_OUTPUT_FEATURES = {
    feature_name: Feature(vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_name in ("inputs", "targets")
}

# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
    # The task name uses underscores where the TFDS config suffix uses dots.
    _c4_task_name = "c4{name}_v020_unsupervised".format(
        name=config_suffix.replace(".", "_"))
    _c4_tfds_name = "c4/en{config}:2.2.0".format(config=config_suffix)
    # key_map handed to preprocessors.rekey: "targets" is keyed to "text",
    # "inputs" is keyed to None.
    _c4_text_preprocessor = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"})
    TaskRegistry.add(
        _c4_task_name,
        TfdsTask,
        tfds_name=_c4_tfds_name,
        text_preprocessor=_c4_text_preprocessor,
        token_preprocessor=preprocessors.unsupervised,
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[])

# ================================ Wikipedia ===================================
TaskRegistry.add("wikipedia_20190301.en_v003_unsupervised",
                 TfdsTask,
                 tfds_name="wikipedia/20190301.en:1.0.0",
                 text_preprocessor=functools.partial(preprocessors.rekey,
                                                     key_map={
                                                         "inputs": None,
                                                         "targets": "text"
# Default feature spec shared by the tasks below: "inputs" and "targets" each
# get their own Feature built from a separate get_default_vocabulary() call,
# both with add_eos=True.
DEFAULT_OUTPUT_FEATURES = {
    feature_name: Feature(vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_name in ("inputs", "targets")
}

# ==================================== C4 ======================================
# Configurable tasks used for comparisons in Raffel et al., 2019.
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
    # The task name uses underscores where the TFDS config suffix uses dots.
    _c4_task_name = "c4{name}_v020_unsupervised".format(
        name=config_suffix.replace(".", "_"))
    _c4_tfds_name = "c4/en{config}:2.2.0".format(config=config_suffix)
    # key_map handed to preprocessors.rekey: "targets" is keyed to "text",
    # "inputs" is keyed to None.
    _c4_text_preprocessor = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"})
    TaskRegistry.add(
        _c4_task_name,
        TfdsTask,
        tfds_name=_c4_tfds_name,
        text_preprocessor=_c4_text_preprocessor,
        token_preprocessor=preprocessors.unsupervised,
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[])

# Final pretraining task used in Raffel et al., 2019.
TaskRegistry.add(
    "c4_v220_span_corruption",
    TfdsTask,
    tfds_name="c4/en:2.2.0".format(config=config_suffix),
    text_preprocessor=functools.partial(preprocessors.rekey,
                                        key_map={
                                            "inputs": None,
Exemple #3
0
from t5.data import preprocessors
from t5.data.utils import DEFAULT_SPM_PATH
from t5.data.utils import TaskRegistry
from t5.data.utils import TfdsTask
from t5.evaluation import metrics

# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]
for config_suffix in _c4_config_suffixes:
    # The task name uses underscores where the TFDS config suffix uses dots.
    _c4_task_name = "c4{name}_v020_unsupervised".format(
        name=config_suffix.replace(".", "_"))
    _c4_tfds_name = "c4/en{config}:1.0.0".format(config=config_suffix)
    # key_map handed to preprocessors.rekey: "targets" is keyed to "text",
    # "inputs" is keyed to None.
    _c4_text_preprocessor = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"})
    TaskRegistry.add(
        _c4_task_name,
        TfdsTask,
        tfds_name=_c4_tfds_name,
        text_preprocessor=_c4_text_preprocessor,
        token_preprocessor=preprocessors.unsupervised,
        sentencepiece_model_path=DEFAULT_SPM_PATH,
        metric_fns=[])

# ================================ Wikipedia ===================================
TaskRegistry.add(
    "wikipedia_20190301.en_v003_unsupervised",
    TfdsTask,
    # 0.0.4 is identical to 0.0.3 except empty records removed.
    tfds_name="wikipedia/20190301.en:0.0.4",
    text_preprocessor=functools.partial(preprocessors.rekey,
                                        key_map={