コード例 #1
0
class Wmt15Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2015 translation task ({xx, "en"} pairs).

    Two configs are declared for every pair in `_LANGUAGE_PAIRS`: a plain
    one, followed by one that carries a "subwords8k" `SubwordTextEncoder`
    config with a vocabulary size of 2**13.
    """

    # Every config defaults to version 1.0.0 and additionally lists 0.0.4,
    # which has the S3 experiment set to False.
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2015 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version=tfds.core.Version("1.0.0"),
            supported_versions=[
                tfds.core.Version(
                    "0.0.4",
                    experiments={tfds.core.Experiment.S3: False},
                ),
            ],
        )
        for first, second in _LANGUAGE_PAIRS
    ] + [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description=(
                "WMT 2015 %s-%s translation task dataset with subword encoding."
                % (first, second)),
            url=_URL,
            citation=_CITATION,
            text_encoder_config=tfds.features.text.TextEncoderConfig(
                encoder_cls=tfds.features.text.SubwordTextEncoder,
                name="subwords8k",
                vocab_size=2**13,
            ),
            version=tfds.core.Version("1.0.0"),
            supported_versions=[
                tfds.core.Version(
                    "0.0.4",
                    experiments={tfds.core.Experiment.S3: False},
                ),
            ],
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        train_names = [
            "europarl_v7",
            "europarl_v8_16",
            "commoncrawl",
            "multiun",
            "newscommentary_v10",
            "gigafren",
            "czeng_10",
            "yandexcorpus",
            "wikiheadlines_fi",
            "wikiheadlines_ru",
        ]
        validation_names = [
            "newsdev2015",
            "newsdiscussdev2015",
            "newstest2014",
        ]
        test_names = ["newstest2015", "newsdiscusstest2015"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #2
0
class Wmt17Translate(wmt.WmtTranslate):
  """Dataset builder for the WMT 2017 translation task ({xx, "en"} pairs).

  One config is declared for every pair in `_LANGUAGE_PAIRS`.
  """

  # Every config uses version 0.0.3 with the S3 experiment set to False.
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(first, second),
          description="WMT 2017 %s-%s translation task dataset." % (
              first, second),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version(
              "0.0.3",
              experiments={tfds.core.Experiment.S3: False},
          ),
      )
      for first, second in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict mapping each split to the subset names it uses."""
    # The CWMT subset names are appended after the explicitly listed ones.
    train_names = [
        "europarl_v7",
        "europarl_v8_16",
        "commoncrawl",
        "newscommentary_v12",
        "czeng_16",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
        "uncorpus_v1",
        "rapid_2016",
        "leta_v1",
        "dcep_v1",
        "onlinebooks_v1",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["newsdev2017", "newstest2016", "newstestB2016"]
    test_names = ["newstest2017", "newstestB2017"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
コード例 #3
0
ファイル: wmt18.py プロジェクト: joseph-zhong/datasets
class Wmt18Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2018 translation task ({xx, "en"} pairs).

    One config is declared for every pair in `_LANGUAGE_PAIRS`.
    """

    # The version is passed as the plain string "0.0.2".
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2018 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version="0.0.2",
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        # The CWMT subset names are appended after the explicitly listed ones.
        train_names = [
            "europarl_v7",
            "europarl_v8_18",
            "paracrawl_v1",
            "commoncrawl",
            "newscommentary_v13",
            "czeng_17",
            "yandexcorpus",
            "wikiheadlines_fi",
            "wikiheadlines_ru",
            "setimes_2",
            "uncorpus_v1",
            "rapid_2016",
        ] + wmt.CWMT_SUBSET_NAMES
        validation_names = [
            "newsdev2014",
            "newsdev2015",
            "newsdev2016",
            "newsdev2017",
            "newsdev2018",
            "newsdiscussdev2015",
            "newsdiscusstest2015",
            "newssyscomb2009",
            "newstest2008",
            "newstest2009",
            "newstest2010",
            "newstest2011",
            "newstest2012",
            "newstest2013",
            "newstest2014",
            "newstest2015",
            "newstest2016",
            "newstestB2016",
            "newstest2017",
            "newstestB2017",
        ]
        test_names = ["newstest2018"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #4
0
ファイル: wmt_t2t.py プロジェクト: yousefmasry4/datasets
class WmtT2tTranslate(wmt.WmtTranslate):
  """Builder for the WMT EnDe dataset used by the Tensor2Tensor library.

  A single config is declared, for the ("de", "en") language pair.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(
          language_pair=("de", "en"),
          description="WMT T2T EnDe translation task dataset.",
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      ),
  ]

  @property
  def _subsets(self):
    """Returns a dict mapping each split to the subset names it uses."""
    train_names = ["europarl_v7", "commoncrawl", "newscommentary_v13"]
    validation_names = ["newstest2013"]
    test_names = ["newstest2014"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
コード例 #5
0
class Wmt13Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2013 translation task ({xx, "en"} pairs).

    One config is declared for every pair in `_LANGUAGE_PAIRS`.
    """

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2013 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version=tfds.core.Version("1.0.0"),
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        train_names = [
            "europarl_v7",
            "commoncrawl",
            "multiun",
            "newscommentary_v8",
            "gigafren",
            "wikiheadlines_ru",
            "yandexcorpus",
            "czeng_10",
        ]
        validation_names = [
            "newstest2012",
            "newstest2011",
            "newstest2010",
            "newstest2009",
            "newstest2008",
            "newssyscomb2009",
        ]
        test_names = ["newstest2013"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #6
0
ファイル: wmt18.py プロジェクト: newcooldiscoveries/datasets
class Wmt18Translate(wmt.WmtTranslate):
  """Dataset builder for the WMT 2018 translation task ({xx, "en"} pairs).

  One config is declared for every pair in `_LANGUAGE_PAIRS`.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(first, second),
          description="WMT 2018 %s-%s translation task dataset." % (
              first, second),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      )
      for first, second in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict mapping each split to the subset names it uses."""
    # The CWMT subset names are appended after the explicitly listed ones.
    train_names = [
        "europarl_v7",
        "europarl_v8_18",
        "paracrawl_v1",
        "commoncrawl",
        "newscommentary_v13",
        "czeng_17",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
        "uncorpus_v1",
        "rapid_2016",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["newsdev2018", "newstest2017", "newstestB2017"]
    test_names = ["newstest2018"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
コード例 #7
0
class Wmt14Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2014 translation task ({xx, "en"} pairs).

    One config is declared for every pair in `_LANGUAGE_PAIRS`.
    """

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.0.3: Initial version.
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2014 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version=tfds.core.Version("1.0.0"),
            supported_versions=[
                tfds.core.Version(
                    "0.0.3",
                    experiments={tfds.core.Experiment.S3: False},
                ),
            ],
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        train_names = [
            "europarl_v7",
            "commoncrawl",
            "multiun",
            "newscommentary_v9",
            "gigafren",
            "czeng_10",
            "yandexcorpus",
            "wikiheadlines_hi",
            "wikiheadlines_ru",
            "hindencorp_01",
        ]
        validation_names = ["newsdev2014", "newstest2013"]
        test_names = ["newstest2014"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #8
0
class Wmt16Translate(wmt.WmtTranslate):
  """Dataset builder for the WMT 2016 translation task ({xx, "en"} pairs).

  One config is declared for every pair in `_LANGUAGE_PAIRS`.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(first, second),
          description="WMT 2016 %s-%s translation task dataset." % (
              first, second),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      )
      for first, second in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict mapping each split to the subset names it uses."""
    train_names = [
        "europarl_v7",
        "europarl_v8_16",
        "commoncrawl",
        "newscommentary_v11",
        "czeng_16pre",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
    ]
    validation_names = ["newsdev2016", "newstest2015"]
    test_names = ["newstest2016", "newstestB2016"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
コード例 #9
0
class Wmt14Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2014 translation task ({xx, "en"} pairs).

    One config is declared for every pair in `_LANGUAGE_PAIRS`.
    """

    # The version is passed as the plain string "0.0.1".
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2014 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version="0.0.1",
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        train_names = [
            "europarl_v7",
            "commoncrawl",
            "multiun",
            "newscommentary_v9",
            "gigafren",
            "czeng_10",
            "yandexcorpus",
            "wikiheadlines_hi",
            "wikiheadlines_ru",
            "hindencorp_01",
        ]
        validation_names = [
            "newsdev2014",
            "newssyscomb2009",
            "newstest2008",
            "newstest2009",
            "newstest2010",
            "newstest2011",
            "newstest2012",
            "newstest2013",
        ]
        test_names = ["newstest2014"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #10
0
ファイル: wmt19.py プロジェクト: wangjianfeng7/datasets
class Wmt19Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2019 translation task.

    Covers the {(xx, "en")} pairs plus ("fr", "de"); one config is declared
    for every pair in `_LANGUAGE_PAIRS`.
    """

    # The version is passed as the plain string "0.0.3".
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2019 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version="0.0.3",
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses.

        Only the train and validation splits are listed here.
        """
        # The CWMT subset names are appended after the explicitly listed ones.
        train_names = [
            "europarl_v9",
            "europarl_v7_frde",
            "paracrawl_v3",
            "paracrawl_v1_ru",
            "paracrawl_v3_frde",
            "commoncrawl",
            "commoncrawl_frde",
            "newscommentary_v14",
            "newscommentary_v14_frde",
            "czeng_17",
            "yandexcorpus",
            "wikititles_v1",
            "uncorpus_v1",
            "rapid_2016_ltfi",
            "rapid_2019",
        ] + wmt.CWMT_SUBSET_NAMES
        validation_names = [
            "euelections_dev2019",
            "newsdev2019",
            "newstest2018",
        ]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
        }
コード例 #11
0
class WmtT2tTranslate(wmt.WmtTranslate):
    """Builder for the WMT EnDe dataset used by the Tensor2Tensor library.

    A single config is declared, for the ("de", "en") language pair.
    """

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.0.1: Initial version.
    #
    # The config's `version` is 0.0.1 (S3 experiment set to False); 1.0.0 is
    # listed under `supported_versions`.
    BUILDER_CONFIGS = [
        wmt.WmtConfig(
            language_pair=("de", "en"),
            description="WMT T2T EnDe translation task dataset.",
            url=_URL,
            citation=_CITATION,
            version=tfds.core.Version(
                "0.0.1",
                experiments={tfds.core.Experiment.S3: False},
            ),
            supported_versions=[
                tfds.core.Version("1.0.0"),
            ],
        ),
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses."""
        train_names = ["europarl_v7", "commoncrawl", "newscommentary_v13"]
        validation_names = ["newstest2013"]
        test_names = ["newstest2014"]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
            tfds.Split.TEST: test_names,
        }
コード例 #12
0
class Wmt19Translate(wmt.WmtTranslate):
    """Dataset builder for the WMT 2019 translation task.

    Covers the {(xx, "en")} pairs plus ("fr", "de"); one config is declared
    for every pair in `_LANGUAGE_PAIRS`.
    """

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.0.3: Initial version.
    BUILDER_CONFIGS = [
        wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
            language_pair=(first, second),
            description="WMT 2019 %s-%s translation task dataset." % (
                first, second),
            url=_URL,
            citation=_CITATION,
            version=tfds.core.Version("1.0.0"),
            supported_versions=[
                tfds.core.Version(
                    "0.0.3",
                    experiments={tfds.core.Experiment.S3: False},
                ),
            ],
        )
        for first, second in _LANGUAGE_PAIRS
    ]

    @property
    def _subsets(self):
        """Returns a dict mapping each split to the subset names it uses.

        Only the train and validation splits are listed here.
        """
        # The CWMT subset names are appended after the explicitly listed ones.
        train_names = [
            "europarl_v9",
            "europarl_v7_frde",
            "paracrawl_v3",
            "paracrawl_v1_ru",
            "paracrawl_v3_frde",
            "commoncrawl",
            "commoncrawl_frde",
            "newscommentary_v14",
            "newscommentary_v14_frde",
            "czeng_17",
            "yandexcorpus",
            "wikititles_v1",
            "uncorpus_v1",
            "rapid_2016_ltfi",
            "rapid_2019",
        ] + wmt.CWMT_SUBSET_NAMES
        validation_names = [
            "euelections_dev2019",
            "newsdev2019",
            "newstest2018",
        ]
        return {
            tfds.Split.TRAIN: train_names,
            tfds.Split.VALIDATION: validation_names,
        }
コード例 #13
0
    def setUpClass(cls):
        """Runs the parent class setup, then installs one custom WMT config.

        After the parent `setUpClass` has run, `wmt.WmtTranslate.BUILDER_CONFIGS`
        is replaced by a one-element list holding a config named "small" for
        the ("cs", "en") pair, with explicitly chosen train/validation subsets.
        """
        super(TranslateWmtCustomConfigTest, cls).setUpClass()

        # Subset names selected for each split of the custom config.
        custom_subsets = {
            "train": ["paracrawl_v3"],
            "validation": ["newstest2009", "newstest2010"],
        }
        wmt.WmtTranslate.BUILDER_CONFIGS = [
            wmt.WmtConfig(
                name="small",
                language_pair=("cs", "en"),
                description="Example of custom config",
                subsets=custom_subsets,
                version=tfds.core.Version("1.0.0"),
            ),
        ]