Example 1
class Esnli(nlp.GeneratorBasedBuilder):
    """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""

    # Version History
    # 0.0.2 Added explanation_2, explanation_3 fields which exist in the dev/test
    # splits only.
    # 0.0.1 Initial version
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(name="plain_text", version=nlp.Version("0.0.2"), description="Plain text import of e-SNLI",)
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                    "explanation_1": nlp.Value("string"),
                    "explanation_2": nlp.Value("string"),
                    "explanation_3": nlp.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/OanaMariaCamburu/e-SNLI",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        files = dl_manager.download_and_extract(
            {
                "train": [os.path.join(_URL, "esnli_train_1.csv"), os.path.join(_URL, "esnli_train_2.csv")],
                "validation": [os.path.join(_URL, "esnli_dev.csv")],
                "test": [os.path.join(_URL, "esnli_test.csv")],
            }
        )

        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files["train"]},),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"files": files["validation"]},),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"files": files["test"]},),
        ]

    def _generate_examples(self, files):
        """Yields examples."""
        for filepath in files:
            with open(filepath) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    yield row["pairID"], {
                        "premise": row["Sentence1"],
                        "hypothesis": row["Sentence2"],
                        "label": row["gold_label"],
                        "explanation_1": row["Explanation_1"],
                        "explanation_2": row.get("Explanation_2", ""),
                        "explanation_3": row.get("Explanation_3", ""),
                    }
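Once a builder script like the one above is installed under the library's dataset scripts, it can be loaded by name. A minimal usage sketch, assuming the script is registered as "esnli" (the registered name is not shown in the snippet):

import nlp

# Hypothetical script name; the splits come from the SplitGenerators defined above.
dataset = nlp.load_dataset("esnli")
print(dataset["train"][0]["premise"])
print(dataset["validation"][0]["explanation_2"])  # extra explanations exist only in dev/test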
Example 2
class Snli(nlp.GeneratorBasedBuilder):
    """The Stanford Natural Language Inference (SNLI) Corpus."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of SNLI",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                }
            ),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage="https://nlp.stanford.edu/projects/snli/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, "snli_1.0")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_test.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_dev.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_train.txt")}
            ),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # A gold label of "-" means the annotators did not reach consensus; map it to -1 (no label).
                label = -1 if row["gold_label"] == "-" else row["gold_label"]
                yield idx, {
                    "premise": row["sentence1"],
                    "hypothesis": row["sentence2"],
                    "label": label,
                }
Example 3
class TedMultiTranslate(nlp.GeneratorBasedBuilder):
    """TED talk multilingual data set."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of multilingual TED talk translations",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "translations": nlp.features.TranslationVariableLanguages(languages=_LANGUAGES),
                    "talk_name": nlp.Value("string"),
                }
            ),
            homepage="https://github.com/neulab/word-embeddings-for-nmt",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_train.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_dev.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_test.tsv")}
            ),
        ]

    def _generate_examples(self, data_file):
        """This function returns the examples in the raw (text) form."""
        with open(data_file) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # Everything in the row except for 'talk_name' will be a translation.
                # Missing/incomplete translations will contain the string "__NULL__" or
                # "_ _ NULL _ _".
                yield idx, {
                    "translations": {
                        lang: text
                        for lang, text in six.iteritems(row)
                        if lang != "talk_name" and _is_translation_complete(text)
                    },
                    "talk_name": row["talk_name"],
                }
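The helper _is_translation_complete used above is not shown in this snippet. A minimal sketch, assuming it only needs to reject the placeholder strings mentioned in the comment:

def _is_translation_complete(text):
    # Sketch: treat a translation as complete if it is non-empty and does not
    # contain either of the "missing translation" placeholders noted above.
    return bool(text) and "__NULL__" not in text and "_ _ NULL _ _" not in text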
Example 4
class DefinitePronounResolution(nlp.GeneratorBasedBuilder):
    """The Definite Pronoun Resolution Dataset."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of the Definite Pronoun Resolution Dataset.",  # pylint: disable=line-too-long
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "sentence": nlp.Value("string"),
                    "pronoun": nlp.Value("string"),
                    "candidates": nlp.features.Sequence(nlp.Value("string"), length=2),
                    "label": nlp.features.ClassLabel(num_classes=2),
                }
            ),
            supervised_keys=("sentence", "label"),
            homepage="http://www.hlt.utdallas.edu/~vince/data/emnlp12/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        files = dl_manager.download_and_extract(
            {"train": _DATA_URL_PATTERN.format("train"), "test": _DATA_URL_PATTERN.format("test"),}
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": files["test"]}),
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": files["train"]}),
        ]

    def _generate_examples(self, filepath):
        with open(filepath) as f:
            line_num = -1
            while True:
                line_num += 1
                sentence = f.readline().strip()
                pronoun = f.readline().strip()
                candidates = [c.strip() for c in f.readline().strip().split(",")]
                correct = f.readline().strip()
                f.readline()
                if not sentence:
                    break
                yield line_num, {
                    "sentence": sentence,
                    "pronoun": pronoun,
                    "candidates": candidates,
                    "label": candidates.index(correct),
                }
Example 5
def _generate_builder_configs():
  """Generate configs with different subsets of mathematics dataset."""
  configs = []
  for module in sorted(set(_MODULES)):
    configs.append(
        nlp.BuilderConfig(
            name=module,
            version=nlp.Version("1.0.0"),
            description=_DESCRIPTION,
        ))

  return configs
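In the other examples BUILDER_CONFIGS is written out by hand; a generator like this one is simply assigned to that attribute instead. A sketch, with the class name chosen purely for illustration:

class MathematicsDataset(nlp.GeneratorBasedBuilder):  # illustrative name, not from the snippet
  """Mathematics dataset builder with one config per module."""

  BUILDER_CONFIGS = _generate_builder_configs()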
Example 6
class Xnli(nlp.GeneratorBasedBuilder):
    """XNLI: The Cross-Lingual NLI Corpus. Version 1.0."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version(
                "1.0.0",
                "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of XNLI",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                "premise":
                nlp.features.Translation(languages=_LANGUAGES, ),
                "hypothesis":
                nlp.features.TranslationVariableLanguages(
                    languages=_LANGUAGES, ),
                "label":
                nlp.features.ClassLabel(
                    names=["entailment", "neutral", "contradiction"]),
            }),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage="https://www.nyu.edu/projects/bowman/xnli/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, "XNLI-1.0")
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST,
                               gen_kwargs={
                                   "filepath":
                                   os.path.join(data_dir, "xnli.test.tsv")
                               }),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION,
                               gen_kwargs={
                                   "filepath":
                                   os.path.join(data_dir, "xnli.dev.tsv")
                               }),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        rows_per_pair_id = collections.defaultdict(list)

        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for row in reader:
                rows_per_pair_id[row["pairID"]].append(row)

        for rows in six.itervalues(rows_per_pair_id):
            premise = {row["language"]: row["sentence1"] for row in rows}
            hypothesis = {row["language"]: row["sentence2"] for row in rows}
            yield rows[0]["pairID"], {
                "premise": premise,
                "hypothesis": hypothesis,
                "label": rows[0]["gold_label"],
            }
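For reference, each yielded example bundles every available language for one pairID; the values below are purely illustrative and truncated to two languages:

# Illustrative shape of one yielded (key, example) pair; all strings are hypothetical.
(
    "pair_0001",
    {
        "premise": {"en": "A man is eating.", "fr": "Un homme mange."},
        "hypothesis": {"en": "Someone is eating.", "fr": "Quelqu'un mange."},
        "label": "entailment",
    },
)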
Example 7
class StyleChangeDetection(nlp.GeneratorBasedBuilder):
    """Style Change Detection Dataset from PAN20"""

    VERSION = nlp.Version("1.0.0")
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="narrow",
            version=nlp.Version("1.0.0", "Version 1"),
            description=
            "The narrow subset contains texts from a relatively narrow set of subjects matters (all related to technology).",
        ),
        nlp.BuilderConfig(
            name="wide",
            version=nlp.Version("1.0.0", "Version 1"),
            description=
            "The wide subset adds additional subject areas (travel, philosophy, economics, history, etc.).",
        ),
    ]

    @property
    def manual_download_instructions(self):
        return """\
  You should download the dataset from https://zenodo.org/record/3660984
  The dataset needs requesting.

  Download each file, extract it and place in a dir of your choice,
  which will be used as a manual_dir, e.g. `~/.manual_dirs/style_change_detection`
  Style Change Detection can then be loaded via:
  `nlp.load_dataset("style_change_detection", data_dir="~/.manual_dirs/style_change_detection")`.
  """

    def _info(self):
        features = {
            "id": nlp.Value("string"),
            "text": nlp.Value("string"),
            "authors": nlp.Value("int32"),
            "structure": nlp.features.Sequence(nlp.Value("string")),
            "site": nlp.Value("string"),
            "multi-author": nlp.Value("bool"),
            "changes": nlp.features.Sequence(nlp.Value("bool")),
        }

        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(features),
            homepage=
            "https://pan.webis.de/clef20/pan20-web/style-change-detection.html",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        train_dir = os.path.join(data_dir, "train",
                                 "dataset-" + self.config.name)
        val_dir = os.path.join(data_dir, "validation",
                               "dataset-" + self.config.name)

        if not os.path.exists(train_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `nlp.load_dataset('style_change_detection', data_dir=...)` that includes {}. Manual download instructions: {}"
                .format(train_dir, train_dir,
                        self.manual_download_instructions))

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "articles":
                    [f for f in os.listdir(train_dir) if f.endswith(".txt")],
                    "base_dir":
                    train_dir,
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={
                    "articles":
                    [f for f in os.listdir(val_dir) if f.endswith(".txt")],
                    "base_dir":
                    val_dir
                },
            ),
        ]

    def _generate_examples(self, articles=None, base_dir=None):
        """Yields examples."""
        for idx, article_filename in enumerate(articles):
            label_path = os.path.join(
                base_dir, "truth-" + article_filename[:-4] + ".json")
            with open(label_path) as f:
                example = json.load(f)
                example["id"] = article_filename[8:-4]
                example["text"] = open(os.path.join(base_dir,
                                                    article_filename)).read()

                # Convert integers into boolean
                example["multi-author"] = example["multi-author"] == 1
                for i in range(len(example["changes"])):
                    example["changes"][i] = example["changes"][i] == 1

                yield idx, example
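The truth-*.json files themselves are not shown here; from the parsing above, each must supply the remaining feature keys. An illustrative (hypothetical) example of the dict that json.load returns:

# All values below are hypothetical; only the keys and types follow from the code above.
truth = {
    "authors": 2,               # int32 feature
    "structure": ["A1", "A2"],  # Sequence of strings
    "site": "stackexchange",    # string
    "multi-author": 1,          # raw 0/1 int, converted to bool by _generate_examples
    "changes": [0, 1, 0],       # raw 0/1 ints, each converted to bool by _generate_examples
}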
Example 8
class Snli(nlp.GeneratorBasedBuilder):
    """The Stanford Natural Language Inference (SNLI) Corpus."""
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name='plain_text',
            version=nlp.Version(
                '1.0.0',
                'New split API (https://tensorflow.org/datasets/splits)'),
            description='Plain text import of SNLI',
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                'premise':
                nlp.Value('string'),
                'hypothesis':
                nlp.Value('string'),
                'label':
                nlp.features.ClassLabel(
                    names=['entailment', 'neutral', 'contradiction']),
            }),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage='https://nlp.stanford.edu/projects/snli/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, 'snli_1.0')
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST,
                               gen_kwargs={
                                   'filepath':
                                   os.path.join(data_dir, 'snli_1.0_test.txt')
                               }),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION,
                               gen_kwargs={
                                   'filepath':
                                   os.path.join(data_dir, 'snli_1.0_dev.txt')
                               }),
            nlp.SplitGenerator(name=nlp.Split.TRAIN,
                               gen_kwargs={
                                   'filepath':
                                   os.path.join(data_dir, 'snli_1.0_train.txt')
                               }),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # A gold label of '-' means the annotators did not reach consensus; map it to -1 (no label).
                label = -1 if row['gold_label'] == '-' else row['gold_label']
                yield idx, {
                    'premise': row['sentence1'],
                    'hypothesis': row['sentence2'],
                    'label': label,
                }
Example 9
class HyperpartisanNewsDetection(nlp.GeneratorBasedBuilder):
    """Hyperpartisan News Detection Dataset."""

    VERSION = nlp.Version("1.0.0")
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="byarticle",
            version=nlp.Version("1.0.0", "Version Training and validation v1"),
            description=textwrap.dedent("""
                    This part of the data (filename contains "byarticle") is labeled through crowdsourcing on an article basis.
                    The data contains only articles for which a consensus among the crowdsourcing workers existed. It contains
                    a total of 645 articles. Of these, 238 (37%) are hyperpartisan and 407 (63%) are not. We will use a similar
                    (but balanced!) test set. Again, none of the publishers in this set will occur in the test set.
                """),
        ),
        nlp.BuilderConfig(
            name="bypublisher",
            version=nlp.Version("1.0.0", "Version Training and validation v1"),
            description=textwrap.dedent("""
                    This part of the data (filename contains "bypublisher") is labeled by the overall bias of the publisher as provided
                    by BuzzFeed journalists or MediaBiasFactCheck.com. It contains a total of 750,000 articles, half of which (375,000)
                    are hyperpartisan and half of which are not. Half of the articles that are hyperpartisan (187,500) are on the left side
                    of the political spectrum, half are on the right side. This data is split into a training set (80%, 600,000 articles) and
                    a validation set (20%, 150,000 articles), where no publisher that occurs in the training set also occurs in the validation
                    set. Similarly, none of the publishers in those sets will occur in the test set.
                """),
        ),
    ]

    @property
    def manual_download_instructions(self):
        return """\
  You should download the dataset from https://zenodo.org/record/1489920
  The dataset needs requesting.

  Download each file, extract it and place in a dir of your choice,
  which will be used as a manual_dir, e.g. `~/.manual_dirs/hyperpartisan_news_detection`
  Hyperpartisan News Detection can then be loaded via:
  `nlp.load_dataset("hyperpartisan_news_detection", data_dir="~/.manual_dirs/hyperpartisan_news_detection")`.
  """

    def _info(self):
        features = {
            "text": nlp.Value("string"),
            "title": nlp.Value("string"),
            "hyperpartisan": nlp.Value("bool"),
            "url": nlp.Value("string"),
            "published_at": nlp.Value("string"),
        }

        if self.config.name == "bypublisher":
            # Bias is only included in the bypublisher config
            features["bias"] = nlp.ClassLabel(names=[
                "right", "right-center", "least", "left-center", "left"
            ])

        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(features),
            supervised_keys=("text", "label"),
            homepage="https://pan.webis.de/semeval19/semeval19-web/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        splits = [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={
                    "articles_file":
                    os.path.join(
                        data_dir, "articles-training-" + self.config.name +
                        "-20181122.xml"),
                    "labels_file":
                    os.path.join(
                        data_dir, "ground-truth-training-" + self.config.name +
                        "-20181122.xml"),
                },
            )
        ]
        if self.config.name == "bypublisher":
            splits.append(
                nlp.SplitGenerator(
                    name=nlp.Split.VALIDATION,
                    gen_kwargs={
                        "articles_file":
                        os.path.join(
                            data_dir, "articles-validation-" +
                            self.config.name + "-20181122.xml"),
                        "labels_file":
                        os.path.join(
                            data_dir, "ground-truth-validation-" +
                            self.config.name + "-20181122.xml"),
                    },
                ))
        return splits

    def _generate_examples(self, articles_file=None, labels_file=None):
        """Yields examples."""

        labels = {}

        with open(labels_file, "rb") as f_labels:
            tree = ET.parse(f_labels)
            root = tree.getroot()
            for label in root:
                article_id = label.attrib["id"]
                del label.attrib["labeled-by"]
                labels[article_id] = label.attrib

        with open(articles_file, "rb") as f_articles:
            tree = ET.parse(f_articles)
            root = tree.getroot()
            for idx, article in enumerate(root):
                example = {}
                example["title"] = article.attrib["title"]
                example["published_at"] = article.attrib.get(
                    "published-at", "")
                example["id"] = article.attrib["id"]
                example = {**example, **labels[example["id"]]}
                example["hyperpartisan"] = example["hyperpartisan"] == "true"

                example["text"] = ""
                for child in article:
                    example["text"] += ET.tostring(child).decode() + "\n"
                example["text"] = example["text"].strip()
                del example["id"]
                yield idx, example
Example 10
class Xnli(nlp.GeneratorBasedBuilder):
    """XNLI: The Cross-Lingual NLI Corpus. Version 1.0."""
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name='plain_text',
            version=nlp.Version(
                '1.0.0',
                'New split API (https://tensorflow.org/datasets/splits)'),
            description='Plain text import of XNLI',
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                'premise':
                nlp.features.Translation(languages=_LANGUAGES, ),
                'hypothesis':
                nlp.features.TranslationVariableLanguages(
                    languages=_LANGUAGES, ),
                'label':
                nlp.features.ClassLabel(
                    names=['entailment', 'neutral', 'contradiction']),
            }),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage='https://www.nyu.edu/projects/bowman/xnli/',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, 'XNLI-1.0')
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST,
                               gen_kwargs={
                                   'filepath':
                                   os.path.join(data_dir, 'xnli.test.tsv')
                               }),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION,
                               gen_kwargs={
                                   'filepath':
                                   os.path.join(data_dir, 'xnli.dev.tsv')
                               }),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        rows_per_pair_id = collections.defaultdict(list)

        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                rows_per_pair_id[row['pairID']].append(row)

        for rows in six.itervalues(rows_per_pair_id):
            premise = {row['language']: row['sentence1'] for row in rows}
            hypothesis = {row['language']: row['sentence2'] for row in rows}
            yield rows[0]['pairID'], {
                'premise': premise,
                'hypothesis': hypothesis,
                'label': rows[0]['gold_label'],
            }
Example 11
class DefinitePronounResolution(nlp.GeneratorBasedBuilder):
  """The Definite Pronoun Resolution Dataset."""
  BUILDER_CONFIGS = [
      nlp.BuilderConfig(
          name='plain_text',
          version=nlp.Version(
              '1.0.0',
              'New split API (https://tensorflow.org/datasets/splits)'),
          description='Plain text import of the Definite Pronoun Resolution Dataset.',  # pylint: disable=line-too-long
      )
  ]

  def _info(self):
    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features({
            'sentence':
                nlp.Value('string'),
            'pronoun':
                nlp.Value('string'),
            'candidates':
                nlp.features.Sequence(nlp.Value('string'), length=2),
            'label':
                nlp.features.ClassLabel(num_classes=2),
        }),
        supervised_keys=('sentence', 'label'),
        homepage='http://www.hlt.utdallas.edu/~vince/data/emnlp12/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    files = dl_manager.download_and_extract({
        'train': _DATA_URL_PATTERN.format('train'),
        'test': _DATA_URL_PATTERN.format('test'),
    })
    return [
        nlp.SplitGenerator(
            name=nlp.Split.TEST,
            gen_kwargs={'filepath': files['test']}),
        nlp.SplitGenerator(
            name=nlp.Split.TRAIN,
            gen_kwargs={'filepath': files['train']}),
    ]

  def _generate_examples(self, filepath):
    with open(filepath) as f:
      line_num = -1
      while True:
        line_num += 1
        sentence = f.readline().strip()
        pronoun = f.readline().strip()
        candidates = [c.strip() for c in f.readline().strip().split(',')]
        correct = f.readline().strip()
        f.readline()
        if not sentence:
          break
        yield line_num, {
            'sentence': sentence,
            'pronoun': pronoun,
            'candidates': candidates,
            'label': candidates.index(correct),
        }
Example 12
class Esnli(nlp.GeneratorBasedBuilder):
    """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""

    # Version History
    # 0.0.2 Added explanation_2, explanation_3 fields which exist in the dev/test
    # splits only.
    # 0.0.1 Initial version
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name='plain_text',
            version=nlp.Version('0.0.2'),
            description='Plain text import of e-SNLI',
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features({
                'premise':
                nlp.Value('string'),
                'hypothesis':
                nlp.Value('string'),
                'label':
                nlp.features.ClassLabel(
                    names=['entailment', 'neutral', 'contradiction']),
                'explanation_1':
                nlp.Value('string'),
                'explanation_2':
                nlp.Value('string'),
                'explanation_3':
                nlp.Value('string'),
            }),
            supervised_keys=None,
            homepage='https://github.com/OanaMariaCamburu/e-SNLI',
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        files = dl_manager.download_and_extract({
            'train': [
                os.path.join(_URL, 'esnli_train_1.csv'),
                os.path.join(_URL, 'esnli_train_2.csv')
            ],
            'validation': [os.path.join(_URL, 'esnli_dev.csv')],
            'test': [os.path.join(_URL, 'esnli_test.csv')]
        })

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={'files': files['train']},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={'files': files['validation']},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={'files': files['test']},
            ),
        ]

    def _generate_examples(self, files):
        """Yields examples."""
        for filepath in files:
            with open(filepath) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    yield row['pairID'], {
                        'premise': row['Sentence1'],
                        'hypothesis': row['Sentence2'],
                        'label': row['gold_label'],
                        'explanation_1': row['Explanation_1'],
                        'explanation_2': row.get('Explanation_2', ''),
                        'explanation_3': row.get('Explanation_3', ''),
                    }
Example 13
File: mlsum.py Project: vinayya/nlp
class Mlsum(nlp.GeneratorBasedBuilder):

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name=lang,
            version=nlp.Version("1.0.0"),
            description="",
        )
        for lang in _LANG
    ]

    def _info(self):
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # nlp.features.FeatureConnectors

            features=nlp.Features(
                {
                    "text": nlp.Value("string"),
                    "summary": nlp.Value("string"),
                    "topic": nlp.Value("string"),
                    "url": nlp.Value("string"),
                    "title": nlp.Value("string"),
                    "date":nlp.Value("string")
                    # These are the features of your dataset like images, labels ...
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # dl_manager is a nlp.download.DownloadManager that can be used to
        # download and extract URLs
        
        lang = str(self.config.name)
        urls_to_download = {
            "test": os.path.join(_URL, lang+"_test.zip"),
            "train": os.path.join(_URL, lang+"_train.zip"),
            "validation": os.path.join(_URL, lang+"_val.zip")
        }
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                    # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(downloaded_files["train"], lang+'_train.jsonl'),
                    "lang": lang,
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                    # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(downloaded_files["validation"], lang+'_val.jsonl'),
                    "lang": lang,
                },
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                    # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(downloaded_files["test"], lang+'_test.jsonl'),
                    "lang": lang,
                },
            )
        ]

    def _generate_examples(self, filepath, lang):
        """Yields examples."""
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                data = json.loads(line)
                yield idx, {
                    "text": data["text"],
                    "summary": data["summary"],
                    "topic": data["topic"],
                    "url": data["url"],
                    "title": data["title"],
                    "date": data["date"],
                }
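Because Mlsum defines one BuilderConfig per language, a config name has to be chosen at load time. A minimal sketch, assuming "de" is one of the codes in _LANG and the script is registered as "mlsum":

import nlp

# Hypothetical config name; any entry of _LANG would work the same way.
mlsum_de = nlp.load_dataset("mlsum", "de")
print(mlsum_de["train"][0]["title"])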