Example #1
class HAREM(datasets.GeneratorBasedBuilder):
    """HAREM dataset."""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description=
            "All the tags (PESSOA, ORGANIZACAO, LOCAL, TEMPO, VALOR, ABSTRACCAO, ACONTECIMENTO, COISA, OBRA, OUTRO) will be used",
        ),
        datasets.BuilderConfig(
            name="selective",
            version=VERSION,
            description=
            "Only a subset of the tags (PESSOA, ORGANIZACAO, LOCAL, TEMPO, VALOR) will be used",
        ),
    ]

    DEFAULT_CONFIG_NAME = "default"

    def _info(self):

        tags = [
            "O",
            "B-PESSOA",
            "I-PESSOA",
            "B-ORGANIZACAO",
            "I-ORGANIZACAO",
            "B-LOCAL",
            "I-LOCAL",
            "B-TEMPO",
            "I-TEMPO",
            "B-VALOR",
            "I-VALOR",
        ]

        if self.config.name == "default":
            tags += [
                "B-ABSTRACCAO",
                "I-ABSTRACCAO",
                "B-ACONTECIMENTO",
                "I-ACONTECIMENTO",
                "B-COISA",
                "I-COISA",
                "B-OBRA",
                "I-OBRA",
                "B-OUTRO",
                "I-OUTRO",
            ]

        features = datasets.Features({
            "id":
            datasets.Value("string"),
            "tokens":
            datasets.Sequence(datasets.Value("string")),
            "ner_tags":
            datasets.Sequence(datasets.features.ClassLabel(names=tags)),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": data_dir["test"],
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": data_dir["dev"],
                    "split": "dev"
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples. """

        logger.info("⏳ Generating examples from = %s", filepath)

        with open(filepath, "r", encoding="utf-8") as f:

            input_data = json.load(f)
            id_ = 0

            for document in input_data:
                doc_text = document["doc_text"]
                doc_id = document["doc_id"]

                doc_tokens, char_to_word_offset = tokenize(doc_text)
                tags = ["O"] * len(doc_tokens)

                def set_label(index, tag):
                    if tags[index] != "O":
                        logger.warning(
                            "Overwriting tag %s at position %s to %s",
                            tags[index],
                            index,
                            tag,
                        )
                    tags[index] = tag

                for entity in document["entities"]:
                    entity_text = entity["text"]
                    entity_type = entity["label"]
                    start_token = None
                    end_token = None

                    entity_start_offset = entity["start_offset"]
                    entity_end_offset = entity["end_offset"]
                    start_token = char_to_word_offset[entity_start_offset]

                    # end_offset is NOT inclusive to the text, e.g.,
                    # entity_text == doc_text[start_offset:end_offset]
                    end_token = char_to_word_offset[entity_end_offset - 1]

                    assert start_token <= end_token, "End token cannot come before start token."
                    reconstructed_text = reconstruct_text_from_tokens(
                        doc_tokens[start_token:(end_token + 1)])
                    assert (
                        entity_text.strip() == reconstructed_text
                    ), "Entity text and reconstructed text are not equal: %s != %s" % (
                        entity_text,
                        reconstructed_text,
                    )

                    for token_index in range(start_token, end_token + 1):
                        if token_index == start_token:
                            tag = "B-" + entity_type
                        else:
                            tag = "I-" + entity_type
                        set_label(token_index, tag)

                yield id_, {
                    "id": doc_id,
                    "tokens": [x.text for x in doc_tokens],
                    "ner_tags": tags,
                }
                id_ += 1
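
A minimal usage sketch (not part of the original script; it assumes the file is saved locally as harem.py, while the published Hub dataset id may differ). The config name picks one of the BUILDER_CONFIGS above, and the ClassLabel feature maps the integer ner_tags back to tag strings:

from datasets import load_dataset

ds = load_dataset("harem.py", "selective", split="train")
tag_names = ds.features["ner_tags"].feature.names  # class id -> tag string
example = ds[0]
print(example["tokens"][:10])
print([tag_names[t] for t in example["ner_tags"][:10]])
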
Example #2
class HopeEdi(datasets.GeneratorBasedBuilder):
    """HopeEDI dataset."""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="english",
            version=VERSION,
            description="This part of my dataset covers English dataset"),
        datasets.BuilderConfig(
            name="tamil",
            version=VERSION,
            description="This part of my dataset covers Tamil dataset"),
        datasets.BuilderConfig(
            name="malayalam",
            version=VERSION,
            description="This part of my dataset covers Tamil dataset"),
    ]

    def _info(self):

        if self.config.name == "english":  # This is the name of the configuration selected in BUILDER_CONFIGS above
            features = datasets.Features({
                "text":
                datasets.Value("string"),
                "label":
                datasets.features.ClassLabel(
                    names=["Hope_speech", "Non_hope_speech", "not-English"]),
            })
        elif self.config.name == "tamil":
            features = datasets.Features({
                "text":
                datasets.Value("string"),
                "label":
                datasets.features.ClassLabel(
                    names=["Hope_speech", "Non_hope_speech", "not-Tamil"]),
            })

        # else self.config.name == "malayalam":
        else:
            features = datasets.Features({
                "text":
                datasets.Value("string"),
                "label":
                datasets.features.ClassLabel(
                    names=["Hope_speech", "Non_hope_speech", "not-malayalam"]),
            })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        my_urls = _URLs[self.config.name]

        train_path = dl_manager.download_and_extract(
            my_urls["TRAIN_DOWNLOAD_URL"])
        validation_path = dl_manager.download_and_extract(
            my_urls["VALIDATION_DOWNLOAD_URL"])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": train_path,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": validation_path,
                    "split": "validation",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Generate HopeEDI examples."""

        with open(filepath, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file,
                                    quotechar='"',
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE,
                                    skipinitialspace=False)

            for id_, row in enumerate(csv_reader):
                text, label, dummy = row
                yield id_, {"text": text, "label": label}
Example #3
class Piqa(datasets.GeneratorBasedBuilder):
    """PIQA dataset."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="plain_text",
            description="Plain text",
            version=VERSION,
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "goal":
                datasets.Value("string"),
                "sol1":
                datasets.Value("string"),
                "sol2":
                datasets.Value("string"),
                "label":
                datasets.ClassLabel(names=["0", "1"]),
            }),
            supervised_keys=None,
            homepage="https://yonatanbisk.com/piqa/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "input_filepath":
                    os.path.join(data_dir["train-dev"],
                                 "physicaliqa-train-dev", "train.jsonl"),
                    "label_filepath":
                    os.path.join(data_dir["train-dev"],
                                 "physicaliqa-train-dev", "train-labels.lst"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "input_filepath": data_dir["test"],
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "input_filepath":
                    os.path.join(data_dir["train-dev"],
                                 "physicaliqa-train-dev", "dev.jsonl"),
                    "label_filepath":
                    os.path.join(data_dir["train-dev"],
                                 "physicaliqa-train-dev", "dev-labels.lst"),
                },
            ),
        ]

    def _generate_examples(self, input_filepath, label_filepath=None):
        """ Yields examples. """
        with open(input_filepath, encoding="utf-8") as input_file:
            inputs = input_file.read().splitlines()

            if label_filepath is not None:
                with open(label_filepath, encoding="utf-8") as label_file:
                    labels = label_file.read().splitlines()
            else:
                # Labels are not available for the test set.
                # Filling the `label` column with -1 by default
                labels = [-1] * len(inputs)

            for idx, (row, lab) in enumerate(zip(inputs, labels)):
                data = json.loads(row)
                goal = data["goal"]
                sol1 = data["sol1"]
                sol2 = data["sol2"]
                yield idx, {
                    "goal": goal,
                    "sol1": sol1,
                    "sol2": sol2,
                    "label": lab
                }
Example #4
class Cifar10(datasets.GeneratorBasedBuilder):
    """CIFAR-10 Data Set"""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="plain_text",
            version=datasets.Version("1.0.0", ""),
            description="Plain text import of CIFAR-10 Data Set",
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "img":
                datasets.Array3D(shape=(32, 32, 3), dtype="uint8"),
                "label":
                datasets.features.ClassLabel(names=[
                    "airplane",
                    "automobile",
                    "bird",
                    "cat",
                    "deer",
                    "dog",
                    "frog",
                    "horse",
                    "ship",
                    "truck",
                ]),
            }),
            supervised_keys=("img", "label"),
            homepage="https://www.cs.toronto.edu/~kriz/cifar.html",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        final_dir = os.path.join(dl_dir, "cifar-10-batches-py")

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN,
                                    gen_kwargs={
                                        "filepath": final_dir,
                                        "split": "train"
                                    }),
            datasets.SplitGenerator(name=datasets.Split.TEST,
                                    gen_kwargs={
                                        "filepath": final_dir,
                                        "split": "test"
                                    }),
        ]

    def _generate_examples(self, filepath, split):
        """This function returns the examples in the raw (text) form."""

        if split == "train":
            batches = [
                "data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4",
                "data_batch_5"
            ]

        if split == "test":
            batches = ["test_batch"]

        for batch in batches:

            file = os.path.join(filepath, batch)

            with open(file, "rb") as fo:

                batch_data = pickle.load(fo, encoding="bytes")

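                # b"data" holds an (N, 3072) uint8 matrix in channel-first order
                # (1024 red, then 1024 green, then 1024 blue values per image),
                # which is why each row below is reshaped to (3, 32, 32) and transposed.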
                labels = batch_data[b"labels"]
                images = batch_data[b"data"]

                for idx, _ in enumerate(images):

                    img_reshaped = np.transpose(
                        np.reshape(images[idx], (3, 32, 32)), (1, 2, 0))

                    # prefix keys with the batch name so they stay unique across the training batches
                    yield f"{batch}_{idx}", {
                        "img": img_reshaped,
                        "label": labels[idx],
                    }
Example #5
class Tlc(datasets.GeneratorBasedBuilder):
    """Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="tlcv1.0", version=datasets.Version("1.0.0"), description="Thai Literature Corpora"
        ),
        datasets.BuilderConfig(
            name="tlcv2.0", version=datasets.Version("2.0.0"), description="Thai Literature Corpora"
        ),
        datasets.BuilderConfig(
            name="tnhcv1.0",
            version=datasets.Version("1.0.0"),
            description="Thai Literature Corpora: Thai National Historical Corpus",
        ),
    ]

    DEFAULT_CONFIG_NAME = "tlcv2.0"

    def _info(self):
        if self.config.name.startswith("tlc"):
            features = datasets.Features(
                {
                    "ch_num": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "text": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
                }
            )
        else:
            features = datasets.Features(
                {
                    "text": datasets.Sequence((datasets.Value("string"))),
                }
            )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_path = dl_manager.download_and_extract(_URLs[self.config.name])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"directory": data_path},
            )
        ]

    def _generate_examples(self, directory):
        if self.config.name.startswith("tlc"):
            files = [os.path.join(directory, "นิราศอิเหนา.json")]
        else:
            files = [os.path.join(directory, "กาพย์เห่เรือ.json")]

        _id = 0
        for txt_file in files:
            with open(txt_file, encoding="utf-8") as f:
                data = json.load(f)
            for d in data:
                if self.config.name.startswith("tlc"):
                    yield _id, d
                else:
                    yield _id, {"text": d}
                _id += 1
Example #6
class QaSrl(datasets.GeneratorBasedBuilder):
    """QA-SRL: Question-Answer Driven Semantic Role Labeling (qa_srl) corpus"""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="plain_text",
            version=VERSION,
            description="This provides WIKIPEDIA dataset for qa_srl corpus"),
    ]

    DEFAULT_CONFIG_NAME = (
        "plain_text"  # It's not mandatory to have a default configuration. Just use one if it make sense.
    )

    def _info(self):
        features = datasets.Features({
            "sentence":
            datasets.Value("string"),
            "sent_id":
            datasets.Value("string"),
            "predicate_idx":
            datasets.Value("int32"),
            "predicate":
            datasets.Value("string"),
            "question":
            datasets.Sequence(datasets.Value("string")),
            "answers":
            datasets.Sequence(datasets.Value("string")),
        })
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        train_fpath = dl_manager.download(_URLs["wiki_train"])
        dev_fpath = dl_manager.download(_URLs["wiki_dev"])
        test_fpath = dl_manager.download(_URLs["wiki_test"])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": train_fpath,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": dev_fpath,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": test_fpath,
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""

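        # File layout, inferred from the parsing below (not an official spec):
        #   <sent_id> TAB <number of predicates>
        #   <sentence>
        #   per predicate: <predicate_idx> TAB <predicate> TAB <number of QA pairs>
        #   per QA pair: 8 tab-separated question slots followed by the answer field(s),
        #     where several answers may share one field, separated by "###"
        #   a blank line closes the block before the next <sent_id> line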
        with open(filepath, encoding="utf-8") as f:

            qa_counter = 0
            # Start reading entries
            sent_id, predicates_cnt = f.readline().rstrip("\n").split("\t")
            while True:

                sentence = f.readline().rstrip("\n")

                # Loop for every predicate
                predicates_counter = int(predicates_cnt)
                while predicates_counter != 0:
                    predicates_counter -= 1
                    predicate_details = f.readline().rstrip("\n").split("\t")
                    predicate_idx, predicate, qa_pairs_cnt = (
                        predicate_details[0],
                        predicate_details[1],
                        predicate_details[2],
                    )
                    pairs = int(qa_pairs_cnt)

                    while pairs != 0:
                        pairs -= 1
                        line = f.readline().rstrip("\n").split("\t")
                        question = line[:8]
                        answers_list = line[8:]
                        qa_counter += 1

                        if "###" in answers_list[0]:
                            answers = [
                                answer.strip()
                                for answer in answers_list[0].split("###")
                            ]
                        else:
                            answers = answers_list

                        yield qa_counter, {
                            "sentence": sentence,
                            "sent_id": sent_id,
                            "predicate_idx": predicate_idx,
                            "predicate": predicate,
                            "question": question,
                            "answers": answers,
                        }

                # Pass the blank line
                f.readline()
                nextline = f.readline()
                if not nextline:

                    break
                else:
                    sent_id, predicates_cnt = nextline.rstrip("\n").split("\t")
Example #7
class Sharc(datasets.GeneratorBasedBuilder):
    """ShARC: A Conversational Question Answering dataset focussing on question answering from texts containing rules."""

    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="sharc",
                               version=datasets.Version("1.0.0")),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "id":
                datasets.Value("string"),
                "utterance_id":
                datasets.Value("string"),
                "source_url":
                datasets.Value("string"),
                "snippet":
                datasets.Value("string"),
                "question":
                datasets.Value("string"),
                "scenario":
                datasets.Value("string"),
                "history": [{
                    "follow_up_question": datasets.Value("string"),
                    "follow_up_answer": datasets.Value("string")
                }],
                "evidence": [{
                    "follow_up_question": datasets.Value("string"),
                    "follow_up_answer": datasets.Value("string")
                }],
                "answer":
                datasets.Value("string"),
                "negative_question":
                datasets.Value("bool_"),
                "negative_scenario":
                datasets.Value("bool_"),
            }),
            supervised_keys=None,
            homepage="https://sharc-data.github.io/index.html",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        extracted_path = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(extracted_path,
                                             "sharc1-official"),
                    "split": "train"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": os.path.join(extracted_path,
                                             "sharc1-official"),
                    "split": "dev"
                },
            ),
        ]

    def _generate_examples(self, data_dir, split):
        with open(
                os.path.join(data_dir, "negative_sample_utterance_ids",
                             "sharc_negative_scenario_utterance_ids.txt"),
                encoding="utf-8",
        ) as f:
            negative_scenario_ids = f.readlines()
            negative_scenario_ids = [
                id_.strip() for id_ in negative_scenario_ids
            ]
        with open(
                os.path.join(data_dir, "negative_sample_utterance_ids",
                             "sharc_negative_question_utterance_ids.txt"),
                encoding="utf-8",
        ) as f:
            negative_question_ids = f.readlines()
            negative_question_ids = [
                id_.strip() for id_ in negative_question_ids
            ]

        data_file = os.path.join(data_dir, "json", f"sharc_{split}.json")
        with open(data_file, encoding="utf-8") as f:
            examples = json.load(f)
            for i, example in enumerate(examples):
                example.pop("tree_id")

                example["negative_question"] = example[
                    "utterance_id"] in negative_question_ids
                example["negative_scenario"] = example[
                    "utterance_id"] in negative_scenario_ids

                example["id"] = example["utterance_id"]

                # the keys are misspelled for one of the examples in the dev set,
                # so fix them here
                for evidence in example["evidence"]:
                    if evidence.get("followup_answer") is not None:
                        evidence["follow_up_answer"] = evidence.pop(
                            "followup_answer")
                    if evidence.get("followup_question") is not None:
                        evidence["follow_up_question"] = evidence.pop(
                            "followup_question")
                yield example["id"], example
Example #8
class FashionMnist(datasets.GeneratorBasedBuilder):
    """FashionMNIST Data Set"""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="fashion_mnist",
            version=datasets.Version("1.0.0"),
            description=_DESCRIPTION,
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "image":
                datasets.Array2D(shape=(28, 28), dtype="uint8"),
                "label":
                datasets.features.ClassLabel(names=[
                    "T - shirt / top",
                    "Trouser",
                    "Pullover",
                    "Dress",
                    "Coat",
                    "Sandal",
                    "Shirt",
                    "Sneaker",
                    "Bag",
                    "Ankle boot",
                ]),
            }),
            supervised_keys=("image", "label"),
            homepage="https://github.com/zalandoresearch/fashion-mnist",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        urls_to_download = {key: _URL + fname for key, fname in _URLS.items()}
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": [
                        downloaded_files["train_images"],
                        downloaded_files["train_labels"]
                    ],
                    "split":
                    "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": [
                        downloaded_files["test_images"],
                        downloaded_files["test_labels"]
                    ],
                    "split":
                    "test",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """This function returns the examples in the raw form."""
        # Images
        with open(filepath[0], "rb") as f:
            # IDX image header: 4-byte magic number, then image count, row count and
            # column count (big-endian uint32); rows and cols are fixed at 28 here
            _ = f.read(4)
            size = struct.unpack(">I", f.read(4))[0]
            _ = f.read(8)
            images = np.frombuffer(f.read(),
                                   dtype=np.uint8).reshape(size, 28, 28)

        # Labels
        with open(filepath[1], "rb") as f:
            # IDX label header: 4-byte magic number followed by the label count
            _ = f.read(8)
            labels = np.frombuffer(f.read(), dtype=np.uint8)

        for idx in range(size):
            yield idx, {"image": images[idx], "label": int(labels[idx])}
Example #9
class Atomic(datasets.GeneratorBasedBuilder):
    """Atomic Common Sense Dataset"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="atomic",
                               version=VERSION,
                               description="The Atomic dataset"),
    ]

    def _info(self):
        features = datasets.Features({
            "event":
            datasets.Value("string"),
            "oEffect":
            datasets.Sequence(datasets.Value("string")),
            "oReact":
            datasets.Sequence(datasets.Value("string")),
            "oWant":
            datasets.Sequence(datasets.Value("string")),
            "xAttr":
            datasets.Sequence(datasets.Value("string")),
            "xEffect":
            datasets.Sequence(datasets.Value("string")),
            "xIntent":
            datasets.Sequence(datasets.Value("string")),
            "xNeed":
            datasets.Sequence(datasets.Value("string")),
            "xReact":
            datasets.Sequence(datasets.Value("string")),
            "xWant":
            datasets.Sequence(datasets.Value("string")),
            "prefix":
            datasets.Sequence(datasets.Value("string")),
            "split":
            datasets.Value("string"),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "v4_atomic_trn.csv"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "v4_atomic_tst.csv"),
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "v4_atomic_dev.csv"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples from the Atomic dataset. """

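        # Each data row packs the event text plus JSON-style list fields
        # (oEffect, oReact, ..., prefix, split) into quoted CSV columns. The block
        # below rewrites the raw line into a single JSON array and parses that,
        # instead of going through the csv module.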
        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                if row.startswith("event"):
                    continue
                row = row.replace('"[', "[").replace(']"', "]")
                sent, rest = row.split("[", 1)
                sent = sent.strip(', "')
                rest = "[" + rest
                rest = rest.replace('""', '"').replace('\\\\"]', '"]')
                rest = rest.split(",")
                rest[-1] = '"' + rest[-1].strip() + '"'
                rest = ",".join(rest)
                row = '["' + sent + '",' + rest + "]"
                row = json.loads(row)
                yield id_, {
                    "event": str(row[0]),
                    "oEffect": row[1],
                    "oReact": row[2],
                    "oWant": row[3],
                    "xAttr": row[4],
                    "xEffect": row[5],
                    "xIntent": row[6],
                    "xNeed": row[7],
                    "xReact": row[8],
                    "xWant": row[9],
                    "prefix": row[10],
                    "split": str(row[11]),
                }
Example #10
class DutchSocial(datasets.GeneratorBasedBuilder):
    """
    Annotated Covid19 tweets in Dutch language. The tweets were filtered for users who had indicated
    their location within Netherlands or if the tweets were in Dutch language. The purpose of curating
    these tweets is to measure the economic impact of the Covid19 pandemic
    """

    VERSION = datasets.Version("1.1.0")

    # This is an example of a dataset with multiple configurations.
    # If you don't want/need to define several sub-sets in your dataset,
    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.

    # If you need to make complex sub-parts in the datasets with configurable options
    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
    # BUILDER_CONFIG_CLASS = MyBuilderConfig

    # You will be able to load one or the other configurations in the following list with
    # data = datasets.load_dataset('my_dataset', 'first_domain')
    # data = datasets.load_dataset('my_dataset', 'second_domain')
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="dutch_social",
            version=VERSION,
            description=
            "This configuration covers the entire dataset")
        # datasets.BuilderConfig(name="second_domain", version=VERSION, description="This part of my dataset covers a second domain"),
    ]

    def _info(self):
        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
        features = datasets.Features({
            "full_text":
            datasets.Value("string"),
            "text_translation":
            datasets.Value("string"),
            "screen_name":
            datasets.Value("string"),
            "description":
            datasets.Value("string"),
            "desc_translation":
            datasets.Value("string"),
            "location":
            datasets.Value("string"),
            "weekofyear":
            datasets.Value("int64"),
            "weekday":
            datasets.Value("int64"),
            "month":
            datasets.Value("int64"),
            "year":
            datasets.Value("int64"),
            "day":
            datasets.Value("int64"),
            "point_info":
            datasets.Value("string"),
            "point":
            datasets.Value("string"),
            "latitude":
            datasets.Value("float64"),
            "longitude":
            datasets.Value("float64"),
            "altitude":
            datasets.Value("float64"),
            "province":
            datasets.Value("string"),
            "hisco_standard":
            datasets.Value("string"),
            "hisco_code":
            datasets.Value("string"),
            "industry":
            datasets.Value("bool_"),
            "sentiment_pattern":
            datasets.Value("float64"),
            "subjective_pattern":
            datasets.Value("float64"),
            "label":
            datasets.ClassLabel(num_classes=3,
                                names=["neg", "neu", "pos"],
                                names_file=None,
                                id=None),
        })
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split, key=None):
        """ Yields examples. """
        # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
        # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
        # The key is not important, it's more here for legacy reason (legacy from tfds)

        with open(filepath, encoding="utf-8") as f:
            tweets = json.load(f)
            for id_, data in enumerate(tweets):
                yield id_, {
                    "full_text":
                    " " if not isinstance(data["full_text"], str) else
                    data["full_text"],
                    "text_translation":
                    " " if not isinstance(data["text_translation"], str) else
                    data["text_translation"],
                    "screen_name":
                    " " if not isinstance(data["screen_name"], str) else
                    data["screen_name"],
                    "description":
                    " " if not isinstance(data["description"], str) else
                    data["description"],
                    "desc_translation":
                    " " if not isinstance(data["desc_translation"], str) else
                    data["desc_translation"],
                    "location":
                    " " if not isinstance(data["location"], str) else
                    data["location"],
                    "weekofyear":
                    -1 if data["weekofyear"] is None else data["weekofyear"],
                    "weekday":
                    -1 if data["weekday"] is None else data["weekday"],
                    "month":
                    -1 if data["month"] is None else data["month"],
                    "year":
                    -1 if data["year"] is None else data["year"],
                    "day":
                    -1 if data["day"] is None else data["day"],
                    "point_info":
                    " " if isinstance(data["point_info"],
                                      str) else data["point_info"],
                    "point":
                    " "
                    if not isinstance(data["point"], str) else data["point"],
                    "latitude":
                    -1 if data["latitude"] is None else data["latitude"],
                    "longitude":
                    -1 if data["longitude"] is None else data["longitude"],
                    "altitude":
                    -1 if data["altitude"] is None else data["altitude"],
                    "province":
                    " " if not isinstance(data["province"], str) else
                    data["province"],
                    "hisco_standard":
                    " " if not isinstance(data["hisco_standard"], str) else
                    data["hisco_standard"],
                    "hisco_code":
                    " " if not isinstance(data["hisco_code"], str) else
                    data["hisco_code"],
                    "industry":
                    False if not isinstance(data["industry"], bool) else
                    data["industry"],
                    "sentiment_pattern":
                    -1 if data["sentiment_pattern"] is None else
                    data["sentiment_pattern"],
                    "subjective_pattern":
                    -1 if data["subjective_pattern"] is None else
                    data["subjective_pattern"],
                    "label":
                    data["label"],
                }
Example #11
class SentiWS(datasets.GeneratorBasedBuilder):
    """SentiWS: German-language resource for sentiment analysis, pos-tagging"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="pos-tagging",
                               version=VERSION,
                               description="This covers pos-tagging task"),
        datasets.BuilderConfig(
            name="sentiment-scoring",
            version=VERSION,
            description=
            "This covers the sentiment-scoring in [-1, 1] corresponding to (negative, positive) sentiment",
        ),
    ]

    DEFAULT_CONFIG_NAME = "pos-tagging"

    def _info(self):

        if (
                self.config.name == "pos-tagging"
        ):  # the pos-tags are ["NN", "VVINF", "ADJX", "ADV"] -> ["noun", "verb", "adjective", "adverb"]
            features = datasets.Features({
                "word":
                datasets.Value("string"),
                "pos-tag":
                datasets.ClassLabel(names=["NN", "VVINF", "ADJX", "ADV"]),
            })
        else:  # This is an example to show how to have different features for "first_domain" and "second_domain"
            features = datasets.Features({
                "word":
                datasets.Value("string"),
                "sentiment-score":
                datasets.Value("float32"),
            })
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
        my_urls = _URLs
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "sourcefiles": [
                        os.path.join(data_dir[0], f) for f in [
                            "SentiWS_v2.0_Positive.txt",
                            "SentiWS_v2.0_Negative.txt"
                        ]
                    ],
                    "split":
                    "train",
                },
            ),
        ]

    def _generate_examples(self, sourcefiles, split):
        """Yields examples."""
        # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
        # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
        # The key is not important, it's more here for legacy reason (legacy from tfds)
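        # Each SentiWS line looks like "<word>|<POS tag>\t<sentiment score>\t...",
        # so splitting on "|" and then on "\t" recovers the word, the tag and the
        # score; any remaining columns are ignored here.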
        for file_idx, filepath in enumerate(sourcefiles):
            with open(filepath, encoding="utf-8") as f:
                for id_, row in enumerate(f):
                    word = row.split("|")[0]
                    if self.config.name == "pos-tagging":
                        tag = row.split("|")[1].split("\t")[0]
                        yield f"{file_idx}_{id_}", {
                            "word": word,
                            "pos-tag": tag
                        }
                    else:
                        sentiscore = row.split("|")[1].split("\t")[1]
                        yield f"{file_idx}_{id_}", {
                            "word": word,
                            "sentiment-score": float(sentiscore)
                        }
Example #12
class SquadKorV2(datasets.GeneratorBasedBuilder):
    """KorQuAD 2.1 dataset"""

    VERSION = datasets.Version("2.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="squad_kor_v2", version=VERSION, description=_DESCRIPTION),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answer": datasets.Features(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                            "html_answer_start": datasets.Value("int32"),
                        }
                    ),
                    "url": datasets.Value("string"),
                    "raw_html": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # download and extract URLs
        urls_to_download = _URLS
        downloaded_files = dl_manager.download_and_extract(urls_to_download)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"dirs": downloaded_files["train"]}),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION, gen_kwargs={"dirs": downloaded_files["validation"]}
            ),
        ]

    def _generate_examples(self, dirs):
        """Yields examples."""

        for d in dirs:
            filepaths = sorted(os.scandir(d), key=lambda x: x.name)
            for filepath in filepaths:
                with open(filepath, encoding="utf-8") as f:
                    squad = json.load(f)
                    for example in squad["data"]:
                        title = example.get("title", "").strip()
                        url = example.get("url", "").strip()
                        raw_html = example.get("raw_html", "").strip()
                        context = example["context"].strip()
                        for qa in example["qas"]:
                            question = qa["question"].strip()
                            answer = qa["answer"]
                            id_ = qa["id"]

                            answer_start = answer["answer_start"]
                            html_answer_start = answer["html_answer_start"]
                            answer_text = answer["text"].strip()

                            yield id_, {
                                "title": title,
                                "context": context,
                                "question": question,
                                "id": id_,
                                "answer": {
                                    "answer_start": answer_start,
                                    "html_answer_start": html_answer_start,
                                    "text": answer_text,
                                },
                                "url": url,
                                "raw_html": raw_html,
                            }
Example #13
class VCTK(datasets.GeneratorBasedBuilder):
    """VCTK dataset."""

    VERSION = datasets.Version("0.9.2")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="main", version=VERSION, description="VCTK dataset"),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "speaker_id": datasets.Value("string"),
                    "audio": datasets.features.Audio(sampling_rate=48_000),
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "text_id": datasets.Value("string"),
                    "age": datasets.Value("string"),
                    "gender": datasets.Value("string"),
                    "accent": datasets.Value("string"),
                    "region": datasets.Value("string"),
                    "comment": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
        )

    def _split_generators(self, dl_manager):
        root_path = dl_manager.download_and_extract(_DL_URL)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"root_path": root_path}),
        ]

    def _generate_examples(self, root_path):
        """Generate examples from the VCTK corpus root path."""

        meta_path = os.path.join(root_path, "speaker-info.txt")
        txt_root = os.path.join(root_path, "txt")
        wav_root = os.path.join(root_path, "wav48_silence_trimmed")
        # NOTE: "comment" is handled separately in logic below
        fields = ["speaker_id", "age", "gender", "accent", "region"]

        key = 0
        with open(meta_path, encoding="utf-8") as meta_file:
            _ = next(meta_file)  # skip the header line
            for line in meta_file:
                data = {}
                line = line.strip()
                search = re.search(r"\(.*\)", line)
                if search is None:
                    data["comment"] = ""
                else:
                    start, _ = search.span()
                    data["comment"] = line[start:]
                    line = line[:start]
                values = line.split()
                for i, field in enumerate(fields):
                    if field == "region":
                        data[field] = " ".join(values[i:])
                    else:
                        data[field] = values[i] if i < len(values) else ""
                speaker_id = data["speaker_id"]
                speaker_txt_path = os.path.join(txt_root, speaker_id)
                speaker_wav_path = os.path.join(wav_root, speaker_id)
                # NOTE: p315 does not have text
                if not os.path.exists(speaker_txt_path):
                    continue
                for txt_file in sorted(os.listdir(speaker_txt_path)):
                    filename, _ = os.path.splitext(txt_file)
                    _, text_id = filename.split("_")
                    for i in [1, 2]:
                        wav_file = os.path.join(speaker_wav_path, f"{filename}_mic{i}.flac")
                        # NOTE: p280 does not have mic2 files
                        if not os.path.exists(wav_file):
                            continue
                        with open(os.path.join(speaker_txt_path, txt_file), encoding="utf-8") as text_file:
                            text = text_file.readline().strip()
                            more_data = {
                                "file": wav_file,
                                "audio": wav_file,
                                "text": text,
                                "text_id": text_id,
                            }
                            yield key, {**data, **more_data}
                        key += 1
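
A short usage sketch (not part of the original script; it assumes the file is saved locally as vctk.py and that an audio backend such as soundfile is installed). Accessing the Audio column decodes the FLAC file into a waveform array together with its sampling rate:

from datasets import load_dataset

ds = load_dataset("vctk.py", "main", split="train")
sample = ds[0]["audio"]          # decoded on access
print(sample["sampling_rate"])   # 48000, per the feature definition above
print(sample["array"].shape)     # one-dimensional float waveform
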
Example #14
class Turk(datasets.GeneratorBasedBuilder):

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="simplification",
            version=VERSION,
            description=
            "A set of original sentences aligned with 8 possible simplifications for each.",
        )
    ]

    def _info(self):
        features = datasets.Features({
            "original":
            datasets.Value("string"),
            "simplifications":
            datasets.Sequence(datasets.Value("string")),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepaths": data_dir,
                    "split": "valid",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepaths": data_dir,
                    "split": "test"
                },
            ),
        ]

    def _generate_examples(self, filepaths, split):
        """Yields examples."""
        if split == "valid":
            split = "tune"
        files = [
            open(filepaths[f"{split}.8turkers.tok.norm"], encoding="utf-8")
        ] + [
            open(filepaths[f"{split}.8turkers.tok.turk.{i}"], encoding="utf-8")
            for i in range(8)
        ]
        for id_, lines in enumerate(zip(*files)):
            yield id_, {
                "original": lines[0].strip(),
                "simplifications": [line.strip() for line in lines[1:]]
            }
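
A hedged usage sketch for the Turk builder above; the dataset name used here is an assumption, and only validation and test splits are defined by _split_generators:

from datasets import load_dataset

# Assumption: the builder above is registered under the name "turk".
ds = load_dataset("turk", "simplification", split="validation")

example = ds[0]
print(example["original"])               # one source sentence
print(len(example["simplifications"]))   # 8 crowd-sourced simplifications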
Example #15
class Scitldr(datasets.GeneratorBasedBuilder):
    """Dataset for TLDR: Extreme Summarization of Scientific Documents."""

    VERSION = datasets.Version("1.1.0")

    # You will be able to load one or the other configurations in the following list with
    # data = datasets.load_dataset('scitldr', 'Abstract')
    # data = datasets.load_dataset('scitldr', 'AIC')
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="Abstract",
            description="This part contains only abstracts of the paper"),
        datasets.BuilderConfig(
            name="AIC",
            description=
            "This part contains Abstracts, Introduction and Conclusion (AIC) sections of the paper",
        ),
        datasets.BuilderConfig(
            name="FullText",
            description="This part contains the full text of the paper"),
    ]

    DEFAULT_CONFIG_NAME = (
        "Abstract"  # It's not mandatory to have a default configuration. Just use one if it make sense.
    )

    def _info(self):
        if self.config.name == "AIC":  # This is the name of the configuration selected in BUILDER_CONFIGS above
            features = datasets.Features({
                "source":
                datasets.Sequence(datasets.Value("string")),
                "source_labels":
                datasets.Sequence(
                    datasets.ClassLabel(num_classes=2, names=[0, 1])),
                "rouge_scores":
                datasets.Sequence(datasets.Value("float32")),
                "paper_id":
                datasets.Value("string"),
                "ic":
                datasets.Value("bool_"),
                "target":
                datasets.features.Sequence(datasets.Value("string"))
                # These are the features of your dataset like images, labels ...
            })
        else:
            features = datasets.Features({
                "source":
                datasets.Sequence(datasets.Value("string")),
                "source_labels":
                datasets.Sequence(
                    datasets.ClassLabel(num_classes=2,
                                        names=["non-oracle", "oracle"])),
                "rouge_scores":
                datasets.Sequence(datasets.Value("float32")),
                "paper_id":
                datasets.Value("string"),
                "target":
                datasets.Sequence(datasets.Value("string"))
                # These are the features of your dataset like images, labels ...
            })
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=(_SOURCE, _TARGET),
            # Homepage of the dataset for documentation
            homepage="https://github.com/allenai/scitldr",
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
        urls = {
            "train": os.path.join(_URLs[self.config.name], _TRAIN_DATA),
            "valid": os.path.join(_URLs[self.config.name], _VALID_DATA),
            "test": os.path.join(_URLs[self.config.name], _TEST_DATA),
        }
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir["train"]),
                    "split": "train"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir["test"]),
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir["valid"]),
                    "split": "dev"
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples. """
        # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
        # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
        # The key is not important, it's more here for legacy reason (legacy from tfds)

        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                data = json.loads(row)
                if self.config.name == "AIC":
                    yield id_, {
                        "source": data["source"],
                        "source_labels": data["source_labels"],
                        "rouge_scores": data["rouge_scores"],
                        "paper_id": data["paper_id"],
                        "ic": True if data["ic"] else False,
                        "target": data["target"],
                    }
                else:
                    yield id_, {
                        "source": data["source"],
                        "source_labels": data["source_labels"],
                        "rouge_scores": data["rouge_scores"],
                        "paper_id": data["paper_id"],
                        "target": data["target"],
                    }
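
Following the loading hints in the comments near the top of the class, a short usage sketch that selects the AIC configuration and reads the fields declared in _info:

import datasets

# As suggested by the comments above: pick one of "Abstract", "AIC" or "FullText".
data = datasets.load_dataset("scitldr", "AIC", split="test")

ex = data[0]
print(ex["paper_id"])
print(" ".join(ex["source"]))   # source is a sequence of sentences
print(ex["target"])             # the TLDR summaries
print(ex["ic"])                 # boolean flag, only present in the AIC configuration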
Example #16
class CataloniaIndependence(datasets.GeneratorBasedBuilder):
    """This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="catalan",
            version=VERSION,
            description=
            "This part of the corpus contains annotated tweets posted in Catalan.",
        ),
        datasets.BuilderConfig(
            name="spanish",
            version=VERSION,
            description=
            "This part of the corpus contains annotated tweets posted in Spanish.",
        ),
    ]

    DEFAULT_CONFIG_NAME = "catalan"

    def _info(self):
        features = datasets.Features({
            "id_str":
            datasets.Value("string"),
            "TWEET":
            datasets.Value("string"),
            "LABEL":
            datasets.ClassLabel(names=["AGAINST", "FAVOR", "NEUTRAL"]),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URLs[self.config.name])
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, f"{self.config.name}_train.csv")
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, f"{self.config.name}_test.csv")
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, f"{self.config.name}_val.csv")
                },
            ),
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter="\t")
            # skip header
            next(csv_reader)
            for _id, row in enumerate(csv_reader):
                yield _id, {"id_str": row[0], "TWEET": row[1], "LABEL": row[2]}
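
A short sketch showing how the integer produced by the LABEL ClassLabel feature above can be mapped back to its string name; the dataset name is an assumption, and int2str is the standard datasets idiom:

from datasets import load_dataset

# Assumption: the builder above is registered as "catalonia_independence".
ds = load_dataset("catalonia_independence", "catalan", split="train")

label_feature = ds.features["LABEL"]
first = ds[0]
print(first["TWEET"])
print(label_feature.int2str(first["LABEL"]))  # "AGAINST", "FAVOR" or "NEUTRAL"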
Example #17
class HyperpartisanNewsDetection(datasets.GeneratorBasedBuilder):
    """Hyperpartisan News Detection Dataset."""

    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="byarticle",
            version=datasets.Version("1.0.0",
                                     "Version Training and validation v1"),
            description=textwrap.dedent("""
                    This part of the data (filename contains "byarticle") is labeled through crowdsourcing on an article basis.
                    The data contains only articles for which a consensus among the crowdsourcing workers existed. It contains
                    a total of 645 articles. Of these, 238 (37%) are hyperpartisan and 407 (63%) are not. We will use a similar
                    (but balanced!) test set. Again, none of the publishers in this set will occur in the test set.
                """),
        ),
        datasets.BuilderConfig(
            name="bypublisher",
            version=datasets.Version("1.0.0",
                                     "Version Training and validation v1"),
            description=textwrap.dedent("""
                    This part of the data (filename contains "bypublisher") is labeled by the overall bias of the publisher as provided
                    by BuzzFeed journalists or MediaBiasFactCheck.com. It contains a total of 750,000 articles, half of which (375,000)
                    are hyperpartisan and half of which are not. Half of the articles that are hyperpartisan (187,500) are on the left side
                    of the political spectrum, half are on the right side. This data is split into a training set (80%, 600,000 articles) and
                    a validation set (20%, 150,000 articles), where no publisher that occurs in the training set also occurs in the validation
                    set. Similarly, none of the publishers in those sets will occur in the test set.
                """),
        ),
    ]

    def _info(self):
        features = {
            "text": datasets.Value("string"),
            "title": datasets.Value("string"),
            "hyperpartisan": datasets.Value("bool"),
            "url": datasets.Value("string"),
            "published_at": datasets.Value("string"),
        }

        if self.config.name == "bypublisher":
            # Bias is only included in the bypublisher config
            features["bias"] = datasets.ClassLabel(names=[
                "right", "right-center", "least", "left-center", "left"
            ])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(features),
            supervised_keys=("text", "label"),
            homepage="https://pan.webis.de/semeval19/semeval19-web/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        urls = {
            datasets.Split.TRAIN: {
                "articles_file":
                _URL_BASE + "articles-training-" + self.config.name +
                "-20181122.zip?download=1",
                "labels_file":
                _URL_BASE + "ground-truth-training-" + self.config.name +
                "-20181122.zip?download=1",
            },
        }
        if self.config.name == "bypublisher":
            urls[datasets.Split.VALIDATION] = {
                "articles_file":
                _URL_BASE + "articles-training-" + self.config.name +
                "-20181122.zip?download=1",
                "labels_file":
                _URL_BASE + "ground-truth-training-" + self.config.name +
                "-20181122.zip?download=1",
            }

        data_dir = {}
        for key in urls:
            data_dir[key] = dl_manager.download_and_extract(urls[key])

        splits = []
        for split in data_dir:
            for key in data_dir[split]:
                data_dir[split][key] = os.path.join(
                    data_dir[split][key],
                    os.listdir(data_dir[split][key])[0])
            splits.append(
                datasets.SplitGenerator(name=split,
                                        gen_kwargs=data_dir[split]))
        return splits

    def _generate_examples(self, articles_file=None, labels_file=None):
        """Yields examples."""
        labels = {}
        with open(labels_file, "rb") as f_labels:
            tree = ET.parse(f_labels)
            root = tree.getroot()
            for label in root:
                article_id = label.attrib["id"]
                del label.attrib["labeled-by"]
                labels[article_id] = label.attrib

        with open(articles_file, "rb") as f_articles:
            tree = ET.parse(f_articles)
            root = tree.getroot()
            for idx, article in enumerate(root):
                example = {}
                example["title"] = article.attrib["title"]
                example["published_at"] = article.attrib.get(
                    "published-at", "")
                example["id"] = article.attrib["id"]
                example = {**example, **labels[example["id"]]}
                example["hyperpartisan"] = example["hyperpartisan"] == "true"

                example["text"] = ""
                for child in article:
                    example["text"] += ET.tostring(child).decode() + "\n"
                example["text"] = example["text"].strip()
                del example["id"]
                yield idx, example
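
A hedged usage sketch for the builder above; the registered name is an assumption, and note that the bias ClassLabel is only added for the bypublisher configuration in _info:

from datasets import load_dataset

# Assumption: the builder above is registered as "hyperpartisan_news_detection".
train = load_dataset("hyperpartisan_news_detection", "byarticle", split="train")

article = train[0]
print(article["title"], article["hyperpartisan"])

# The "bias" feature only exists in the "bypublisher" configuration.
bypub = load_dataset("hyperpartisan_news_detection", "bypublisher", split="train")
print(bypub.features["bias"].names)  # ["right", "right-center", "least", "left-center", "left"]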
Example #18
class Nell(datasets.GeneratorBasedBuilder):
    """NELL dataset for knowledge bases and knowledge graphs and underlying sentences."""

    VERSION = datasets.Version("0.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="nell_belief",
                               description="The beliefs in raw data form",
                               version="1115.0.0"),
        datasets.BuilderConfig(
            name="nell_candidate",
            description="The candidate beliefs in raw data form",
            version="1110.0.0"),
        datasets.BuilderConfig(
            name="nell_belief_sentences",
            description=
            "The underlying sentences available for the nell beliefs",
            version="1115.0.0",
        ),
        datasets.BuilderConfig(
            name="nell_candidate_sentences",
            description=
            "The underlying sentences available for the nell candidate beliefs",
            version="1110.0.0",
        ),
    ]

    DEFAULT_CONFIG_NAME = "nell_belief"

    def _info(self):
        if self.config.name in ("nell_belief", "nell_candidate"):
            features = datasets.Features({
                "entity":
                datasets.Value("string"),
                "relation":
                datasets.Value("string"),
                "value":
                datasets.Value("string"),
                "iteration_of_promotion":
                datasets.Value("string"),
                "score":
                datasets.Value("string"),
                "source":
                datasets.Value("string"),
                "entity_literal_strings":
                datasets.Value("string"),
                "value_literal_strings":
                datasets.Value("string"),
                "best_entity_literal_string":
                datasets.Value("string"),
                "best_value_literal_string":
                datasets.Value("string"),
                "categories_for_entity":
                datasets.Value("string"),
                "categories_for_value":
                datasets.Value("string"),
                "candidate_source":
                datasets.Value("string"),
            })
        else:
            features = datasets.Features({
                "entity":
                datasets.Value("string"),
                "relation":
                datasets.Value("string"),
                "value":
                datasets.Value("string"),
                "score":
                datasets.Value("string"),
                "sentence":
                datasets.Value("string"),
                "count":
                datasets.Value("int32"),
                "url":
                datasets.Value("string"),
                "sentence_type":
                datasets.Value("string"),
            })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage="http://rtw.ml.cmu.edu/rtw/",
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples from the NELL belief knowledge base and candidate bleifs knowledge base if the config is 'nell_belief' and 'nell_candidate', respectively, otherwise yields the sentences for two dataset if the config is 'nell_belief_sentences' and 'nell_candidate_sentences' respectively. """

        with open(filepath, encoding="utf-8") as f:
            id_ = -1
            for row in f:
                row = row.strip().split("\t")
                if "[" in row[3]:
                    row[3] = row[3].strip("[]").split(",")[0]
                if "[" in row[4]:
                    row[4] = row[4].strip("[]").split(",")[0]
                if self.config.name in ("nell_belief", "nell_candidate"):
                    id_ += 1
                    yield id_, {
                        "entity": row[0].strip(),
                        "relation": row[1].strip(),
                        "value": row[2].strip(),
                        "iteration_of_promotion": row[3].strip(),
                        "score": row[4].strip(),
                        "source": row[5].strip(),
                        "entity_literal_strings": row[6].strip(),
                        "value_literal_strings": row[7].strip(),
                        "best_entity_literal_string": row[8].strip(),
                        "best_value_literal_string": row[9].strip(),
                        "categories_for_entity": row[10].strip(),
                        "categories_for_value": row[11].strip(),
                        "candidate_source": row[12].strip(),
                    }
                else:
                    best_arg1 = row[8]
                    best_arg2 = row[9]
                    iter_type = ""
                    for s2 in unquote(row[12]).strip("[]").split("-Iter"):
                        if iter_type in ("CPL", "OE"):
                            arr = unescape(
                                s2.split(">", 1)[-1].strip("-").replace(
                                    "+", " ")).split("\t")
                            la = len(arr)
                            count = 0
                            url = ""
                            for i in range(0, la, 2):
                                sentence = arr[i]
                                if i + 1 == la:
                                    count = 1
                                    url = ""
                                else:
                                    try:
                                        count = int(arr[i + 1].split(",")[0])
                                        url = ""
                                    except ValueError:
                                        count = 1
                                        url = ""
                                        if arr[i + 1].startswith("http"):
                                            url = arr[i + 1].split(",")[0]
                                if iter_type == "CPL":
                                    if "_" in sentence:
                                        sentence = sentence.replace(
                                            "_", "[[ " + best_arg1 + " ]]")
                                    elif "arg1" in sentence:
                                        sentence = sentence.replace(
                                            "arg1",
                                            "[[ " + best_arg1 + " ]]").replace(
                                                "arg2",
                                                "[[ " + best_arg2 + " ]]")
                                    else:
                                        continue
                                if sentence.endswith("CPL"):
                                    sentence = sentence[:-5]
                                if sentence.endswith("OE"):
                                    sentence = sentence[:-4]
                                id_ += 1
                                yield id_, {
                                    "entity":
                                    row[0].replace("candidate:",
                                                   "").replace("concept:",
                                                               "").strip(),
                                    "relation":
                                    row[1].replace("candidate:",
                                                   "").replace("concept:",
                                                               "").strip(),
                                    "value":
                                    row[2].replace("candidate:",
                                                   "").replace("concept:",
                                                               "").strip(),
                                    "score":
                                    row[4].strip(),
                                    "sentence":
                                    sentence.strip(),
                                    "count":
                                    int(count),
                                    "url":
                                    url.strip(),
                                    "sentence_type":
                                    iter_type,
                                }
                        iter_type = s2.split(",")[-1].strip("+")
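
The sentence branch above rewrites "arg1"/"arg2" (or a bare "_") placeholders with the best literal strings wrapped in "[[ ... ]]" markers. A tiny self-contained illustration of that substitution, with invented values:

# Invented values; in the script they come from columns 8 and 9 of a NELL row.
best_arg1 = "Barack Obama"
best_arg2 = "Hawaii"

sentence = "arg1 was born in arg2"
if "_" in sentence:
    sentence = sentence.replace("_", "[[ " + best_arg1 + " ]]")
elif "arg1" in sentence:
    sentence = sentence.replace("arg1", "[[ " + best_arg1 + " ]]").replace(
        "arg2", "[[ " + best_arg2 + " ]]")
print(sentence)  # [[ Barack Obama ]] was born in [[ Hawaii ]]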
Example #19
class StoryCloze(datasets.GeneratorBasedBuilder):
    """Story Cloze."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="2016",
                               description="Story Cloze Test Spring 2016 set"),
        datasets.BuilderConfig(name="2018",
                               description="Story Cloze Test Winter 2018 set"),
    ]

    @property
    def manual_download_instructions(self):
        return (
            "To use Story Cloze you have to download it manually. Please fill this "
            "google form (http://goo.gl/forms/aQz39sdDrO). Complete the form. "
            "Then you will recieve a download link for the dataset. Load it using: "
            "`datasets.load_dataset('story_cloze', data_dir='path/to/folder/folder_name')`"
        )

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "story_id":
                datasets.Value("string"),
                "input_sentence_1":
                datasets.Value("string"),
                "input_sentence_2":
                datasets.Value("string"),
                "input_sentence_3":
                datasets.Value("string"),
                "input_sentence_4":
                datasets.Value("string"),
                "sentence_quiz1":
                datasets.Value("string"),
                "sentence_quiz2":
                datasets.Value("string"),
                "answer_right_ending":
                datasets.Value("int32"),
            }),
            homepage="https://cs.rochester.edu/nlp/rocstories/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        path_to_manual_folder = os.path.abspath(
            os.path.expanduser(dl_manager.manual_dir))

        if self.config.name == "2016":
            test_file = os.path.join(
                path_to_manual_folder,
                "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
            val_file = os.path.join(
                path_to_manual_folder,
                "cloze_test_val__spring2016 - cloze_test_ALL_val.csv")
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "filepath": val_file,
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "filepath": test_file,
                    },
                ),
            ]

        else:
            val_file = os.path.join(
                path_to_manual_folder,
                "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")

            return [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "filepath": val_file,
                    },
                ),
            ]

    def _generate_examples(self, filepath):
        """Generate Story Cloze examples."""
        with open(filepath, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file,
                                    quotechar='"',
                                    delimiter=",",
                                    quoting=csv.QUOTE_ALL,
                                    skipinitialspace=True)
            _ = next(csv_reader)
            for id_, row in enumerate(csv_reader):
                if row and len(row) == 8:
                    yield id_, {
                        "story_id": row[0],
                        "input_sentence_1": row[1],
                        "input_sentence_2": row[2],
                        "input_sentence_3": row[3],
                        "input_sentence_4": row[4],
                        "sentence_quiz1": row[5],
                        "sentence_quiz2": row[6],
                        "answer_right_ending": int(row[7]),
                    }
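
Following the manual_download_instructions above, a usage sketch; the directory is whatever local folder holds the CSV files obtained through the form:

from datasets import load_dataset

# The data must be requested and downloaded manually first (see the instructions above).
ds = load_dataset("story_cloze", "2016", data_dir="path/to/folder/folder_name")

val = ds["validation"][0]
print(val["input_sentence_1"])
print(val["sentence_quiz1"], "|", val["sentence_quiz2"])
print(val["answer_right_ending"])  # 1 or 2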
class MultiWozV22(datasets.GeneratorBasedBuilder):

    VERSION = datasets.Version("2.2.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="v2.2",
                               version=datasets.Version("2.2.0"),
                               description="MultiWOZ v2.2"),
        datasets.BuilderConfig(
            name="v2.2_active_only",
            version=datasets.Version("2.2.0"),
            description=
            "MultiWOZ v2.2, only keeps around frames with an active intent",
        ),
        datasets.BuilderConfig(
            name="v2.2_turns",
            version=datasets.Version("2.2.0"),
            description="MultiWOZ v2.2, turn-based",
        ),
        datasets.BuilderConfig(
            name="v2.2_slots_original",
            version=datasets.Version("2.2.0"),
            description="MultiWOZ v2.2, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.2_slots_desc",
            version=datasets.Version("2.2.0"),
            description="MultiWOZ v2.2, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.2_slots_onto",
            version=datasets.Version("2.2.0"),
            description="MultiWOZ v2.2, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.2_slots_both",
            version=datasets.Version("2.2.0"),
            description="MultiWOZ v2.2, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.1_slots_original",
            version=datasets.Version("2.1.0"),
            description="MultiWOZ v2.1, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.1_slots_desc",
            version=datasets.Version("2.1.0"),
            description="MultiWOZ v2.1, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.1_slots_onto",
            version=datasets.Version("2.1.0"),
            description="MultiWOZ v2.1, slot-based",
        ),
        datasets.BuilderConfig(
            name="v2.1_slots_both",
            version=datasets.Version("2.1.0"),
            description="MultiWOZ v2.1, slot-based",
        ),
    ]

    dialogue_based_config_names = {"v2.2", "v2.2_active_only"}
    turn_based_config_names = {
        "v2.2_turns",
    }
    slot_based_config_names = {
        "v2.2_slots_original",
        "v2.2_slots_desc",
        "v2.2_slots_onto",
        "v2.2_slots_both",
        "v2.1_slots_original",
        "v2.1_slots_desc",
        "v2.1_slots_onto",
        "v2.1_slots_both",
    }

    version21_config_names = {
        "v2.1_slots_original",
        "v2.1_slots_desc",
        "v2.1_slots_onto",
        "v2.1_slots_both",
    }
    version22_config_names = {
        "v2.2",
        "v2.2_active_only"
        "v2.2_turns",
        "v2.2_slots_original",
        "v2.2_slots_desc",
        "v2.2_slots_onto",
        "v2.2_slots_both",
        "v2.1_slots_original",
        "v2.1_slots_desc",
        "v2.1_slots_onto",
        "v2.1_slots_both",
    }

    version21_prefix = {
        "v2.1_slots_original": "v2.1_",
        "v2.1_slots_desc": "v2.1_",
        "v2.1_slots_onto": "v2.1_both_",
        "v2.1_slots_both": "v2.1_both_",
    }

    ORDERED_TRACK_SLOTS = [
        "attraction-area",
        "attraction-name",
        "attraction-type",
        "hotel-area",
        "hotel-bookday",
        "hotel-bookpeople",
        "hotel-bookstay",
        "hotel-internet",
        "hotel-name",
        "hotel-parking",
        "hotel-pricerange",
        "hotel-stars",
        "hotel-type",
        "restaurant-area",
        "restaurant-bookday",
        "restaurant-bookpeople",
        "restaurant-booktime",
        "restaurant-food",
        "restaurant-name",
        "restaurant-pricerange",
        "taxi-arriveby",
        "taxi-departure",
        "taxi-destination",
        "taxi-leaveat",
        "train-arriveby",
        "train-bookpeople",
        "train-day",
        "train-departure",
        "train-destination",
        "train-leaveat",
    ]
    assert len(ORDERED_TRACK_SLOTS) == 30, ORDERED_TRACK_SLOTS

    DEFAULT_CONFIG_NAME = "v2.2_active_only"

    def _info(self):
        if self.config.name in self.turn_based_config_names:
            features = datasets.Features({
                "dialogue_id":
                datasets.Value("string"),
                "services":
                datasets.Sequence(datasets.Value("string")),
                "turn_id":
                datasets.Value("string"),
                "speaker":
                datasets.ClassLabel(names=["USER", "SYSTEM"]),
                "utterance":
                datasets.Value("string"),
                "frames":
                datasets.Sequence({
                    "service":
                    datasets.Value("string"),
                    "state": {
                        "active_intent":
                        datasets.Value("string"),
                        "requested_slots":
                        datasets.Sequence(datasets.Value("string")),
                        "slots_values":
                        datasets.Sequence({
                            "slots_values_name":
                            datasets.Value("string"),
                            "slots_values_list":
                            datasets.Sequence(datasets.Value("string")),
                        }),
                    },
                    "slots":
                    datasets.Sequence({
                        "slot":
                        datasets.Value("string"),
                        "value":
                        datasets.Value("string"),
                        "start":
                        datasets.Value("int32"),
                        "exclusive_end":
                        datasets.Value("int32"),
                        "copy_from":
                        datasets.Value("string"),
                        "copy_from_value":
                        datasets.Sequence(datasets.Value("string")),
                    }),
                }),
                "dialogue_acts":
                datasets.Features({
                    "dialog_act":
                    datasets.Sequence({
                        "act_type":
                        datasets.Value("string"),
                        "act_slots":
                        datasets.Sequence(
                            datasets.Features({
                                "slot_name":
                                datasets.Value("string"),
                                "slot_value":
                                datasets.Value("string"),
                            }), ),
                    }),
                    "span_info":
                    datasets.Sequence({
                        "act_type":
                        datasets.Value("string"),
                        "act_slot_name":
                        datasets.Value("string"),
                        "act_slot_value":
                        datasets.Value("string"),
                        "span_start":
                        datasets.Value("int32"),
                        "span_end":
                        datasets.Value("int32"),
                    }),
                }),
            })
        elif self.config.name in self.slot_based_config_names:
            features = datasets.Features({
                "dialogue_id":
                datasets.Value("string"),
                "services":
                datasets.Sequence(datasets.Value("string")),
                "turn_id":
                datasets.Value("string"),
                "speaker":
                datasets.ClassLabel(names=["USER", "SYSTEM"]),
                "utterance":
                datasets.Value("string"),
                "history":
                datasets.Value("string"),
                "name":
                datasets.Value("string"),
                "description":
                datasets.Value("string"),
                "value":
                datasets.Value("string"),
                "service+description+history":
                datasets.Value("string"),
            })
        elif self.config.name in self.dialogue_based_config_names:
            features = datasets.Features({
                "dialogue_id":
                datasets.Value("string"),
                "services":
                datasets.Sequence(datasets.Value("string")),
                "turns":
                datasets.Sequence({
                    "turn_id":
                    datasets.Value("string"),
                    "speaker":
                    datasets.ClassLabel(names=["USER", "SYSTEM"]),
                    "utterance":
                    datasets.Value("string"),
                    "frames":
                    datasets.Sequence({
                        "service":
                        datasets.Value("string"),
                        "state": {
                            "active_intent":
                            datasets.Value("string"),
                            "requested_slots":
                            datasets.Sequence(datasets.Value("string")),
                            "slots_values":
                            datasets.Sequence({
                                "slots_values_name":
                                datasets.Value("string"),
                                "slots_values_list":
                                datasets.Sequence(datasets.Value("string")),
                            }),
                        },
                        "slots":
                        datasets.Sequence({
                            "slot":
                            datasets.Value("string"),
                            "value":
                            datasets.Value("string"),
                            "start":
                            datasets.Value("int32"),
                            "exclusive_end":
                            datasets.Value("int32"),
                            "copy_from":
                            datasets.Value("string"),
                            "copy_from_value":
                            datasets.Sequence(datasets.Value("string")),
                        }),
                    }),
                    "dialogue_acts":
                    datasets.Features({
                        "dialog_act":
                        datasets.Sequence({
                            "act_type":
                            datasets.Value("string"),
                            "act_slots":
                            datasets.Sequence(
                                datasets.Features({
                                    "slot_name":
                                    datasets.Value("string"),
                                    "slot_value":
                                    datasets.Value("string"),
                                }), ),
                        }),
                        "span_info":
                        datasets.Sequence({
                            "act_type":
                            datasets.Value("string"),
                            "act_slot_name":
                            datasets.Value("string"),
                            "act_slot_value":
                            datasets.Value("string"),
                            "span_start":
                            datasets.Value("int32"),
                            "span_end":
                            datasets.Value("int32"),
                        }),
                    }),
                }),
            })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=
            features,  # Here we define them above because they are different between the two configurations
            supervised_keys=None,
            homepage=
            "https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2",
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_files = dl_manager.download_and_extract(_URLs)
        self.stored_dialogue_acts = json.load(open(
            data_files["dialogue_acts"]))
        self.schema = json.load(open(data_files[self.config.name]))
        self.slot_description = {
            slot["name"]: slot["description"]
            for service in self.schema for slot in service["slots"]
        }
        self.ordered_slots = sorted(self.slot_description.keys())

        return [
            datasets.SplitGenerator(
                name=spl_enum,
                gen_kwargs={
                    "filepaths": data_files,
                    "split": spl,
                },
            ) for spl, spl_enum in [
                # ("train", datasets.Split.TRAIN),
                # ("dev", datasets.Split.VALIDATION),
                ("test", datasets.Split.TEST),
            ]
        ]

    def _generate_examples(self, filepaths, split):
        id_ = -1
        data_file_prefix = (split
                            if self.config.name in self.version22_config_names
                            else self.version21_prefix[self.config.name])
        file_list = [
            fpath for fname, fpath in filepaths.items()
            if fname.startswith(data_file_prefix)
        ]
        for filepath in file_list:
            dialogues = json.load(open(filepath))
            for dialogue in dialogues:
                id_ += 1
                mapped_acts = self.stored_dialogue_acts.get(
                    dialogue["dialogue_id"], {})
                res = {
                    "dialogue_id":
                    dialogue["dialogue_id"],
                    "services":
                    dialogue["services"],
                    "turns": [{
                        "turn_id":
                        turn["turn_id"],
                        "speaker":
                        turn["speaker"],
                        "utterance":
                        turn["utterance"],
                        "frames": [{
                            "service":
                            frame["service"],
                            "state": {
                                "active_intent":
                                frame["state"]["active_intent"]
                                if "state" in frame else "",
                                "requested_slots":
                                frame["state"]["requested_slots"]
                                if "state" in frame else [],
                                "slots_values": {
                                    "slots_values_name": [
                                        sv_name for sv_name, sv_list in
                                        frame["state"]["slot_values"].items()
                                    ] if "state" in frame else [],
                                    "slots_values_list": [
                                        sv_list for sv_name, sv_list in
                                        frame["state"]["slot_values"].items()
                                    ] if "state" in frame else [],
                                },
                            },
                            "slots": [{
                                "slot":
                                slot["slot"],
                                "value":
                                "" if "copy_from" in slot else slot["value"],
                                "start":
                                slot.get("exclusive_end", -1),
                                "exclusive_end":
                                slot.get("start", -1),
                                "copy_from":
                                slot.get("copy_from", ""),
                                "copy_from_value":
                                slot["value"] if "copy_from" in slot else [],
                            } for slot in frame["slots"]],
                        } for frame in turn["frames"]
                                   if ("active_only" not in self.config.name
                                       or frame.get("state", {}).get(
                                           "active_intent", "NONE") != "NONE")
                                   ],
                        "dialogue_acts": {
                            "dialog_act": [{
                                "act_type": act_type,
                                "act_slots": {
                                    "slot_name": [
                                        sl_name
                                        for sl_name, sl_val in dialog_act
                                    ],
                                    "slot_value":
                                    [sl_val for sl_name, sl_val in dialog_act],
                                },
                            } for act_type, dialog_act in mapped_acts.get(
                                turn["turn_id"], {}).get("dialog_act",
                                                         {}).items()],
                            "span_info": [{
                                "act_type": span_info[0],
                                "act_slot_name": span_info[1],
                                "act_slot_value": span_info[2],
                                "span_start": span_info[3],
                                "span_end": span_info[4],
                            } for span_info in mapped_acts.get(
                                turn["turn_id"], {}).get("span_info", [])],
                        },
                    } for turn in dialogue["turns"]],
                }
                if self.config.name in self.turn_based_config_names:
                    for turn in res["turns"]:
                        turn_res = {**deepcopy(res), **deepcopy(turn)}
                        del turn_res["turns"]
                        yield id_, turn_res
                elif self.config.name in self.slot_based_config_names:
                    history = ""
                    for turn in res["turns"]:
                        history += f"{turn['speaker']}: {turn['utterance']} "
                        if turn["speaker"] == "USER":
                            for name in self.ORDERED_TRACK_SLOTS:
                                service = name.split("-")[0]
                                frame = [
                                    frame for frame in turn["frames"]
                                    if frame["service"] == service
                                ][0]
                                value = (frame["state"]["slots_values"]
                                         ["slots_values_list"]
                                         [frame["state"]["slots_values"]
                                          ["slots_values_name"].index(name)] if
                                         name in frame["state"]["slots_values"]
                                         ["slots_values_name"] else "none")
                                if isinstance(name, list):
                                    name = name[0]
                                if isinstance(value, list):
                                    value = value[0]
                                assert isinstance(name, str), name
                                assert isinstance(value, str), value
                                description = self.slot_description[name]
                                slot_res = {
                                    "name": name,
                                    "value": value,
                                    "history": history.strip(),
                                    "description": description,
                                    "service+description+history":
                                    f"{description} context: {history}",
                                    **deepcopy(res),
                                    **deepcopy(turn),
                                }
                                del slot_res["turns"]
                                del slot_res["frames"]
                                del slot_res["dialogue_acts"]
                                yield id_, slot_res
                elif self.config.name in self.dialogue_based_config_names:
                    yield id_, res
                else:
                    raise ValueError(f"Unknown config: {self.config}")
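
A hedged sketch for loading one of the slot-based configurations defined above. Since this is a customised script rather than the stock Hub dataset, load_dataset is pointed at the file itself (the file name is an assumption), and only the test split is produced by _split_generators as written:

from datasets import load_dataset

# Assumption: the class above is saved locally as "multiwoz_v22_slots.py".
ds = load_dataset("multiwoz_v22_slots.py", "v2.2_slots_desc", split="test")

row = ds[0]
print(row["name"], "->", row["value"])             # one tracked slot and its value
print(row["service+description+history"][:200])   # slot description plus dialogue history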
Example #21
class FewRel(datasets.GeneratorBasedBuilder):
    """The FewRelDataset."""

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description="This covers the entire FewRel dataset."),
    ]

    def _info(self):
        features = datasets.Features({
            "relation":
            datasets.Value("string"),
            "tokens":
            datasets.Sequence(datasets.Value("string")),
            "head": {
                "text":
                datasets.Value("string"),
                "type":
                datasets.Value("string"),
                "indices":
                datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
            },
            "tail": {
                "text":
                datasets.Value("string"),
                "type":
                datasets.Value("string"),
                "indices":
                datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
            },
            "names":
            datasets.Sequence(datasets.Value("string"))
            # These are the features of your dataset like images, labels ...
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=
            features,  # Here we define them above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split(key),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir[key],
                    "pid2name": data_dir["pid2name"],
                    "return_names": key
                    in ["train_wiki", "val_wiki", "val_nyt"],
                },
            ) for key in data_dir.keys() if key != "pid2name"
        ]

    def _generate_examples(self, filepath, pid2name, return_names):
        """ Yields examples. """
        pid2name_dict = {}
        with open(pid2name, encoding="utf-8") as f:
            data = json.load(f)
        for key in list(data.keys()):
            name_1 = data[key][0]
            name_2 = data[key][1]
            pid2name_dict[key] = [name_1, name_2]

        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            # print(data)
        if isinstance(data, dict):
            id = 0
            for key in list(data.keys()):
                for items in data[key]:
                    tokens = items["tokens"]
                    h_0 = items["h"][0]
                    h_1 = items["h"][1]
                    h_2 = items["h"][2]
                    t_0 = items["t"][0]
                    t_1 = items["t"][1]
                    t_2 = items["t"][2]
                    id += 1
                    yield id, {
                        "relation": key,
                        "tokens": tokens,
                        "head": {
                            "text": h_0,
                            "type": h_1,
                            "indices": h_2
                        },
                        "tail": {
                            "text": t_0,
                            "type": t_1,
                            "indices": t_2
                        },
                        "names": pid2name_dict[key] if return_names else [
                            key,
                        ],
                    }
        else:  # For `pubmed_unsupervised.json`
            id = 0
            for items in data:
                tokens = items["tokens"]
                h_0 = items["h"][0]
                h_1 = items["h"][1]
                h_2 = items["h"][2]
                t_0 = items["t"][0]
                t_1 = items["t"][1]
                t_2 = items["t"][2]
                id += 1
                yield id, {
                    "relation": "",
                    "tokens": tokens,
                    "head": {
                        "text": h_0,
                        "type": h_1,
                        "indices": h_2
                    },
                    "tail": {
                        "text": t_0,
                        "type": t_1,
                        "indices": t_2
                    },
                    "names": [
                        "",
                    ],
                }
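
A brief usage sketch; the dataset name is an assumption, and the split names come from the keys of _URLs (e.g. train_wiki, val_wiki, val_nyt) as used in _split_generators above:

from datasets import load_dataset

# Assumption: the builder above is registered as "few_rel".
ds = load_dataset("few_rel", "default")

ex = ds["train_wiki"][0]
print(ex["relation"])                      # Wikidata property id, e.g. "P931"
print(" ".join(ex["tokens"]))
print(ex["head"]["text"], "->", ex["tail"]["text"])
print(ex["names"])                         # human-readable relation names when available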
class ArabicBillionWords(datasets.GeneratorBasedBuilder):
    """Arabic Billion Words Corpus"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="Alittihad",
            version=VERSION,
            description="This part of the dataset covers the Alittihad newspaper"),
        datasets.BuilderConfig(
            name="Almasryalyoum",
            version=VERSION,
            description="This part of the dataset covers the Almasryalyoum newspaper"
        ),
        datasets.BuilderConfig(
            name="Almustaqbal",
            version=VERSION,
            description="This part of the dataset covers the Almustaqbal newspaper"),
        datasets.BuilderConfig(
            name="Alqabas",
            version=VERSION,
            description="This part of the dataset covers the Alqabas newspaper"),
        datasets.BuilderConfig(
            name="Echoroukonline",
            version=VERSION,
            description="This part of the dataset covers the Echoroukonline newspaper"
        ),
        datasets.BuilderConfig(
            name="Ryiadh",
            version=VERSION,
            description="This part of the dataset covers the Ryiadh newspaper"),
        datasets.BuilderConfig(
            name="Sabanews",
            version=VERSION,
            description="This part of the dataset covers the Sabanews newspaper"),
        datasets.BuilderConfig(
            name="SaudiYoum",
            version=VERSION,
            description="This part of the dataset covers the SaudiYoum newspaper"),
        datasets.BuilderConfig(
            name="Techreen",
            version=VERSION,
            description="This part of the dataset covers the Techreen newspaper"),
        datasets.BuilderConfig(
            name="Youm7",
            version=VERSION,
            description="This part of the dataset covers the Youm7 newspaper"),
    ]

    def _info(self):
        features = datasets.Features({
            "url": datasets.Value("string"),
            "head_line": datasets.Value("string"),
            "date": datasets.Value("string"),
            "text": datasets.Value("string"),
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        my_file_name = f"{self.config.name}_utf_8.xml"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, my_file_name),
                },
            ),
        ]

    def _extract_tags(self, sample, tag):

        # check if the tag is misspelled
        for tg in MISS_SPELLED_TAGS[tag]:
            pattern = f"<{tg}>(.*?)</{tg}>"
            out = re.findall(r"" + pattern, sample.group(0),
                             re.MULTILINE | re.DOTALL)
            if len(out) > 0:
                break
        return out[0]

    def _clean_text(self, text):
        return text.replace("?", "")

    def _generate_examples(self, filepath):
        """Yields examples."""
        current_multi_line = ""
        _idx = 0
        data_tag = self.config.name
        with open(filepath, mode="r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i == 0:
                    # Skip the XML declaration on the first line.
                    continue
                current_multi_line += line
                if i % 8 == 0:
                    # Records in the source XML are assumed to span eight lines each.
                    pattern = f"<{data_tag}(.*?)</{data_tag}>"
                    data = re.finditer(r"" + pattern, current_multi_line,
                                       re.MULTILINE | re.DOTALL)
                    text, url, head_line, date = ["", "", "", ""]
                    for record in data:
                        try:
                            text = self._clean_text(
                                self._extract_tags(record, "Text"))
                            url = self._extract_tags(record, "URL")
                            head_line = self._clean_text(
                                self._extract_tags(record, "Headline"))
                            date = self._extract_tags(record, "Dateline")
                        except IndexError:
                            continue
                        yield str(_idx), {
                            "url": url,
                            "head_line": head_line,
                            "date": date,
                            "text": text
                        }
                        _idx += 1
                    current_multi_line = ""
Beispiel #23
0
class OrangeSum(datasets.GeneratorBasedBuilder):
    """OrangeSum: a french abstractive summarization dataset"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="abstract",
                               description="Abstracts used as summaries",
                               version=VERSION),
        datasets.BuilderConfig(name="title",
                               description="Titles used as summaries",
                               version=VERSION),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                _DOCUMENT: datasets.Value("string"),
                _SUMMARY: datasets.Value("string"),
            }),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="https://github.com/Tixierae/OrangeSum/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL_DATA[self.config.name])

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "valid",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples. """
        with open(os.path.join(filepath, self.config.name,
                               "{}.source".format(split)),
                  encoding="utf-8") as f_source, open(
                      os.path.join(filepath, self.config.name,
                                   "{}.target".format(split)),
                      encoding="utf-8") as f_target:
            for idx, (document, summary) in enumerate(zip(f_source, f_target)):
                yield idx, {_DOCUMENT: document, _SUMMARY: summary}
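Assuming the script above is registered under a dataset name such as "orange_sum" (an assumption; a local path to the script works the same way), it can be loaded like any other datasets builder:

import datasets

# The column names come from the module-level _DOCUMENT and _SUMMARY constants,
# which are defined outside the excerpt shown above.
ds = datasets.load_dataset("orange_sum", "abstract")  # or the "title" config
example = ds["train"][0]
print({key: value[:80] for key, value in example.items()})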
class Aquamuse(datasets.GeneratorBasedBuilder):
    """Dataset for Query-based Multi-Document Summarization"""

    VERSION = datasets.Version("2.3.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="abstractive", version=VERSION, description="Abstractive query-based multi-document summarization"
        ),
        datasets.BuilderConfig(
            name="extractive", version=VERSION, description="Extractive query-based multi-document summarization"
        ),
    ]

    # DEFAULT_CONFIG_NAME = "abstractive"  # It's not mandatory to have a default configuration. Just use one if it make sense.

    def _info(self):
        features = datasets.Features(
            {
                "query": datasets.Value("string"),
                "input_urls": datasets.Sequence(datasets.Value("string")),
                "target": datasets.Value("string"),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # Both configs share the same archive; the split directories only
        # differ by the config name ("abstractive" or "extractive").
        data_dir = dl_manager.download_and_extract(zipped_data_url)
        base_path = os.path.join(data_dir, "v2.3", self.config.name)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(base_path, "train/"), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(base_path, "test/"), "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(base_path, "dev/"), "split": "dev"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yields examples."""
        filepath = [join(filepath, f) for f in listdir(filepath) if isfile(join(filepath, f))]
        filepath = sorted(filepath)
        raw_dataset = tf.data.TFRecordDataset(filepath)
        for id_, raw_record in enumerate(raw_dataset):
            example = tf.train.Example()
            example.ParseFromString(raw_record.numpy())
            yield id_, {
                "query": example.features.feature["query"].bytes_list.value[0].decode(),
                "input_urls": example.features.feature["input_urls"].bytes_list.value[0].decode().split("<EOD>"),
                "target": example.features.feature["target"].bytes_list.value[0].decode(),
            }
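A round-trip sketch of the tf.train.Example parsing used in _generate_examples above, on a synthetic record; the field layout and the "<EOD>" separator mirror the code, but the values are made up.

import tensorflow as tf

record = tf.train.Example(features=tf.train.Features(feature={
    "query": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"who wrote hamlet"])),
    "input_urls": tf.train.Feature(bytes_list=tf.train.BytesList(
        value=[b"https://a.example<EOD>https://b.example"])),
    "target": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"William Shakespeare"])),
}))

parsed = tf.train.Example()
parsed.ParseFromString(record.SerializeToString())
# Multiple source URLs are stored in one bytes field, separated by "<EOD>".
urls = parsed.features.feature["input_urls"].bytes_list.value[0].decode().split("<EOD>")
print(urls)  # ['https://a.example', 'https://b.example']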
Beispiel #25
0
class Humicroedit(datasets.GeneratorBasedBuilder):
    """This is humorous headline dataset called Humicroedit introduced in the Task-7 of SemEval 2020."""

    VERSION = datasets.Version("1.1.0")

    # This is an example of a dataset with multiple configurations.
    # If you don't want/need to define several sub-sets in your dataset,
    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.

    # If you need to make complex sub-parts in the datasets with configurable options
    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
    # BUILDER_CONFIG_CLASS = MyBuilderConfig

    # You will be able to load one or the other configurations in the following list with
    # data = datasets.load_dataset('my_dataset', 'first_domain')
    # data = datasets.load_dataset('my_dataset', 'second_domain')
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="subtask-1",
            description="This part of the dataset covers the data for subtask-1"
        ),
        datasets.BuilderConfig(
            name="subtask-2",
            description="This part of the dataset covers the data for subtask-2"
        ),
    ]

    def _info(self):
        if self.config.name == "subtask-1":
            features = datasets.Features({
                "id": datasets.Value("string"),
                "original": datasets.Value("string"),
                "edit": datasets.Value("string"),
                "grades": datasets.Value("string"),
                "meanGrade": datasets.Value("float"),
                # These are the features of your dataset like images, labels ...
            })
        else:
            features = datasets.Features({
                "id":
                datasets.Value("string"),
                "original1":
                datasets.Value("string"),
                "edit1":
                datasets.Value("string"),
                "grades1":
                datasets.Value("string"),
                "meanGrade1":
                datasets.Value("float"),
                "original2":
                datasets.Value("string"),
                "edit2":
                datasets.Value("string"),
                "grades2":
                datasets.Value("string"),
                "meanGrade2":
                datasets.Value("float"),
                "label":
                datasets.ClassLabel(names=["equal", "sentence1", "sentence2"]),
            })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)
        ROOT = "semeval-2020-task-7-dataset"

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name,
                                 "train.csv"),
                    "split":
                    "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name, "test.csv"),
                    "split":
                    "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name, "dev.csv"),
                    "split":
                    "dev",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("funlines"),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name,
                                 "train_funlines.csv"),
                    "split":
                    "funlines",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples. """
        label_names = ["equal", "sentence1", "sentence2"]

        with open(filepath, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file,
                                    quotechar='"',
                                    delimiter=",",
                                    quoting=csv.QUOTE_ALL,
                                    skipinitialspace=True)
            next(csv_reader)

            for id_, row in enumerate(csv_reader):
                if self.config.name == "subtask-1":
                    row_id, original, edit, grades, meanGrade = row
                    yield id_, {
                        "id": row_id,
                        "original": original,
                        "edit": edit,
                        "grades": grades,
                        "meanGrade": meanGrade,
                    }
                else:
                    row_id, original1, edit1, grades1, meanGrade1, original2, edit2, grades2, meanGrade2, label = row
                    yield id_, {
                        "id": row_id,
                        "original1": original1,
                        "edit1": edit1,
                        "grades1": grades1,
                        "meanGrade1": meanGrade1,
                        "original2": original2,
                        "edit2": edit2,
                        "grades2": grades2,
                        "meanGrade2": meanGrade2,
                        "label": label_names[int(label)],
                    }
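A hedged usage sketch: the registered dataset name "humicroedit" is an assumption (a local path to the script works the same way). Note the extra "funlines" split defined in _split_generators above.

import datasets

ds = datasets.load_dataset("humicroedit", "subtask-1")
print(ds)                 # train, test, validation and the extra "funlines" split
print(ds["funlines"][0])  # {'id': ..., 'original': ..., 'edit': ..., 'grades': ..., 'meanGrade': ...}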
Beispiel #26
0
class RepairDataset(datasets.GeneratorBasedBuilder):
    """连贯性测试数据集"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="repair",
                               version=VERSION,
                               description="正常数量数据集"),
    ]

    DEFAULT_CONFIG_NAME = "repair"

    def _info(self):
        # Specify the dataset information contained in the datasets.DatasetInfo object.
        features = datasets.Features({
            "sentence1":
            datasets.Value("string"),
            "sentence2":
            datasets.Value("string"),
            "label":
            datasets.features.ClassLabel(names=LABELS)
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            # Different datasets can have different features, i.e. different columns.
            features=features,
            # If the features share a common (input, target) tuple, specify them here.
            # They'll be used in builder.as_dataset with as_supervised=True.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """下载数据集
        此方法的任务是下载/提取数据并根据配置定义拆分
        根据不同的配置BUILDER_CONFIGS,和数据集的name定义
        """
        # dl_manager是一个datasets.download.DownloadManager,可用于下载和提取URL,
        # 它可以接受任何类型或嵌套的列表/字典,并将返回相同的结构,url也可以替换为局部文件的路径。
        # 默认情况下,将提取压缩包,如果文件是压缩的,并返回提取压缩的缓存文件夹的路径,而不是压缩文件
        if self.config.data_dir:
            data_dir = self.config.data_dir
        else:
            my_urls = _URLs[self.config.name]
            data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.json"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.json"),
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.json"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields 方法返回每个样本. """
        # 被函数_split_generators 调用,参数也是通过 gen_kwargs被传过来
        # 它负责打开给定的文件并从数据集中产生(key, example)元组
        # key是不重要的,只是习惯于这样

        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for id_, row in enumerate(data):
                yield id_, {
                    "sentence1": row[1].strip(),  #句子
                    "sentence2": row[0] + row[2].strip(),  #英语单词\n+错误单词
                    # "sentence2": row[2].strip(),  #英语单词+错误单词
                    "label": row[3].strip(),  #正确单词
                }
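A hedged usage sketch for the builder above: "repair_dataset.py" is a hypothetical local path to this script, and data_dir overrides the download URLs exactly as handled in _split_generators.

import datasets

ds = datasets.load_dataset("repair_dataset.py", "repair",
                           data_dir="/path/to/json/files")  # hypothetical paths
print(ds["train"][0])  # {'sentence1': ..., 'sentence2': ..., 'label': ...}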
Beispiel #27
0
class Polemo2(datasets.GeneratorBasedBuilder):
    """PolEmo2.0"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="in",
            version=VERSION,
            description="The PolEmo2.0 is a set of online reviews from medicine and hotels domains. The task is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation.",
        ),
        datasets.BuilderConfig(
            name="out",
            version=VERSION,
            description="The PolEmo2.0 is a set of online reviews from medicine and hotels domains. The task is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation.",
        ),
    ]

    DEFAULT_CONFIG_NAME = "in"

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "sentence": datasets.Value("string"),
                    "target": datasets.ClassLabel(
                        names=[
                            "__label__meta_amb",
                            "__label__meta_minus_m",
                            "__label__meta_plus_m",
                            "__label__meta_zero",
                        ]
                    ),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.tsv"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(data_dir, "test_features.tsv"), "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.tsv"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """ Yields examples. """
        with open(filepath, encoding="utf-8") as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for id_, row in enumerate(reader):
                yield id_, {
                    "sentence": row["sentence"],
                    "target": -1 if split == "test" else row["target"],
                }
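A small self-contained sketch of the TSV parsing in _generate_examples above; the two rows are invented, not real PolEmo2.0 data.

import csv
import io

tsv = ("sentence\ttarget\n"
       "Great hotel , friendly staff .\t__label__meta_plus_m\n"
       "Awful stay , never again .\t__label__meta_minus_m\n")
reader = csv.DictReader(io.StringIO(tsv), delimiter="\t", quoting=csv.QUOTE_NONE)
for id_, row in enumerate(reader):
    # For the test split the builder emits -1, i.e. "no label" for the ClassLabel.
    print(id_, {"sentence": row["sentence"], "target": row["target"]})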
Beispiel #28
0
class Lama(datasets.GeneratorBasedBuilder):
    """Lama Dataset"""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="trex",
            version=VERSION,
            description="The TRex part of the Lama dataset"),
        datasets.BuilderConfig(
            name="squad",
            version=VERSION,
            description="The Squad part of the Lama dataset"),
        datasets.BuilderConfig(
            name="google_re",
            version=VERSION,
            description="The Google_re part of the Lama dataset"),
        datasets.BuilderConfig(
            name="conceptnet",
            version=VERSION,
            description="The Conceptnet part of the Lama dataset"),
    ]

    DEFAULT_CONFIG_NAME = "trex"

    def _info(self):
        if self.config.name == "trex":
            features = datasets.Features({
                "uuid":
                datasets.Value("string"),
                "obj_uri":
                datasets.Value("string"),
                "obj_label":
                datasets.Value("string"),
                "sub_uri":
                datasets.Value("string"),
                "sub_label":
                datasets.Value("string"),
                "predicate_id":
                datasets.Value("string"),
                "sub_surface":
                datasets.Value("string"),
                "obj_surface":
                datasets.Value("string"),
                "masked_sentence":
                datasets.Value("string"),
                "template":
                datasets.Value("string"),
                "template_negated":
                datasets.Value("string"),
                "label":
                datasets.Value("string"),
                "description":
                datasets.Value("string"),
                "type":
                datasets.Value("string"),
            })
            return datasets.DatasetInfo(
                description=_DESCRIPTION,
                features=features,
                supervised_keys=None,
                homepage=_HOMEPAGE,
                license=_LICENSE,
                citation=_CITATION,
            )
        elif self.config.name == "conceptnet":
            features = datasets.Features({
                "uuid":
                datasets.Value("string"),
                "sub":
                datasets.Value("string"),
                "obj":
                datasets.Value("string"),
                "pred":
                datasets.Value("string"),
                "obj_label":
                datasets.Value("string"),
                "masked_sentence":
                datasets.Value("string"),
                "negated":
                datasets.Value("string"),
            })
            return datasets.DatasetInfo(
                description=_DESCRIPTION,
                features=features,
                supervised_keys=None,
                homepage=_HOMEPAGE,
                license=_LICENSE,
                citation=_CITATION,
            )
        elif self.config.name == "squad":
            features = datasets.Features({
                "id":
                datasets.Value("string"),
                "sub_label":
                datasets.Value("string"),
                "obj_label":
                datasets.Value("string"),
                "negated":
                datasets.Value("string"),
                "masked_sentence":
                datasets.Value("string"),
            })
            return datasets.DatasetInfo(
                description=_DESCRIPTION,
                features=features,
                supervised_keys=None,
                homepage=_HOMEPAGE,
                license=_LICENSE,
                citation=_CITATION,
            )
        elif self.config.name == "google_re":
            features = datasets.Features({
                "pred":
                datasets.Value("string"),
                "sub":
                datasets.Value("string"),
                "obj":
                datasets.Value("string"),
                "evidences":
                datasets.Value("string"),
                "judgments":
                datasets.Value("string"),
                "sub_w":
                datasets.Value("string"),
                "sub_label":
                datasets.Value("string"),
                "sub_aliases":
                datasets.Value("string"),
                "obj_w":
                datasets.Value("string"),
                "obj_label":
                datasets.Value("string"),
                "obj_aliases":
                datasets.Value("string"),
                "uuid":
                datasets.Value("string"),
                "masked_sentence":
                datasets.Value("string"),
                "template":
                datasets.Value("string"),
                "template_negated":
                datasets.Value("string"),
            })
            return datasets.DatasetInfo(
                description=_DESCRIPTION,
                features=features,
                supervised_keys=None,
                homepage=_HOMEPAGE,
                license=_LICENSE,
                citation=_CITATION,
            )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        if self.config.name == "trex":
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "filepath":
                        [os.path.join(data_dir, "relations.jsonl")] +
                        list(glob.glob(os.path.join(data_dir, "TREx", "*"))),
                        "split":
                        "train",
                    },
                ),
            ]
        elif self.config.name == "google_re":
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "filepath": [
                            os.path.join(data_dir, *f.split("/")) for f in [
                                "Google_RE/date_of_birth_test.jsonl",
                                "Google_RE/place_of_birth_test.jsonl",
                                "Google_RE/place_of_death_test.jsonl",
                            ]
                        ],
                        "split":
                        "train",
                    },
                ),
            ]
        elif self.config.name == "conceptnet":
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "filepath":
                        os.path.join(data_dir, "ConceptNet", "test.jsonl"),
                        "split":
                        "train",
                    },
                ),
            ]
        elif self.config.name == "squad":
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "filepath": os.path.join(data_dir, "Squad",
                                                 "test.jsonl"),
                        "split": "train",
                    },
                ),
            ]

    def _generate_examples(self, filepath, split):
        """Yields examples from the LAMA dataset."""
        if self.config.name == "trex":
            paths = filepath
            relations_path = paths[0]
            paths = paths[1:]
            all_rels = {}
            with open(relations_path, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    all_rels[data["relation"]] = data
            id_ = -1
            for filepath in paths:
                with open(filepath, encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        pred = all_rels.get(data["predicate_id"], {})
                        for evidences in data["evidences"]:
                            id_ += 1
                            yield id_, {
                                "uuid":
                                str(data["uuid"]),
                                "obj_uri":
                                str(data["obj_uri"]),
                                "obj_label":
                                str(data["obj_label"]),
                                "sub_uri":
                                str(data["sub_uri"]),
                                "sub_label":
                                str(data["sub_label"]),
                                "predicate_id":
                                str(data["predicate_id"]),
                                "sub_surface":
                                str(evidences["sub_surface"]),
                                "obj_surface":
                                str(evidences["obj_surface"]),
                                "masked_sentence":
                                str(evidences["masked_sentence"]),
                                "template":
                                str(pred.get("template", "")),
                                "template_negated":
                                str(pred.get("template_negated", "")),
                                "label":
                                str(pred.get("label", "")),
                                "description":
                                str(pred.get("description", "")),
                                "type":
                                str(pred.get("type", "")),
                            }
        elif self.config.name == "conceptnet":
            id_ = -1
            with open(filepath, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    if data.get("negated") is not None:
                        for masked_sentence, negated in zip(
                                data["masked_sentences"], data["negated"]):
                            id_ += 1
                            yield id_, {
                                "uuid": str(data["uuid"]),
                                "sub": str(data.get("sub", "")),
                                "obj": str(data.get("obj", "")),
                                "pred": str(data["pred"]),
                                "obj_label": str(data["obj_label"]),
                                "masked_sentence": str(masked_sentence),
                                "negated": str(negated),
                            }
                    else:
                        for masked_sentence in data["masked_sentences"]:
                            id_ += 1
                            yield id_, {
                                "uuid": str(data["uuid"]),
                                "sub": str(data.get("sub", "")),
                                "obj": str(data.get("obj", "")),
                                "pred": str(data["pred"]),
                                "obj_label": str(data["obj_label"]),
                                "masked_sentence": str(masked_sentence),
                                "negated": str(""),
                            }
        elif self.config.name == "squad":
            id_ = -1
            with open(filepath, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    for masked_sentence in data["masked_sentences"]:
                        id_ += 1
                        yield id_, {
                            "id": str(data["id"]),
                            "sub_label": str(data["sub_label"]),
                            "obj_label": str(data["obj_label"]),
                            "negated": str(data.get("negated", "")),
                            "masked_sentence": str(masked_sentence),
                        }
        elif self.config.name == "google_re":
            id_ = -1
            paths = filepath
            for filepath in paths:
                # from https://github.com/facebookresearch/LAMA/blob/master/scripts/run_experiments.py
                if "place_of_birth" in filepath:
                    pred = {
                        "relation": "place_of_birth",
                        "template": "[X] was born in [Y] .",
                        "template_negated": "[X] was not born in [Y] .",
                    }
                elif "date_of_birth" in filepath:
                    pred = {
                        "relation": "date_of_birth",
                        "template": "[X] (born [Y]).",
                        "template_negated": "[X] (not born [Y]).",
                    }
                else:
                    pred = {
                        "relation": "place_of_death",
                        "template": "[X] died in [Y] .",
                        "template_negated": "[X] did not die in [Y] .",
                    }
                with open(filepath, encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        for masked_sentence in data["masked_sentences"]:
                            id_ += 1
                            yield id_, {
                                "pred": str(data["pred"]),
                                "sub": str(data["sub"]),
                                "obj": str(data["obj"]),
                                "evidences": str(data["evidences"]),
                                "judgments": str(data["judgments"]),
                                "sub_w": str(data["sub_w"]),
                                "sub_label": str(data["sub_label"]),
                                "sub_aliases": str(data["sub_aliases"]),
                                "obj_w": str(data["obj_w"]),
                                "obj_label": str(data["obj_label"]),
                                "obj_aliases": str(data["obj_aliases"]),
                                "uuid": str(data["uuid"]),
                                "masked_sentence": str(masked_sentence),
                                "template": str(pred["template"]),
                                "template_negated":
                                str(pred["template_negated"]),
                            }
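A sketch of the per-sentence flattening used in the conceptnet and squad branches above: every entry in masked_sentences becomes its own example. The JSON line is fabricated for illustration.

import json

line = json.dumps({
    "uuid": "1",
    "pred": "IsA",
    "obj_label": "bird",
    "masked_sentences": ["A robin is a [MASK].", "Robins are [MASK]s."],
})
data = json.loads(line)
for masked_sentence in data["masked_sentences"]:
    print({"uuid": data["uuid"], "obj_label": data["obj_label"],
           "masked_sentence": masked_sentence})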
Beispiel #29
0
class SemEval2018Task1(datasets.GeneratorBasedBuilder):

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="subtask5.english",
            version=VERSION,
            description=
            "This is the English dataset of subtask 5: E-c: Detecting Emotions.",
        ),
        datasets.BuilderConfig(
            name="subtask5.spanish",
            version=VERSION,
            description=
            "This is the Spanish dataset of subtask 5: E-c: Detecting Emotions.",
        ),
        datasets.BuilderConfig(
            name="subtask5.arabic",
            version=VERSION,
            description=
            "This is the Arabic dataset of subtask 5: E-c: Detecting Emotions.",
        ),
    ]

    def _info(self):
        features = datasets.Features({
            "ID": datasets.Value("string"),
            "Tweet": datasets.Value("string"),
            "anger": datasets.Value("bool"),
            "anticipation": datasets.Value("bool"),
            "disgust": datasets.Value("bool"),
            "fear": datasets.Value("bool"),
            "joy": datasets.Value("bool"),
            "love": datasets.Value("bool"),
            "optimism": datasets.Value("bool"),
            "pessimism": datasets.Value("bool"),
            "sadness": datasets.Value("bool"),
            "surprise": datasets.Value("bool"),
            "trust": datasets.Value("bool"),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs[self.config.name]
        if self.config.name == "subtask5.english":
            shortname = "En"
        if self.config.name == "subtask5.spanish":
            shortname = "Es"
        if self.config.name == "subtask5.arabic":
            shortname = "Ar"
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir[0],
                                 "2018-E-c-" + shortname + "-train.txt"),
                    "split":
                    "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir[2],
                                 "2018-E-c-" + shortname + "-test.txt"),
                    "split":
                    "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir[1],
                                 "2018-E-c-" + shortname + "-dev.txt"),
                    "split":
                    "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yields examples as (key, example) tuples."""

        with open(filepath, encoding="utf-8") as f:
            next(f)  # skip header
            for id_, row in enumerate(f):
                data = row.split("\t")
                yield id_, {
                    "ID": data[0],
                    "Tweet": data[1],
                    "anger": int(data[2]) if split != "test" else None,
                    "anticipation": int(data[3]) if split != "test" else None,
                    "disgust": int(data[4]) if split != "test" else None,
                    "fear": int(data[5]) if split != "test" else None,
                    "joy": int(data[6]) if split != "test" else None,
                    "love": int(data[7]) if split != "test" else None,
                    "optimism": int(data[8]) if split != "test" else None,
                    "pessimism": int(data[9]) if split != "test" else None,
                    "sadness": int(data[10]) if split != "test" else None,
                    "surprise": int(data[11]) if split != "test" else None,
                    "trust": int(data[12]) if split != "test" else None,
                }
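A self-contained sketch of the tab-separated parsing above, with a made-up row; the eleven 0/1 emotion columns become booleans via the datasets.Value("bool") features, and the test split yields None instead.

row = "2018-En-00001\tI love sunny days!\t0\t1\t0\t0\t1\t1\t1\t0\t0\t0\t0"
data = row.split("\t")
print({"ID": data[0], "Tweet": data[1],
       "anger": int(data[2]), "joy": int(data[6]), "trust": int(data[12])})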
Beispiel #30
0
class WinoBias(datasets.GeneratorBasedBuilder):
    """WinoBias: Winograd-schema dataset for detecting gender bias"""

    VERSION = datasets.Version("4.0.0")

    # This is an example of a dataset with multiple configurations.
    # If you don't want/need to define several sub-sets in your dataset,
    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.

    # If you need to make complex sub-parts in the datasets with configurable options
    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
    # BUILDER_CONFIG_CLASS = MyBuilderConfig

    # You will be able to load one or the other configurations in the following list with
    # data = datasets.load_dataset('my_dataset', 'first_domain')
    # data = datasets.load_dataset('my_dataset', 'second_domain')
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="wino_bias",
            version=VERSION,
            description=
            "WinoBias: Winograd-schema dataset for detecting gender bias",
        ),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            # Info about features for this: http://cemantix.org/data/ontonotes.html
            features=datasets.Features({
                "document_id":
                datasets.Value("string"),
                "part_number":
                datasets.Value("string"),
                "word_number":
                datasets.Sequence(datasets.Value("int32")),
                "tokens":
                datasets.Sequence(datasets.Value("string")),
                "pos_tags":
                datasets.Sequence(
                    datasets.features.ClassLabel(names=[
                        '"',
                        "''",
                        "#",
                        "$",
                        "(",
                        ")",
                        ",",
                        ".",
                        ":",
                        "``",
                        "CC",
                        "CD",
                        "DT",
                        "EX",
                        "FW",
                        "IN",
                        "JJ",
                        "JJR",
                        "JJS",
                        "LS",
                        "MD",
                        "NN",
                        "NNP",
                        "NNPS",
                        "NNS",
                        "NN|SYM",
                        "PDT",
                        "POS",
                        "PRP",
                        "PRP$",
                        "RB",
                        "RBR",
                        "RBS",
                        "RP",
                        "SYM",
                        "TO",
                        "UH",
                        "VB",
                        "VBD",
                        "VBG",
                        "VBN",
                        "VBP",
                        "VBZ",
                        "WDT",
                        "WP",
                        "WP$",
                        "WRB",
                        "HYPH",
                        "XX",
                        "NFP",
                        "AFX",
                        "ADD",
                        "-LRB-",
                        "-RRB-",
                    ])),
                "parse_bit":
                datasets.Sequence(datasets.Value("string")),
                "predicate_lemma":
                datasets.Sequence(datasets.Value("string")),
                "predicate_framenet_id":
                datasets.Sequence(datasets.Value("string")),
                "word_sense":
                datasets.Sequence(datasets.Value("string")),
                "speaker":
                datasets.Sequence(datasets.Value("string")),
                "ner_tags":
                datasets.Sequence(
                    datasets.features.ClassLabel(names=[
                        "B-PERSON",
                        "I-PERSON",
                        "B-NORP",
                        "I-NORP",
                        "B-FAC",
                        "I-FAC",
                        "B-ORG",
                        "I-ORG",
                        "B-GPE",
                        "I-GPE",
                        "B-LOC",
                        "I-LOC",
                        "B-PRODUCT",
                        "I-PRODUCT",
                        "B-EVENT",
                        "I-EVENT",
                        "B-WORK_OF_ART",
                        "I-WORK_OF_ART",
                        "B-LAW",
                        "I-LAW",
                        "B-LANGUAGE",
                        "I-LANGUAGE",
                        "B-DATE",
                        "I-DATE",
                        "B-TIME",
                        "I-TIME",
                        "B-PERCENT",
                        "I-PERCENT",
                        "B-MONEY",
                        "I-MONEY",
                        "B-QUANTITY",
                        "I-QUANTITY",
                        "B-ORDINAL",
                        "I-ORDINAL",
                        "B-CARDINAL",
                        "I-CARDINAL",
                        "*",
                        "0",
                    ])),
                "verbal_predicates":
                datasets.Sequence(datasets.Value("string")),
            }),
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": data_dir},
            )
        ]

    def _generate_examples(self, filepath):
        """ Yields examples. """
        with open(filepath, encoding="utf-8") as f:
            id_ = 0
            document_id = None
            part_number = 0
            word_num = []
            tokens = []
            pos_tags = []
            parse_bit = []
            predicate_lemma = []
            predicate_framenet_id = []
            word_sense = []
            speaker = []
            ner_tags = []
            ner_start = False
            verbal_predicates = []
            for line in f:
                if line.startswith("#begin") or line.startswith("#end"):
                    continue
                elif not line.strip():
                    id_ += 1
                    yield str(id_), {
                        "document_id": document_id,
                        "part_number": part_number,
                        "word_number": word_num,
                        "tokens": tokens,
                        "pos_tags": pos_tags,
                        "parse_bit": parse_bit,
                        "predicate_lemma": predicate_lemma,
                        "predicate_framenet_id": predicate_framenet_id,
                        "word_sense": word_sense,
                        "speaker": speaker,
                        "ner_tags": ner_tags,
                        "verbal_predicates": verbal_predicates,
                    }
                    word_num = []
                    tokens = []
                    pos_tags = []
                    parse_bit = []
                    predicate_lemma = []
                    predicate_framenet_id = []
                    word_sense = []
                    speaker = []
                    ner_tags = []
                    verbal_predicates = []
                else:
                    splits = [s for s in line.split(" ") if s]
                    if len(splits) > 7:
                        document_id = splits[0]
                        part_number = splits[1]
                        word_num.append(splits[2])
                        tokens.append(splits[3])
                        pos_tags.append(splits[4])
                        parse_bit.append(splits[5])
                        predicate_lemma.append(splits[6])
                        predicate_framenet_id.append(splits[7])
                        word_sense.append(splits[8])
                        speaker.append(splits[9])
                        ner_word = splits[10]
                        if ")" in ner_word and ner_start:
                            ner_start = False
                            ner_word = "0"
                        if "(" in ner_word:
                            ner_start = True
                            ner_word = ner_word.strip(" ").replace(
                                "(", "B-").replace("*", "").replace(")", "")
                            start_word = ner_word.strip(" ").replace("B-", "")
                        if ner_start:
                            if ner_word.strip(" ") == "*":
                                ner_word = "I-" + start_word
                        ner_tags.append(ner_word)
                        word_is_verbal_predicate = any(
                            ["(V" in x for x in splits[11:-1]])
                        if word_is_verbal_predicate:
                            verbal_predicates.append(splits[3])
            if tokens:
                # add the last one
                id_ += 1
                yield str(id_), {
                    "document_id": document_id,
                    "part_number": part_number,
                    "word_number": word_num,
                    "tokens": tokens,
                    "pos_tags": pos_tags,
                    "parse_bit": parse_bit,
                    "predicate_lemma": predicate_lemma,
                    "predicate_framenet_id": predicate_framenet_id,
                    "word_sense": word_sense,
                    "speaker": speaker,
                    "ner_tags": ner_tags,
                    "verbal_predicates": verbal_predicates,
                }
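A sketch of the CoNLL-style column handling in _generate_examples above, on a single fabricated line (not real OntoNotes/WinoBias data).

line = "doc1 0 3 interrupted VBD (VP* interrupt 01 1 - * (V*) -\n"
splits = [s for s in line.split(" ") if s]
print(splits[3], splits[4])                    # token and POS tag
print(any("(V" in x for x in splits[11:-1]))   # verbal predicate marker -> True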