コード例 #1
0
ファイル: xtreme.py プロジェクト: Nasrin-Akter-88/datasets-1
    def _info(self):
        """Build the DatasetInfo for the selected XTREME configuration.

        Starts from one string column per configured text feature, then
        specializes the schema for QA, PAWS-X, XNLI, and the token-level
        tasks (udpos, PAN-X), which replace the schema wholesale.
        """
        # Base schema: every configured text field becomes a string column.
        features = {}
        for feature_name in six.iterkeys(self.config.text_features):
            features[feature_name] = datasets.Value("string")

        # QA-style configs carry SQuAD-like answer annotations.
        if "answers" in features:
            features["answers"] = datasets.features.Sequence(
                {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")}
            )
        if self.config.name.startswith("PAWS-X"):
            features["label"] = datasets.Value("string")
        if self.config.name == "XNLI":
            features["gold_label"] = datasets.Value("string")

        # Token-level POS tagging: replace the schema entirely.
        if self.config.name.startswith("udpos"):
            upos_tags = [
                "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN",
                "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
                "VERB", "X",
            ]
            features = datasets.Features(
                {
                    "token": datasets.Value("string"),
                    "pos_tag": datasets.features.ClassLabel(names=upos_tags),
                }
            )

        # Token-level NER (IOB2 tags): replace the schema entirely.
        if self.config.name.startswith("PAN-X"):
            ner_labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
            features = datasets.Features(
                {
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(names=ner_labels)
                    ),
                    "langs": datasets.Sequence(datasets.Value("string")),
                }
            )

        return datasets.DatasetInfo(
            description=self.config.description + "\n" + _DESCRIPTION,
            # Re-wrapping an existing Features in Features is a no-op, so this
            # is safe for both the dict and the Features branches above.
            features=datasets.Features(features),
            # No default (input, target) pair for as_supervised=True.
            supervised_keys=None,
            # NOTE(review): the tab-joined homepage string is preserved from the
            # original; confirm the "\t" separator is intentional.
            homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
            citation=self.config.citation + "\n" + _CITATION,
        )
コード例 #2
0
ファイル: ami.py プロジェクト: edugp/datasets
    def _info(self):
        """Build the DatasetInfo for the selected AMI configuration.

        All configurations share the word/segment annotation columns; the
        audio columns and the description depend on the configuration.

        Raises:
            ValueError: if ``self.config.name`` is not a known configuration.
        """
        features_dict = {
            "word_ids": datasets.Sequence(datasets.Value("string")),
            "word_start_times": datasets.Sequence(datasets.Value("float")),
            "word_end_times": datasets.Sequence(datasets.Value("float")),
            "word_speakers": datasets.Sequence(datasets.Value("string")),
            "segment_ids": datasets.Sequence(datasets.Value("string")),
            "segment_start_times": datasets.Sequence(datasets.Value("float")),
            "segment_end_times": datasets.Sequence(datasets.Value("float")),
            "segment_speakers": datasets.Sequence(datasets.Value("string")),
            "words": datasets.Sequence(datasets.Value("string")),
            "channels": datasets.Sequence(datasets.Value("string")),
        }

        if self.config.name == "headset-single":
            features_dict.update({"file": datasets.Value("string")})
            features_dict.update(
                {"audio": datasets.features.Audio(sampling_rate=16_000)})
            config_description = (
                "Close talking audio of single headset. "
                "This configuration only includes audio belonging to the "
                "headset of the person currently speaking.")
        elif self.config.name == "microphone-single":
            features_dict.update({"file": datasets.Value("string")})
            features_dict.update(
                {"audio": datasets.features.Audio(sampling_rate=16_000)})
            config_description = (
                "Far field audio of single microphone. "
                "This configuration only includes audio belonging the first microphone, "
                "*i.e.* 1-1, of the microphone array.")
        elif self.config.name == "headset-multi":
            features_dict.update(
                {f"file-{i}": datasets.Value("string")
                 for i in range(4)})
            # BUG FIX: the audio features previously reused the "file-{i}" keys
            # and silently overwrote the string path columns added just above,
            # making that update dead code. Use separate "audio-{i}" keys,
            # mirroring the "file"/"audio" pair of the single configs.
            features_dict.update({
                f"audio-{i}": datasets.features.Audio(sampling_rate=16_000)
                for i in range(4)
            })
            config_description = (
                "Close talking audio of four individual headset. "
                "This configuration includes audio belonging to four individual headsets."
                " For each annotation there are 4 audio files 0, 1, 2, 3.")
        elif self.config.name == "microphone-multi":
            features_dict.update(
                {f"file-1-{i}": datasets.Value("string")
                 for i in range(1, 8)})
            # BUG FIX: same key clash as headset-multi — keep paths and audio
            # under distinct keys so neither clobbers the other.
            features_dict.update({
                f"audio-1-{i}": datasets.features.Audio(sampling_rate=16_000)
                for i in range(1, 8)
            })
            config_description = (
                "Far field audio of microphone array. "
                "This configuration includes audio of "
                "the first microphone array 1-1, 1-2, ..., 1-8.")
        else:
            raise ValueError(
                f"Configuration {self.config.name} does not exist.")

        return datasets.DatasetInfo(
            description=_DESCRIPTION + config_description,
            features=datasets.Features(features_dict),
            homepage=_URL,
            citation=_CITATION,
        )
コード例 #3
0
 def _info(self):
     """Return a minimal DatasetInfo: a single string column named "content"."""
     content_features = datasets.Features({"content": datasets.Value("string")})
     return datasets.DatasetInfo(
         features=content_features,
         # No default (input, target) pair for as_supervised=True.
         supervised_keys=None,
     )
コード例 #4
0
class Truthfulqa(datasets.GeneratorBasedBuilder):
    """TruthfulQA is a benchmark to measure whether a language model is truthful in
    generating answers to questions."""

    BUILDER_CONFIGS = [
        TruthfulqaConfig(
            name="multiple_choice",
            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
            features=datasets.Features(
                {
                    "question": datasets.Value("string"),
                    "mc1_targets": {
                        "choices": datasets.features.Sequence(datasets.Value("string")),
                        "labels": datasets.features.Sequence(datasets.Value("int32")),
                    },
                    "mc2_targets": {
                        "choices": datasets.features.Sequence(datasets.Value("string")),
                        "labels": datasets.features.Sequence(datasets.Value("int32")),
                    },
                }
            ),
            description="The multiple choice TruthfulQA task",
        ),
        TruthfulqaConfig(
            name="generation",
            url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
            features=datasets.Features(
                {
                    "category": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "best_answer": datasets.Value("string"),
                    "correct_answers": datasets.features.Sequence(
                        datasets.Value("string")
                    ),
                    "incorrect_answers": datasets.features.Sequence(
                        datasets.Value("string")
                    ),
                    "source": datasets.Value("string"),
                }
            ),
            description="The generative TruthfulQA task",
        ),
    ]

    def _info(self):
        """Return the DatasetInfo for the selected config (features come from the config)."""
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=self.config.features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the config's data file; everything lands in a single validation split."""
        urls = self.config.url
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "validation",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield (key, example) pairs from the JSON (multiple_choice) or CSV (generation) file."""
        if self.config.name == "multiple_choice":
            # Multiple choice data is in a `JSON` file.
            with open(filepath, encoding="utf-8") as f:
                contents = json.load(f)
                for key, row in enumerate(contents):
                    # Materialize the dict views as lists: the Arrow writer
                    # expects sequences, not dict_keys/dict_values views.
                    yield key, {
                        "question": row["question"],
                        "mc1_targets": {
                            "choices": list(row["mc1_targets"].keys()),
                            "labels": list(row["mc1_targets"].values()),
                        },
                        "mc2_targets": {
                            "choices": list(row["mc2_targets"].keys()),
                            "labels": list(row["mc2_targets"].values()),
                        },
                    }
        else:
            # Generation data is in a `CSV` file. Pin the encoding: without it,
            # decoding depends on the platform's locale default.
            with open(filepath, newline="", encoding="utf-8") as f:
                contents = csv.DictReader(f)
                for key, row in enumerate(contents):
                    # Ensure that references exist.
                    if not row["Correct Answers"] or not row["Incorrect Answers"]:
                        continue
                    yield key, {
                        "category": row["Category"],
                        "question": row["Question"],
                        "best_answer": row["Best Answer"],
                        # split on ";"
                        "correct_answers": row["Correct Answers"].strip().split(";"),
                        "incorrect_answers": row["Incorrect Answers"]
                        .strip()
                        .split(";"),
                        "source": row["Source"],
                    }
コード例 #5
0
ファイル: test_array_xd.py プロジェクト: yngtodd/datasets
import pandas as pd
import pytest
from absl.testing import parameterized

import datasets
from datasets.arrow_writer import ArrowWriter
from datasets.features import Array2D, Array3D, Array4D, Array5D, Value, _ArrayXD

# Fixed 2-D array shapes exercised by the correctness tests below.
SHAPE_TEST_1 = (30, 487)
SHAPE_TEST_2 = (36, 1024)
# Shape and example count used by the write-speed benchmarks.
SPEED_TEST_SHAPE = (100, 100)
SPEED_TEST_N_EXAMPLES = 100

# Default schema: two fixed-shape float32 Array2D columns.
DEFAULT_FEATURES = datasets.Features({
    "text":
    Array2D(SHAPE_TEST_1, dtype="float32"),
    "image":
    Array2D(SHAPE_TEST_2, dtype="float32")
})


def generate_examples(features: dict, num_examples=100, seq_shapes=None):
    dummy_data = []
    seq_shapes = seq_shapes or {}
    for i in range(num_examples):
        example = {}
        for col_id, (k, v) in enumerate(features.items()):
            if isinstance(v, _ArrayXD):
                data = np.random.rand(*v.shape).astype(v.dtype)
            elif isinstance(v, datasets.Value):
                data = "foo"
            elif isinstance(v, datasets.Sequence):
コード例 #6
0
import logging
from dataclasses import dataclass

import pyarrow as pa

import datasets

logger = logging.getLogger(__name__)

FEATURES = datasets.Features({
    "text": datasets.Value("string"),
})


@dataclass
class TextConfig(datasets.BuilderConfig):
    """BuilderConfig for text files."""

    # Encoding used when decoding the raw text files.
    encoding: str = "utf-8"
    # Bytes read per chunk while parsing (10 MB), to bound memory use.
    chunksize: int = 10 << 20  # 10MB


class Text(datasets.ArrowBasedBuilder):
    BUILDER_CONFIG_CLASS = TextConfig

    def _info(self):
        # Schema is fixed: a single "text" string column (module-level FEATURES).
        return datasets.DatasetInfo(features=FEATURES)

    def _split_generators(self, dl_manager):
        """The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].
コード例 #7
0
    def _info(self):
        """Build the DatasetInfo for the selected IndicGLUE configuration.

        Starts from one string column per configured text feature, then adds
        task-specific label/option columns based on the config name prefix.
        """
        name = self.config.name
        features = {}
        for feature_name in six.iterkeys(self.config.text_features):
            features[feature_name] = datasets.Value("string")

        if name.startswith("copa"):
            features["label"] = datasets.Value("int32")

        if name.startswith("sna"):
            features["label"] = datasets.features.ClassLabel(
                names=["kolkata", "state", "national", "sports",
                       "entertainment", "international"]
            )

        if name.startswith("inltkh"):
            features["label"] = datasets.features.ClassLabel(
                names=["entertainment", "business", "tech", "sports", "state",
                       "spirituality", "tamil-cinema", "positive", "negative",
                       "neutral"]
            )

        if name.startswith("iitp"):
            features["label"] = datasets.features.ClassLabel(
                names=["negative", "neutral", "positive"]
            )

        if name.startswith("wnli"):
            features["label"] = datasets.features.ClassLabel(
                names=["not_entailment", "entailment", "None"]
            )

        if name.startswith("actsa"):
            features["label"] = datasets.features.ClassLabel(
                names=["positive", "negative"]
            )

        if name.startswith("csqa"):
            features["options"] = datasets.features.Sequence(
                datasets.Value("string")
            )
            features["out_of_context_options"] = datasets.features.Sequence(
                datasets.Value("string")
            )

        if name.startswith("md"):
            features["story_number"] = datasets.Value("int32")
            features["id"] = datasets.Value("int32")

        if name.startswith("wiki-ner"):
            features["tokens"] = datasets.features.Sequence(
                datasets.Value("string")
            )
            features["ner_tags"] = datasets.features.Sequence(
                datasets.features.ClassLabel(
                    names=["B-LOC", "B-ORG", "B-PER", "I-LOC", "I-ORG",
                           "I-PER", "O"]
                )
            )
            # Nested sequence: per-token list of extra annotation strings.
            features["additional_info"] = datasets.features.Sequence(
                datasets.features.Sequence(datasets.Value("string"))
            )

        return datasets.DatasetInfo(
            description=_INDIC_GLUE_DESCRIPTION + "\n" + self.config.description,
            features=datasets.Features(features),
            homepage=self.config.url,
            citation=_INDIC_GLUE_CITATION + "\n" + self.config.citation,
        )
コード例 #8
0
def test_array_xd_with_np(data, feature, expected):
    """Round-trip `data` through a one-row Dataset and check what comes back."""
    if feature:
        schema = datasets.Features({"col": feature})
    else:
        schema = None
    dataset = datasets.Dataset.from_dict({"col": [data]}, features=schema)
    assert dataset[0]["col"] == expected
コード例 #9
0
ファイル: md_gender_bias.py プロジェクト: Priyansh2/nlp
 def _info(self):
     """Build the DatasetInfo for the selected md_gender_bias configuration.

     Each named configuration has its own feature schema; any other name
     falls through to the inferred-label schema.
     """
     name = self.config.name
     if name == "gendered_words":
         features = datasets.Features(
             {
                 "word_masculine": datasets.Value("string"),
                 "word_feminine": datasets.Value("string"),
             }
         )
     elif name == "name_genders":
         features = datasets.Features(
             {
                 "name": datasets.Value("string"),
                 "assigned_gender": datasets.ClassLabel(names=["M", "F"]),
                 "count": datasets.Value("int32"),
             }
         )
     elif name == "new_data":
         features = datasets.Features(
             {
                 "text": datasets.Value("string"),
                 "original": datasets.Value("string"),
                 # A list of multi-label gender annotations per example.
                 "labels": [
                     datasets.ClassLabel(
                         names=[
                             "ABOUT:female",
                             "ABOUT:male",
                             "PARTNER:female",
                             "PARTNER:male",
                             "SELF:female",
                             "SELF:male",
                         ]
                     )
                 ],
                 "class_type": datasets.ClassLabel(names=["about", "partner", "self"]),
                 "turker_gender": datasets.ClassLabel(
                     names=["man", "woman", "nonbinary", "prefer not to say", "no answer"]
                 ),
                 "episode_done": datasets.Value("bool_"),
                 "confidence": datasets.Value("string"),
             }
         )
     elif name == "funpedia":
         features = datasets.Features(
             {
                 "text": datasets.Value("string"),
                 "title": datasets.Value("string"),
                 "persona": datasets.Value("string"),
                 "gender": datasets.ClassLabel(names=["gender-neutral", "female", "male"]),
             }
         )
     elif name == "image_chat":
         features = datasets.Features(
             {
                 "caption": datasets.Value("string"),
                 "id": datasets.Value("string"),
                 "male": datasets.Value("bool_"),
                 "female": datasets.Value("bool_"),
             }
         )
     elif name == "wizard":
         features = datasets.Features(
             {
                 "text": datasets.Value("string"),
                 "chosen_topic": datasets.Value("string"),
                 "gender": datasets.ClassLabel(names=["gender-neutral", "female", "male"]),
             }
         )
     elif name == "yelp_inferred":
         features = datasets.Features(
             {
                 "text": datasets.Value("string"),
                 "binary_label": datasets.ClassLabel(names=["ABOUT:female", "ABOUT:male"]),
                 "binary_score": datasets.Value("float"),
             }
         )
     else:
         # Remaining configs carry inferred binary and ternary labels.
         features = datasets.Features(
             {
                 "text": datasets.Value("string"),
                 "binary_label": datasets.ClassLabel(names=["ABOUT:female", "ABOUT:male"]),
                 "binary_score": datasets.Value("float"),
                 "ternary_label": datasets.ClassLabel(
                     names=["ABOUT:female", "ABOUT:male", "ABOUT:gender-neutral"]
                 ),
                 "ternary_score": datasets.Value("float"),
             }
         )
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # schema differs per configuration (see above)
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #10
0
 def _info(self):
     """Build the DatasetInfo for the selected FLUE configuration.

     CLS/XNLI are classification tasks with a ClassLabel target; WSD-V is a
     token-level disambiguation task; everything else is classification with
     an int32 label column.
     """
     if self.config.name == "CLS" or self.config.name == "XNLI":
         features = {}
         for feature_name in six.iterkeys(self.config.text_features):
             features[feature_name] = datasets.Value("string")
         features[self.config.label_column] = datasets.features.ClassLabel(
             names=self.config.label_classes
         )
         features["idx"] = datasets.Value("int32")
     elif self.config.name == "WSD-V":
         fine_pos_names = [
             "DET", "P+D", "CC", "VS", "P", "CS", "NC", "NPP", "ADJWH",
             "VINF", "VPP", "ADVWH", "PRO", "V", "CLO", "PREF", "VPR",
             "PROREL", "ADV", "PROWH", "N", "DETWH", "ADJ", "P+PRO", "ET",
             "VIMP", "CLS", "PONCT", "I", "CLR",
         ]
         coarse_pos_names = [
             "V", "PREF", "P+D", "I", "A", "P+PRO", "PRO", "P", "anonyme",
             "D", "C", "CL", "ET", "PONCT", "ADV", "N",
         ]
         # Token-level task: every text feature is a sequence of strings.
         features = {}
         for feature_name in six.iterkeys(self.config.text_features):
             features[feature_name] = datasets.Sequence(datasets.Value("string"))
         features["fine_pos_tags"] = datasets.Sequence(
             datasets.features.ClassLabel(names=fine_pos_names)
         )
         features["pos_tags"] = datasets.Sequence(
             datasets.features.ClassLabel(names=coarse_pos_names)
         )
         features["disambiguate_tokens_ids"] = datasets.Sequence(
             datasets.Value("int32")
         )
         features["disambiguate_labels"] = datasets.Sequence(
             datasets.Value("string")
         )
         features["idx"] = datasets.Value("string")
     else:
         features = {}
         for feature_name in six.iterkeys(self.config.text_features):
             features[feature_name] = datasets.Value("string")
         features[self.config.label_column] = datasets.Value("int32")
         features["idx"] = datasets.Value("int32")
     return datasets.DatasetInfo(
         description=_FLUE_DESCRIPTION,
         features=datasets.Features(features),
         homepage=self.config.url,
         citation=self.config.citation + "\n" + _FLUE_CITATION,
     )
コード例 #11
0
 def _info(self):
     """Build the DatasetInfo; the NER tag set depends on the config suffix.

     "-7" and "-8" configs use reduced entity inventories; any other config
     gets the full inventory including the GPE_LOC/GPE_ORG split.
     """
     if self.config.name.endswith("-7"):
         entity_types = ["PER", "ORG", "PROD", "LOC", "DRV", "EVT", "MISC"]
     elif self.config.name.endswith("-8"):
         entity_types = ["PER", "ORG", "PROD", "LOC", "GPE", "DRV", "EVT", "MISC"]
     else:
         entity_types = [
             "PER", "ORG", "GPE_LOC", "PROD", "LOC", "GPE_ORG", "DRV", "EVT",
             "MISC",
         ]
     # Expand entity types into IOB2 tags, preserving the original label order.
     ner_names = ["O"]
     for entity in entity_types:
         ner_names.append(f"B-{entity}")
         ner_names.append(f"I-{entity}")
     ner_tags = datasets.Sequence(datasets.features.ClassLabel(names=ner_names))

     pos_names = [
         "NOUN", "PUNCT", "ADP", "NUM", "SYM", "SCONJ", "ADJ", "PART", "DET",
         "CCONJ", "PROPN", "PRON", "X", "ADV", "INTJ", "VERB", "AUX",
     ]
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "idx": datasets.Value("string"),
                 "lang": datasets.Value("string"),
                 "text": datasets.Value("string"),
                 "tokens": datasets.Sequence(datasets.Value("string")),
                 "lemmas": datasets.Sequence(datasets.Value("string")),
                 "pos_tags": datasets.Sequence(
                     datasets.features.ClassLabel(names=pos_names)
                 ),
                 "ner_tags": ner_tags,
             }
         ),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )
コード例 #12
0
 def _info(self):
     """Build the DatasetInfo for CoNLL-2000 chunking: tokens + POS + chunk tags."""
     # Penn Treebank POS tag inventory (order defines the ClassLabel ids).
     pos_names = [
         "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT",
         "EX", "FW", "IN", "JJ", "JJR", "JJS", "MD", "NN", "NNP", "NNPS",
         "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM",
         "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP",
         "WP$", "WRB",
     ]
     # Chunk phrase types, expanded into IOB2 tags below.
     chunk_types = [
         "ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT", "SBAR",
         "UCP", "VP",
     ]
     chunk_names = ["O"]
     for chunk_type in chunk_types:
         chunk_names.append(f"B-{chunk_type}")
         chunk_names.append(f"I-{chunk_type}")
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "id": datasets.Value("string"),
                 "tokens": datasets.Sequence(datasets.Value("string")),
                 "pos_tags": datasets.Sequence(
                     datasets.features.ClassLabel(names=pos_names)
                 ),
                 "chunk_tags": datasets.Sequence(
                     datasets.features.ClassLabel(names=chunk_names)
                 ),
             }
         ),
         supervised_keys=None,
         homepage="https://www.clips.uantwerpen.be/conll2000/chunking/",
         citation=_CITATION,
     )
コード例 #13
0
ファイル: newsqa.py プロジェクト: ruch798/datasets
    def _info(self):
        """Build the DatasetInfo for the selected NewsQA configuration.

        "combined-csv" and the fallback config are flat string schemas that
        differ only in the answer-range column; "combined-json" mirrors the
        nested structure of the raw JSON release.
        """
        if self.config.name == "combined-csv":
            features = datasets.Features(
                {
                    "story_id": datasets.Value("string"),
                    "story_text": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answer_char_ranges": datasets.Value("string"),
                }
            )
        elif self.config.name == "combined-json":
            # Consensus span for a question (start/end offsets plus flags).
            consensus = datasets.Features(
                {
                    "s": datasets.Value("int32"),
                    "e": datasets.Value("int32"),
                    "badQuestion": datasets.Value("bool"),
                    "noAnswer": datasets.Value("bool"),
                }
            )
            # Per-sourcer answer spans.
            answers = datasets.features.Sequence(
                {
                    "sourcerAnswers": datasets.features.Sequence(
                        {
                            "s": datasets.Value("int32"),
                            "e": datasets.Value("int32"),
                            "noAnswer": datasets.Value("bool"),
                        }
                    ),
                }
            )
            # Validated spans additionally carry an agreement count.
            validated_answers = datasets.features.Sequence(
                {
                    "sourcerAnswers": datasets.features.Sequence(
                        {
                            "s": datasets.Value("int32"),
                            "e": datasets.Value("int32"),
                            "noAnswer": datasets.Value("bool"),
                            "count": datasets.Value("int32"),
                        }
                    ),
                }
            )
            features = datasets.Features(
                {
                    "storyId": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "type": datasets.Value("string"),
                    "questions": datasets.features.Sequence(
                        {
                            "q": datasets.Value("string"),
                            "isAnswerAbsent": datasets.Value("int32"),
                            "isQuestionBad": datasets.Value("int32"),
                            "consensus": consensus,
                            "answers": answers,
                            "validated_answers": validated_answers,
                        }
                    ),
                }
            )
        else:
            features = datasets.Features(
                {
                    "story_id": datasets.Value("string"),
                    "story_text": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answer_token_ranges": datasets.Value("string"),
                }
            )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,  # schema differs per configuration (see above)
            # No default (input, target) pair for as_supervised=True.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
コード例 #14
0
ファイル: dane.py プロジェクト: ruch798/datasets
    def _info(self):
        """Describe the DaNE columns: sentence id/text plus per-token
        annotations (POS, morphology, dependency parse, NER)."""
        # Label order fixes the integer class ids — do not reorder.
        pos_tag_names = [
            "NUM", "CCONJ", "PRON", "VERB", "INTJ", "AUX", "ADJ", "PROPN",
            "PART", "ADV", "PUNCT", "ADP", "NOUN", "X", "DET", "SYM", "SCONJ",
        ]
        dep_label_names = [
            "parataxis", "mark", "nummod", "discourse", "compound:prt",
            "reparandum", "vocative", "list", "obj", "dep", "det", "obl:loc",
            "flat", "iobj", "cop", "expl", "obl", "conj", "nmod", "root",
            "acl:relcl", "goeswith", "appos", "fixed", "obl:tmod", "xcomp",
            "advmod", "nmod:poss", "aux", "ccomp", "amod", "cc", "advcl",
            "nsubj", "punct", "case",
        ]
        ner_tag_names = [
            "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
            "B-MISC", "I-MISC",
        ]
        features = datasets.Features(
            {
                "sent_id": datasets.Value("string"),
                "text": datasets.Value("string"),
                "tok_ids": datasets.Sequence(datasets.Value("int64")),
                "tokens": datasets.Sequence(datasets.Value("string")),
                "lemmas": datasets.Sequence(datasets.Value("string")),
                "pos_tags": datasets.Sequence(datasets.ClassLabel(names=pos_tag_names)),
                "morph_tags": datasets.Sequence(datasets.Value("string")),
                "dep_ids": datasets.Sequence(datasets.Value("int64")),
                "dep_labels": datasets.Sequence(datasets.ClassLabel(names=dep_label_names)),
                "ner_tags": datasets.Sequence(datasets.ClassLabel(names=ner_tag_names)),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No canonical (input, target) pair for this multi-column schema.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
コード例 #15
0
ファイル: kilt_tasks.py プロジェクト: yngtodd/datasets
 def _info(self):
     """Declare the shared KILT task schema: an input string plus
     entity-linking ``meta`` and a list of ``output`` answers, each with
     Wikipedia provenance records."""
     # List-of-dict literals are kept (not Sequence(...)) to preserve the
     # original feature encoding exactly.
     evidence_schema = [{
         "start_paragraph_id": datasets.Value("int32"),
         "end_paragraph_id": datasets.Value("int32"),
         "title": datasets.Value("string"),
         "section": datasets.Value("string"),
         "wikipedia_id": datasets.Value("string"),
         "meta": {"evidence_span": [datasets.Value("string")]},
     }]
     provenance_schema = [{
         "bleu_score": datasets.Value("float32"),
         "start_character": datasets.Value("int32"),
         "start_paragraph_id": datasets.Value("int32"),
         "end_character": datasets.Value("int32"),
         "end_paragraph_id": datasets.Value("int32"),
         "meta": {
             "fever_page_id": datasets.Value("string"),
             "fever_sentence_id": datasets.Value("int32"),
             # kept as string: int ids run into overflow issues
             "annotation_id": datasets.Value("string"),
             "yes_no_answer": datasets.Value("string"),
             "evidence_span": [datasets.Value("string")],
         },
         "section": datasets.Value("string"),
         "title": datasets.Value("string"),
         "wikipedia_id": datasets.Value("string"),
     }]
     meta_schema = {
         "left_context": datasets.Value("string"),
         "mention": datasets.Value("string"),
         "right_context": datasets.Value("string"),
         "partial_evidence": evidence_schema,
         "obj_surface": [datasets.Value("string")],
         "sub_surface": [datasets.Value("string")],
         "subj_aliases": [datasets.Value("string")],
         "template_questions": [datasets.Value("string")],
     }
     output_schema = [{
         "answer": datasets.Value("string"),
         "meta": {"score": datasets.Value("int32")},
         "provenance": provenance_schema,
     }]
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id": datasets.Value("string"),
             "input": datasets.Value("string"),
             "meta": meta_schema,
             "output": output_schema,
         }),
         supervised_keys=None,
         homepage="https://github.com/facebookresearch/KILT",
         citation=_CITATION,
     )
コード例 #16
0
from dataclasses import dataclass
import datasets
import pyarrow as pa

# Minimal corpus schema: a single plain-text column per example.
FEATURES = datasets.Features({'text': datasets.Value('string')})


@dataclass
class ZHWikiConfig(datasets.BuilderConfig):
    """BuilderConfig for the Chinese-Wikipedia dump reader.

    All knobs are plain dataclass fields so they can be passed through
    ``load_dataset(..., <field>=value)``.
    """

    # Path to the raw dump; required (validated in ``_split_generators``).
    data_path: str = None
    # Presumably a minimum sentence-length filter — TODO confirm against
    # the generation code (not visible here).
    min_sent_length: int = 10
    # Bytes read per chunk when streaming the dump (10 << 20 == 10 MiB).
    chunksize: int = 10 << 20
    # BUG FIX: the annotation was missing, which made ``encoding`` a plain
    # class attribute under @dataclass instead of a configurable field.
    # Adding it is backward-compatible (same default value).
    encoding: str = 'utf-8'


class zh_wiki(datasets.ArrowBasedBuilder):

    BUILDER_CONFIG_CLASS = ZHWikiConfig
    VERSION = "1.0.0"

    def _info(self):
        """Return the dataset metadata; only the fixed module-level text
        schema (``FEATURES``) is declared — no description or citation."""
        return datasets.DatasetInfo(features=FEATURES)

    def _split_generators(self, dl_manager):

        if not self.config.data_path:
            raise ValueError(
                f"Data path must be specified, but got data_path={self.config.data_path}"
            )
コード例 #17
0
 def _info(self):
     """Token/NER schema where the tag set is ``O`` plus a B-/I- pair per
     entity type. The entity-type order below fixes the integer label ids
     and must not change."""
     entity_types = [
         "academic", "academic_person", "aircraft", "album_person",
         "anatomy", "animal", "architect_person", "capital", "chemical",
         "clothes", "country", "culture", "currency", "date", "food",
         "genre", "government", "government_person", "language", "location",
         "material", "measure", "medical", "military", "military_person",
         "nation", "newspaper", "organization", "organization_person",
         "person", "production_art_music", "production_art_music_person",
         "quantity", "religion", "science", "shape", "ship", "software",
         "space", "space_person", "sport", "sport_name", "sport_person",
         "structure", "subject", "tech", "train", "vehicle",
     ]
     tag_names = ["O"]
     for entity in entity_types:
         tag_names.append("B-" + entity)
         tag_names.append("I-" + entity)
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id": datasets.Value("string"),
             "tokens": datasets.Sequence(datasets.Value("string")),
             "ner_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=tag_names)),
         }),
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_HOMEPAGE,
         # License for the dataset if available
         license=_LICENSE,
         # Citation for the dataset
         citation=_CITATION,
     )
コード例 #18
0
 def _info(self):
     """Two schemas: the ``parsed_pdfs`` config exposes the parsed paper
     structure; every other config exposes submission + review metadata."""
     if self.config.name == "parsed_pdfs":
         sections_schema = datasets.features.Sequence({
             "heading": datasets.Value("string"),
             "text": datasets.Value("string"),
         })
         references_schema = datasets.features.Sequence({
             "title": datasets.Value("string"),
             "author": datasets.features.Sequence(datasets.Value("string")),
             "venue": datasets.Value("string"),
             "citeRegEx": datasets.Value("string"),
             "shortCiteRegEx": datasets.Value("string"),
             "year": datasets.Value("int32"),
         })
         mentions_schema = datasets.features.Sequence({
             "referenceID": datasets.Value("int32"),
             "context": datasets.Value("string"),
             "startOffset": datasets.Value("int32"),
             "endOffset": datasets.Value("int32"),
         })
         features = datasets.Features({
             "name": datasets.Value("string"),
             "metadata": {
                 "source": datasets.Value("string"),
                 "title": datasets.Value("string"),
                 "authors": datasets.features.Sequence(datasets.Value("string")),
                 "emails": datasets.features.Sequence(datasets.Value("string")),
                 "sections": sections_schema,
                 "references": references_schema,
                 "referenceMentions": mentions_schema,
                 "year": datasets.Value("int32"),
                 "abstractText": datasets.Value("string"),
                 "creator": datasets.Value("string"),
             },
         })
     else:
         review_schema = datasets.features.Sequence({
             "date": datasets.Value("string"),
             "title": datasets.Value("string"),
             "other_keys": datasets.Value("string"),
             "originality": datasets.Value("string"),
             "comments": datasets.Value("string"),
             "is_meta_review": datasets.Value("bool"),
             "is_annotated": datasets.Value("bool"),
             "recommendation": datasets.Value("string"),
             "replicability": datasets.Value("string"),
             "presentation_format": datasets.Value("string"),
             "clarity": datasets.Value("string"),
             "meaningful_comparison": datasets.Value("string"),
             "substance": datasets.Value("string"),
             "reviewer_confidence": datasets.Value("string"),
             "soundness_correctness": datasets.Value("string"),
             "appropriateness": datasets.Value("string"),
             "impact": datasets.Value("string"),
         })
         features = datasets.Features({
             "id": datasets.Value("string"),
             "conference": datasets.Value("string"),
             "comments": datasets.Value("string"),
             "subjects": datasets.Value("string"),
             "version": datasets.Value("string"),
             "date_of_submission": datasets.Value("string"),
             "title": datasets.Value("string"),
             "authors": datasets.features.Sequence(datasets.Value("string")),
             "accepted": datasets.Value("bool"),
             "abstract": datasets.Value("string"),
             "histories": datasets.features.Sequence(
                 datasets.features.Sequence(datasets.Value("string"))),
             "reviews": review_schema,
         })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #19
0
 def _info(self):
     """GrailQA schema: question text, logical forms (function, SPARQL,
     s-expression) and the graph query — typed nodes plus relation edges —
     behind each example."""
     answer_schema = datasets.features.Sequence({
         "answer_type": datasets.Value("string"),
         "answer_argument": datasets.Value("string"),
         "entity_name": datasets.Value("string"),
     })
     node_schema = datasets.features.Sequence({
         "nid": datasets.Value("int32"),
         "node_type": datasets.Value("string"),
         "id": datasets.Value("string"),
         "class": datasets.Value("string"),
         "friendly_name": datasets.Value("string"),
         "question_node": datasets.Value("int32"),
         "function": datasets.Value("string"),
     })
     edge_schema = datasets.features.Sequence({
         "start": datasets.Value("int32"),
         "end": datasets.Value("int32"),
         "relation": datasets.Value("string"),
         "friendly_name": datasets.Value("string"),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "qid": datasets.Value("string"),
             "question": datasets.Value("string"),
             "answer": answer_schema,
             "function": datasets.Value("string"),
             "num_node": datasets.Value("int32"),
             "num_edge": datasets.Value("int32"),
             "graph_query": {"nodes": node_schema, "edges": edge_schema},
             "sparql_query": datasets.Value("string"),
             "domains": datasets.features.Sequence(datasets.Value("string")),
             "level": datasets.Value("string"),
             "s_expression": datasets.Value("string"),
         }),
         # No default supervised_keys (both question and context are inputs).
         supervised_keys=None,
         homepage="https://dki-lab.github.io/GrailQA/",
         citation=_CITATION,
     )
コード例 #20
0
ファイル: wiki_auto.py プロジェクト: Priyansh2/nlp
 def _info(self):
     """Per-config schema: hand-annotated sentence pairs (``manual``),
     plain aligned pairs (``auto_acl``), or full article pairs with
     paragraph/sentence alignments (all other configs)."""
     if self.config.name == "manual":
         features = datasets.Features({
             "alignment_label": datasets.ClassLabel(names=["notAligned", "aligned"]),
             "normal_sentence_id": datasets.Value("string"),
             "simple_sentence_id": datasets.Value("string"),
             "normal_sentence": datasets.Value("string"),
             "simple_sentence": datasets.Value("string"),
         })
     elif self.config.name == "auto_acl":
         features = datasets.Features({
             "normal_sentence": datasets.Value("string"),
             "simple_sentence": datasets.Value("string"),
         })
     else:
         features = datasets.Features({
             "example_id": datasets.Value("string"),
             "normal": {
                 "normal_article_id": datasets.Value("int32"),
                 "normal_article_title": datasets.Value("string"),
                 "normal_article_url": datasets.Value("string"),
                 "normal_article_content": datasets.Sequence({
                     "normal_sentence_id": datasets.Value("string"),
                     "normal_sentence": datasets.Value("string"),
                 }),
             },
             "simple": {
                 "simple_article_id": datasets.Value("int32"),
                 "simple_article_title": datasets.Value("string"),
                 "simple_article_url": datasets.Value("string"),
                 "simple_article_content": datasets.Sequence({
                     "simple_sentence_id": datasets.Value("string"),
                     "simple_sentence": datasets.Value("string"),
                 }),
             },
             "paragraph_alignment": datasets.Sequence({
                 "normal_paragraph_id": datasets.Value("string"),
                 "simple_paragraph_id": datasets.Value("string"),
             }),
             "sentence_alignment": datasets.Sequence({
                 "normal_sentence_id": datasets.Value("string"),
                 "simple_sentence_id": datasets.Value("string"),
             }),
         })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage="https://github.com/chaojiang06/wiki-auto",
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #21
0
    def _info(self):
        """Token/NER schema: the label set is ``O`` plus a B-/I- pair per
        entity type. Integer class ids follow the order below, so it must
        not change."""
        entity_types = [
            "PERSON", "ORG", "GPE", "LOC", "NAT_REL_POL", "EVENT",
            "LANGUAGE", "WORK_OF_ART", "DATETIME", "PERIOD", "MONEY",
            "QUANTITY", "NUMERIC", "ORDINAL", "FACILITY",
        ]
        tag_names = ["O"]
        for entity in entity_types:
            tag_names.extend(("B-" + entity, "I-" + entity))
        features = datasets.Features({
            "id": datasets.Value("int32"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_ids": datasets.Sequence(datasets.Value("int32")),
            "space_after": datasets.Sequence(datasets.Value("bool")),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=tag_names)),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # No canonical (input, target) pair for this schema.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
コード例 #22
0
ファイル: imppres.py プロジェクト: Priyansh2/nlp
 def _info(self):
     """Presupposition configs carry trigger metadata and one gold label;
     the remaining (implicature) configs carry separate logical and
     pragmatic labels instead."""
     def nli_label():
         # Fresh ClassLabel per column; the standard three-way NLI label set.
         return datasets.ClassLabel(
             names=["entailment", "neutral", "contradiction"])

     if "presupposition" in self.config.name:
         features = datasets.Features({
             "premise": datasets.Value("string"),
             "hypothesis": datasets.Value("string"),
             "trigger": datasets.Value("string"),
             "trigger1": datasets.Value("string"),
             "trigger2": datasets.Value("string"),
             "presupposition": datasets.Value("string"),
             "gold_label": nli_label(),
             "UID": datasets.Value("string"),
             "pairID": datasets.Value("string"),
             "paradigmID": datasets.Value("int16"),
         })
     else:
         features = datasets.Features({
             "premise": datasets.Value("string"),
             "hypothesis": datasets.Value("string"),
             "gold_label_log": nli_label(),
             "gold_label_prag": nli_label(),
             "spec_relation": datasets.Value("string"),
             "item_type": datasets.Value("string"),
             "trigger": datasets.Value("string"),
             "lexemes": datasets.Value("string"),
         })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #23
0
"""
USAGE:
``python extracting_data.py -i <img_dir> -o <dataset_file>.datasets <batch_size>``
"""


# Manual switch for a small smoke-test run of the extraction script.
TEST = False
# Faster-RCNN config (presumably Visual-Genome finetuned, per the model id —
# TODO confirm); MAX_DETECTIONS bounds the per-image region count and sizes
# the fixed-length columns below.
CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
# Arrow schema for the extracted visual features. OrderedDict keeps column
# order stable; each row holds MAX_DETECTIONS detections per image.
DEFAULT_SCHEMA = datasets.Features(
    OrderedDict(
        {
            "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")),
            "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")),
            "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"),
            "img_id": datasets.Value("int32"),
            "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")),
            "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")),
            "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"),
            "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")),
            "preds_per_image": datasets.Value(dtype="int32"),
        }
    )
)


class Extract:
    def __init__(self, argv=sys.argv[1:]):
        inputdir = None
        outputfile = None
        subset_list = None
        batch_size = 1
コード例 #24
0
 def _info(self):
     """Schema with raw/annotated text plus per-token POS tags and a small
     NER tag set. Tag order fixes the integer class ids — do not reorder."""
     pos_names = [
         "SO", "SS", "VV", "XR", "VCP", "JC", "VCN", "JKB", "MM", "SP",
         "XSN", "SL", "NNP", "NP", "EP", "JKQ", "IC", "XSA", "EC", "EF",
         "SE", "XPN", "ETN", "SH", "XSV", "MAG", "SW", "ETM", "JKO",
         "NNB", "MAJ", "NNG", "JKV", "JKC", "VA", "NR", "JKG", "VX",
         "SF", "JX", "JKS", "SN",
     ]
     ner_names = ["I", "O", "B_OG", "B_TI", "B_LC", "B_DT", "B_PS"]
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "text": datasets.Value("string"),
             "annot_text": datasets.Value("string"),
             "tokens": datasets.Sequence(datasets.Value("string")),
             "pos_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=pos_names)),
             "ner_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=ner_names)),
         }),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #25
0
 def _info(self):
     """Tokens with Penn-Treebank-style POS tags. The raw data's position
     and sid columns were deliberately left out of the schema (they were
     commented out in the original)."""
     ptb_tag_names = [
         '"', "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD",
         "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR", "JJS", "LS",
         "-LRB-", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT",
         "POS", "PRP", "PRP$", "-RRB-", "RB", "RBR", "RBS", "RP", "SYM",
         "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
         "WP", "WP$", "WRB",
     ]
     return datasets.DatasetInfo(
         features=datasets.Features({
             "tokens": datasets.Sequence(datasets.Value("string")),
             "pos_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=ptb_tag_names)),
         }),
         supervised_keys=None,
     )
コード例 #26
0
class Superb(datasets.GeneratorBasedBuilder):
    """SUPERB: Speech processing Universal PERformance Benchmark.

    Each builder configuration corresponds to one downstream task — ASR,
    keyword spotting (ks), intent classification (ic), speaker
    identification (si), speaker diarization (sd) and emotion recognition
    (er) — and declares its own features, download source and splits.
    """

    BUILDER_CONFIGS = [
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
        ),
        SuperbConfig(
            name="ks",
            description=textwrap.dedent(
                """\
            Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of
            words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and
            inference time are all crucial. SUPERB uses the widely used Speech Commands dataset v1.0 for the task.
            The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the
            false positive. The evaluation metric is accuracy (ACC)"""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "label": datasets.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.tensorflow.org/datasets/catalog/speech_commands",
            # Template URL: `_split_generators` fills in `{filename}` via
            # str.format (the placeholder had been lost, which made the
            # format() call a no-op and the download 404).
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        SuperbConfig(
            name="ic",
            description=textwrap.dedent(
                """\
            Intent Classification (IC) classifies utterances into predefined classes to determine the intent of
            speakers. SUPERB uses the Fluent Speech Commands dataset, where each utterance is tagged with three intent
            labels: action, object, and location. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "speaker_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "action": datasets.ClassLabel(
                        names=["activate", "bring", "change language", "deactivate", "decrease", "increase"]
                    ),
                    "object": datasets.ClassLabel(
                        names=[
                            "Chinese",
                            "English",
                            "German",
                            "Korean",
                            "heat",
                            "juice",
                            "lamp",
                            "lights",
                            "music",
                            "newspaper",
                            "none",
                            "shoes",
                            "socks",
                            "volume",
                        ]
                    ),
                    "location": datasets.ClassLabel(names=["bedroom", "kitchen", "none", "washroom"]),
                }
            ),
            supervised_keys=None,
            url="https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/",
            data_url="http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
        ),
        SuperbConfig(
            name="si",
            description=textwrap.dedent(
                """\
            Speaker Identification (SI) classifies each utterance for its speaker identity as a multi-class
            classification, where speakers are in the same predefined set for both training and testing. The widely
            used VoxCeleb1 dataset is adopted, and the evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    # VoxCeleb1 contains 1251 speaker IDs in range ["id10001",..."id11251"]
                    "label": datasets.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html",
        ),
        SuperbConfig(
            name="sd",
            description=textwrap.dedent(
                """\
            Speaker Diarization (SD) predicts `who is speaking when` for each timestamp, and multiple speakers can
            speak simultaneously. The model has to encode rich speaker characteristics for each frame and should be
            able to represent mixtures of signals. [LibriMix] is adopted where LibriSpeech
            train-clean-100/dev-clean/test-clean are used to generate mixtures for training/validation/testing.
            We focus on the two-speaker scenario as the first step. The time-coded speaker labels were generated using
            alignments from Kaldi LibriSpeech ASR model. The evaluation metric is diarization error rate (DER)."""
            ),
            features=datasets.Features(
                {
                    "record_id": datasets.Value("string"),
                    "file": datasets.Value("string"),
                    "start": datasets.Value("int64"),
                    "end": datasets.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": datasets.Value("string"),
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                        }
                    ],
                }
            ),  # TODO
            supervised_keys=None,  # TODO
            url="https://github.com/ftshijt/LibriMix",
            # Template URL: `_split_generators` substitutes both `{split}` and
            # `{filename}` (the `{filename}` placeholder had been lost).
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
        SuperbConfig(
            name="er",
            description=textwrap.dedent(
                """\
            Emotion Recognition (ER) predicts an emotion class for each utterance. The most widely used ER dataset
            IEMOCAP is adopted, and we follow the conventional evaluation protocol: we drop the unbalanced emotion
            classes to leave the final four classes with a similar amount of data points and cross-validate on five
            folds of the standard splits. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "label": datasets.ClassLabel(names=["neu", "hap", "ang", "sad"]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://sail.usc.edu/iemocap/",
        ),
    ]

    @property
    def manual_download_instructions(self):
        """Return manual-download help for configs that cannot be fetched automatically.

        Only "si" (VoxCeleb1) and "er" (IEMOCAP, behind a request form)
        require a manual download; every other config returns None.
        """
        if self.config.name == "si":
            return textwrap.dedent(
                """
            Please download the VoxCeleb dataset using the following script,
            which should create `VoxCeleb1/wav/id*` directories for both train and test speakers`:
            ```
            mkdir VoxCeleb1
            cd VoxCeleb1

            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac
            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad
            cat vox1_dev* > vox1_dev_wav.zip
            unzip vox1_dev_wav.zip

            wget https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip
            unzip vox1_test_wav.zip

            # download the official SUPERB train-dev-test split
            wget https://raw.githubusercontent.com/s3prl/s3prl/master/s3prl/downstream/voxceleb1/veri_test_class.txt
            ```"""
            )
        elif self.config.name == "er":
            return textwrap.dedent(
                """
            Please download the IEMOCAP dataset after submitting the request form here:
            https://sail.usc.edu/iemocap/iemocap_release.htm
            Having downloaded the dataset you can extract it with `tar -xvzf IEMOCAP_full_release.tar.gz`
            which should create a folder called `IEMOCAP_full_release`
            """
            )
        return None

    def _info(self):
        """Assemble the DatasetInfo from the selected config's declarations."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=self.config.features,
            supervised_keys=self.config.supervised_keys,
            homepage=self.config.url,
            citation=_CITATION,
            task_templates=self.config.task_templates,
        )

    def _split_generators(self, dl_manager):
        """Download/locate the data for the active config and declare its splits."""
        if self.config.name == "asr":
            _DL_URLS = {
                "dev": self.config.data_url + "dev-clean.tar.gz",
                "test": self.config.data_url + "test-clean.tar.gz",
                "train": self.config.data_url + "train-clean-100.tar.gz",
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)

            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"archive_path": archive_path["train"]}),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}),
            ]
        elif self.config.name == "ks":
            # Speech Commands ships train+val+test in one archive plus a
            # separate canonical test set; the split files are resolved later
            # by `_split_ks_files`.
            _DL_URLS = {
                "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"),
                "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"),
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "val"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"], "split": "test"}
                ),
            ]
        elif self.config.name == "ic":
            archive_path = dl_manager.download_and_extract(self.config.data_url)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path, "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path, "split": "valid"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path, "split": "test"}
                ),
            ]
        elif self.config.name == "si":
            # Manually downloaded VoxCeleb1; split ids 1/2/3 come from
            # veri_test_class.txt (see `_generate_examples`).
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": manual_dir, "split": 1},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": manual_dir, "split": 2},
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": manual_dir, "split": 3}),
            ]
        elif self.config.name == "sd":
            splits = ["train", "dev", "test"]
            _DL_URLS = {
                split: {
                    filename: self.config.data_url.format(split=split, filename=filename)
                    for filename in ["reco2dur", "segments", "utt2spk", "wav.zip"]
                }
                for split in splits
            }
            archive_path = dl_manager.download_and_extract(_DL_URLS)
            return [
                datasets.SplitGenerator(
                    name=datasets.NamedSplit(split), gen_kwargs={"archive_path": archive_path[split], "split": split}
                )
                for split in splits
            ]
        elif self.config.name == "er":
            # IEMOCAP is organized as five sessions; each becomes a split for
            # cross-validation.
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            return [
                datasets.SplitGenerator(
                    name=f"session{i}",
                    gen_kwargs={"archive_path": manual_dir, "split": i},
                )
                for i in range(1, 6)
            ]

    def _generate_examples(self, archive_path, split=None):
        """Generate (key, example) pairs for the active config.

        Args:
            archive_path: Root of the downloaded/extracted (or manual) data.
                For "asr"/"ks"/"sd" this is per-split; for "ic"/"si"/"er" it
                is shared and `split` selects the subset.
            split: Split selector; its type depends on the config (str for
                "ks"/"ic"/"sd", int for "si"/"er", unused for "asr").
        """
        if self.config.name == "asr":
            transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
            key = 0
            for transcript_path in sorted(glob.glob(transcripts_glob)):
                transcript_dir_path = os.path.dirname(transcript_path)
                with open(transcript_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        id_, transcript = line.split(" ", 1)
                        audio_file = f"{id_}.flac"
                        # LibriSpeech ids look like "speaker-chapter-utterance".
                        speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                        yield key, {
                            "id": id_,
                            "speaker_id": speaker_id,
                            "chapter_id": chapter_id,
                            "file": os.path.join(transcript_dir_path, audio_file),
                            "text": transcript,
                        }
                        key += 1
        elif self.config.name == "ks":
            words = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]
            splits = _split_ks_files(archive_path, split)
            for key, audio_file in enumerate(sorted(splits[split])):
                base_dir, file_name = os.path.split(audio_file)
                _, word = os.path.split(base_dir)
                # The parent directory name is the keyword; anything outside
                # the ten keywords collapses to _silence_/_unknown_.
                if word in words:
                    label = word
                elif word == "_silence_" or word == "_background_noise_":
                    label = "_silence_"
                else:
                    label = "_unknown_"
                yield key, {"file": audio_file, "label": label}
        elif self.config.name == "ic":
            root_path = os.path.join(archive_path, "fluent_speech_commands_dataset/")
            csv_path = os.path.join(root_path, f"data/{split}_data.csv")
            with open(csv_path, encoding="utf-8") as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=",", skipinitialspace=True)
                next(csv_reader)  # skip the header row
                for row in csv_reader:
                    key, file_path, speaker_id, text, action, object_, location = row
                    yield key, {
                        "file": os.path.join(root_path, file_path),
                        "speaker_id": speaker_id,
                        "text": text,
                        "action": action,
                        "object": object_,
                        "location": location,
                    }
        elif self.config.name == "si":
            wav_path = os.path.join(archive_path, "wav/")
            splits_path = os.path.join(archive_path, "veri_test_class.txt")
            with open(splits_path, "r", encoding="utf-8") as f:
                for key, line in enumerate(f):
                    # Each line: "<split_id> <speaker_id>/<...>.wav"
                    split_id, file_path = line.strip().split(" ")
                    if int(split_id) != split:
                        continue
                    speaker_id = file_path.split("/")[0]
                    yield key, {
                        "file": os.path.join(wav_path, file_path),
                        "label": speaker_id,
                    }
        elif self.config.name == "sd":
            data = SdData(archive_path)
            args = SdArgs()
            chunk_indices = _generate_chunk_indices(data, args, split=split)
            if split != "test":
                for key, (rec, st, ed) in enumerate(chunk_indices):
                    speakers = _get_speakers(rec, data, args)
                    yield key, {
                        "record_id": rec,
                        "file": data.wavs[rec],
                        "start": st,
                        "end": ed,
                        "speakers": speakers,
                    }
            else:
                # For the test split, chunk indices are grouped per recording.
                key = 0
                for rec in chunk_indices:
                    # Use a distinct name for the inner record id to avoid
                    # shadowing the dict key being iterated.
                    for chunk_rec, st, ed in chunk_indices[rec]:
                        speakers = _get_speakers(chunk_rec, data, args)
                        yield key, {
                            "record_id": chunk_rec,
                            "file": data.wavs[chunk_rec],
                            "start": st,
                            "end": ed,
                            "speakers": speakers,
                        }
                        key += 1
        elif self.config.name == "er":
            root_path = os.path.join(archive_path, f"Session{split}/")
            wav_path = os.path.join(root_path, "sentences/wav/")
            labels_path = os.path.join(root_path, "dialog/EmoEvaluation/*.txt")
            # "exc" (excited) is merged into "hap" below, per the standard
            # four-class IEMOCAP protocol.
            emotions = ["neu", "hap", "ang", "sad", "exc"]
            key = 0
            for labels_file in sorted(glob.glob(labels_path)):
                with open(labels_file, "r", encoding="utf-8") as f:
                    for line in f:
                        # Annotation lines start with a "[start - end]" stamp.
                        if line[0] != "[":
                            continue
                        _, filename, emo, _ = line.split("\t")
                        if emo not in emotions:
                            continue
                        wav_subdir = filename.rsplit("_", 1)[0]
                        # The f-string placeholder had been lost, which made
                        # every example point at the same literal file name.
                        filename = f"{filename}.wav"
                        yield key, {
                            "file": os.path.join(wav_path, wav_subdir, filename),
                            "label": emo.replace("exc", "hap"),
                        }
                        key += 1
コード例 #27
0
 def _info(self):
     """Build the DatasetInfo matching the selected configuration.

     The "experiments" mode exposes flat (question, candidate, label)
     triples; otherwise the shape depends on ``config.type_``:
     "answer_selection" keeps candidates inline, while the alternative
     groups them per article in ``candidate_list``.
     """
     # Label inventories shared by the two non-experiment layouts.
     topic_names = [
         "MUSIC", "TV", "TRAVEL", "ART", "SPORT",
         "COUNTRY", "MOVIES", "HISTORICAL EVENTS", "SCIENCE", "FOOD",
     ]
     q_type_names = ["what", "why", "when", "who", "where", "how", ""]

     if self.config.mode == "experiments":
         features = datasets.Features(
             {
                 "question": datasets.Value("string"),
                 "candidate": datasets.Value("string"),
                 "label": datasets.ClassLabel(names=["0", "1"]),
             }
         )
     elif self.config.type_ == "answer_selection":
         features = datasets.Features(
             {
                 "section": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "article": datasets.Value("string"),
                 "is_paraphrase": datasets.Value("bool"),
                 "topic": datasets.ClassLabel(names=topic_names),
                 "answers": datasets.Sequence(datasets.Value("int32")),
                 "candidates": datasets.Sequence(datasets.Value("string")),
                 "q_types": datasets.Sequence(datasets.ClassLabel(names=q_type_names)),
             }
         )
     else:
         features = datasets.Features(
             {
                 "section": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "article": datasets.Value("string"),
                 "is_paraphrase": datasets.Value("bool"),
                 "topic": datasets.ClassLabel(names=topic_names),
                 "q_types": datasets.Sequence(datasets.ClassLabel(names=q_type_names)),
                 "candidate_list": datasets.Sequence(
                     {
                         "article": datasets.Value("string"),
                         "section": datasets.Value("string"),
                         "candidates": datasets.Sequence(datasets.Value("string")),
                         "answers": datasets.Sequence(datasets.Value("int32")),
                     }
                 ),
             }
         )

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
コード例 #28
0
ファイル: swda.py プロジェクト: ddhruvkr/datasets-1
    def _info(self):
        """Return the DatasetInfo describing one SwDA utterance record.

        Most columns are plain strings/ints copied from the corpus; the two
        dialogue-act columns are ClassLabels over the tag inventories
        (_ACT_TAGS, _DAMSL_ACT_TAGS) defined at module level.
        """
        features = datasets.Features(
            {
                "swda_filename": datasets.Value("string"),
                "ptb_basename": datasets.Value("string"),
                "conversation_no": datasets.Value("int64"),
                "transcript_index": datasets.Value("int64"),
                "act_tag": datasets.ClassLabel(num_classes=217, names=_ACT_TAGS),
                "damsl_act_tag": datasets.ClassLabel(num_classes=43, names=_DAMSL_ACT_TAGS),
                "caller": datasets.Value("string"),
                "utterance_index": datasets.Value("int64"),
                "subutterance_index": datasets.Value("int64"),
                "text": datasets.Value("string"),
                "pos": datasets.Value("string"),
                "trees": datasets.Value("string"),
                "ptb_treenumbers": datasets.Value("string"),
                "talk_day": datasets.Value("string"),
                "length": datasets.Value("int64"),
                "topic_description": datasets.Value("string"),
                "prompt": datasets.Value("string"),
                "from_caller": datasets.Value("int64"),
                "from_caller_sex": datasets.Value("string"),
                "from_caller_education": datasets.Value("int64"),
                "from_caller_birth_year": datasets.Value("int64"),
                "from_caller_dialect_area": datasets.Value("string"),
                "to_caller": datasets.Value("int64"),
                "to_caller_sex": datasets.Value("string"),
                "to_caller_education": datasets.Value("int64"),
                "to_caller_birth_year": datasets.Value("int64"),
                "to_caller_dialect_area": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
コード例 #29
0
    def _info(self):
        """Return the DatasetInfo for the selected configuration.

        Two configurations are supported:

        * ``dialogue_domain`` — dialogues as a list of annotated turns, each
          with a dialogue act and references into the grounding document.
        * ``document_domain`` — grounding documents with per-span structural
          annotations plus the raw/timestamped HTML.

        Raises:
            ValueError: if ``self.config.name`` is neither of the two
                supported names.  (Previously an unknown name left
                ``features`` unbound and crashed with UnboundLocalError.)
        """
        if self.config.name == "dialogue_domain":
            features = datasets.Features(
                {
                    "dial_id": datasets.Value("string"),
                    "doc_id": datasets.Value("string"),
                    "domain": datasets.Value("string"),
                    "turns": [
                        {
                            "turn_id": datasets.Value("int32"),
                            "role": datasets.Value("string"),
                            "da": datasets.Value("string"),
                            "reference": [
                                {
                                    "keys": datasets.Value("string"),
                                    "values": datasets.Value("string"),
                                }
                            ],
                            "utterance": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.name == "document_domain":
            features = datasets.Features(
                {
                    "domain": datasets.Value("string"),
                    "doc_id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "doc_text": datasets.Value("string"),
                    "spans": [
                        {
                            "id_sp": datasets.Value("string"),
                            "tag": datasets.Value("string"),
                            "start_sp": datasets.Value("int32"),
                            "end_sp": datasets.Value("int32"),
                            "text_sp": datasets.Value("string"),
                            "title": datasets.Value("string"),
                            "parent_titles": datasets.Value("string"),
                            "id_sec": datasets.Value("string"),
                            "start_sec": datasets.Value("int32"),
                            "text_sec": datasets.Value("string"),
                            "end_sec": datasets.Value("int32"),
                        }
                    ],
                    "doc_html_ts": datasets.Value("string"),
                    "doc_html_raw": datasets.Value("string"),
                }
            )
        else:
            # Fail fast with a clear message instead of falling through to an
            # UnboundLocalError on `features` below.
            raise ValueError(f"Unsupported configuration name: {self.config.name!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )
コード例 #30
0
ファイル: loading_ud.py プロジェクト: andybi7676/rnn_typology
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "id": datasets.Value("string"),
                 "tokens": datasets.Sequence(datasets.Value("string")),
                 "pos_tags": datasets.Sequence(
                     datasets.features.ClassLabel(
                         names=['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']
                     )
                 ),
                 "dependency_tags": datasets.Sequence(
                     datasets.features.ClassLabel(
                         names=[
                             'acl',
                             'acl:relcl',
                             'advcl',
                             'advcl:cleft', 
                             'advmod',
                             'advmod:emph',
                             'advmod:lmod',
                             'amod',
                             'appos',
                             'aux',
                             'aux:aspect', 
                             'aux:pass',
                             'aux:q', 
                             'aux:tense', 
                             'case',
                             'case:dec', 
                             'case:pref',
                             'case:suff',  
                             'cc',
                             'cc:preconj',
                             'ccomp',
                             'ccomp:agent', 
                             'ccomp:obj', 
                             'clf',
                             'compound',
                             'compound:lvc',
                             'compound:prt',
                             'compound:redup',
                             'compound:svc',
                             'conj',
                             'cop',
                             'csubj',
                             'csubj:cop', 
                             'csubj:pass',
                             'dep',
                             'dep:comp', 
                             'det',
                             'det:numgov',
                             'det:nummod',
                             'det:poss',
                             'discourse',
                             'dislocated',
                             'det:predet',
                             'expl',
                             'expl:impers',
                             'expl:pass',
                             'expl:pv',
                             'expl:subj', 
                             'fixed',
                             'flat',
                             'flat:foreign',
                             'flat:name',
                             'goeswith',
                             'iobj',
                             'list',
                             'mark',
                             'mark:advb',
                             'mark:comp', 
                             'mark:relcl', 
                             'nmod',
                             'nmod:comp', 
                             'nmod:part', 
                             'nmod:poss',
                             'nmod:tmod',
                             'nmod:npmod', 
                             'nsubj',
                             'nsubj:cop', 
                             'nsubj:pass',
                             'nummod',
                             'nummod:gov',
                             'obj',
                             'obj:lvc', 
                             'obl',
                             'obl:agent',
                             'obl:arg',
                             'obl:lmod',
                             'obl:mod', 
                             'obl:loc', 
                             'obl:tmod',
                             'obl:npmod', 
                             'obl:patient', 
                             'orphan',
                             'parataxis',
                             'punct',
                             'reparandum',
                             'root',
                             'vocative',
                             'xcomp',
                             'xcomp:obj', 
                             'xcomp:obl', 
                         ] + ['acl:appos', 'acl:inf', 'acl:part', 'advcl:arg', 'advcl:cond', 'advmod:cc', 'amod:advmod', 'aux:caus', 'aux:neg', 'case:voc', 'ccomp:obl', 'ccomp:pred', 'compound:conjv', 'compound:nv', 'compound:plur', 'conj:expl', 'csubj:cleft', 'iobj:agent', 'iobj:loc', 'mark:prt', 'nmod:advmod', 'nmod:appos', 'nsubj:caus', 'nsubj:nc', 'obj:agent', 'obl:abl', 'obl:ben', 'obl:cmpr', 'obl:inst', 'obl:pmod', 'obl:prep', 'obl:soc', 'xcomp:adj', 'xcomp:pred']
                     )
                 ),
                 "lang": datasets.Sequence(
                     datasets.features.ClassLabel(
                         names=list(testing_path.keys())
                     )
                 ),
             }
         ),
         supervised_keys=None,
         homepage="https://www.aclweb.org/anthology/W03-0419/",
         citation=_CITATION,
     )