Example #1
    def _info(self):
        features = datasets.Features({
            "text":
            datasets.Value("string"),
            "sentence_offsets":
            datasets.features.Sequence({
                "begin_char_offset":
                datasets.Value("int64"),
                "end_char_offset":
                datasets.Value("int64")
            }),
            "sentences":
            datasets.features.Sequence(datasets.Value("string")),
            "sentence_labels":
            datasets.features.Sequence(datasets.Value("int64")),
            "token_offsets":
            datasets.features.Sequence({
                "offsets":
                datasets.features.Sequence({
                    "begin_char_offset":
                    datasets.Value("int64"),
                    "end_char_offset":
                    datasets.Value("int64")
                })
            }),
            "tokens":
            datasets.features.Sequence(
                datasets.features.Sequence(datasets.Value("string"))),
            "entity_labels":
            datasets.features.Sequence(
                datasets.features.Sequence(
                    datasets.features.ClassLabel(names=[
                        "B-DEVICE",
                        "B-EXPERIMENT",
                        "B-MATERIAL",
                        "B-VALUE",
                        "I-DEVICE",
                        "I-EXPERIMENT",
                        "I-MATERIAL",
                        "I-VALUE",
                        "O",
                    ]))),
            "slot_labels":
            datasets.features.Sequence(
                datasets.features.Sequence(
                    datasets.features.ClassLabel(names=[
                        "B-anode_material",
                        "B-cathode_material",
                        "B-conductivity",
                        "B-current_density",
                        "B-degradation_rate",
                        "B-device",
                        "B-electrolyte_material",
                        "B-experiment_evoking_word",
                        "B-fuel_used",
                        "B-interlayer_material",
                        "B-interconnect_material",
                        "B-open_circuit_voltage",
                        "B-power_density",
                        "B-resistance",
                        "B-support_material",
                        "B-thickness",
                        "B-time_of_operation",
                        "B-voltage",
                        "B-working_temperature",
                        "I-anode_material",
                        "I-cathode_material",
                        "I-conductivity",
                        "I-current_density",
                        "I-degradation_rate",
                        "I-device",
                        "I-electrolyte_material",
                        "I-experiment_evoking_word",
                        "I-fuel_used",
                        "I-interlayer_material",
                        "I-interconnect_material",
                        "I-open_circuit_voltage",
                        "I-power_density",
                        "I-resistance",
                        "I-support_material",
                        "I-thickness",
                        "I-time_of_operation",
                        "I-voltage",
                        "I-working_temperature",
                        "O",
                    ]))),
            "links":
            datasets.Sequence({
                "relation_label":
                datasets.features.ClassLabel(names=[
                    "coreference", "experiment_variation", "same_experiment",
                    "thickness"
                ]),
                "start_span_id":
                datasets.Value("int64"),
                "end_span_id":
                datasets.Value("int64"),
            }),
            "slots":
            datasets.features.Sequence({
                "frame_participant_label":
                datasets.features.ClassLabel(names=[
                    "anode_material",
                    "cathode_material",
                    "current_density",
                    "degradation_rate",
                    "device",
                    "electrolyte_material",
                    "fuel_used",
                    "interlayer_material",
                    "open_circuit_voltage",
                    "power_density",
                    "resistance",
                    "support_material",
                    "time_of_operation",
                    "voltage",
                    "working_temperature",
                ]),
                "slot_id":
                datasets.Value("int64"),
            }),
            "spans":
            datasets.features.Sequence({
                "span_id":
                datasets.Value("int64"),
                "entity_label":
                datasets.features.ClassLabel(
                    names=["", "DEVICE", "MATERIAL", "VALUE"]),
                "sentence_id":
                datasets.Value("int64"),
                "experiment_mention_type":
                datasets.features.ClassLabel(names=[
                    "", "current_exp", "future_work", "general_info",
                    "previous_work"
                ]),
                "begin_char_offset":
                datasets.Value("int64"),
                "end_char_offset":
                datasets.Value("int64"),
            }),
            "experiments":
            datasets.features.Sequence({
                "experiment_id":
                datasets.Value("int64"),
                "span_id":
                datasets.Value("int64"),
                "slots":
                datasets.features.Sequence({
                    "frame_participant_label":
                    datasets.features.ClassLabel(names=[
                        "anode_material",
                        "cathode_material",
                        "current_density",
                        "degradation_rate",
                        "conductivity",
                        "device",
                        "electrolyte_material",
                        "fuel_used",
                        "interlayer_material",
                        "open_circuit_voltage",
                        "power_density",
                        "resistance",
                        "support_material",
                        "time_of_operation",
                        "voltage",
                        "working_temperature",
                    ]),
                    "slot_id":
                    datasets.Value("int64"),
                }),
            }),
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
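
The _info() above only declares the schema. In a GeneratorBasedBuilder it is paired with _split_generators() and _generate_examples(); the sketch below shows that minimal skeleton, where the URL, file name, and field handling are placeholders rather than the actual loader for this dataset.

import json
import os

import datasets


class MyDatasetBuilder(datasets.GeneratorBasedBuilder):
    """Minimal builder sketch; _info() would return the Features defined above."""

    def _split_generators(self, dl_manager):
        # Download and extract the data, then hand one file per split to _generate_examples.
        data_dir = dl_manager.download_and_extract("https://example.com/data.zip")  # placeholder URL
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(data_dir, "train.jsonl")},  # placeholder file name
            ),
        ]

    def _generate_examples(self, filepath):
        # Yield (key, example) pairs whose fields match the declared Features.
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, json.loads(line)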
Example #2
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "constituent_dataset":
             datasets.Value("string"),
             "id":
             datasets.Value("string"),
             "context":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "reference":
             datasets.Value("string"),
             "candidate":
             datasets.Value("string"),
             "score":
             datasets.Value("float"),
             "metadata": {
                 "scores":
                 datasets.features.Sequence(datasets.Value("int32")),
                 "source": datasets.Value("string"),
             },
             # features for minimal pairs
             "candidate2":
             datasets.Value("string"),
             "score2":
             datasets.Value("float"),
         }),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #3
def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "pandas",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "torch",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "tensorflow",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]

    functions_shuffled = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({
            "list":
            datasets.Sequence(datasets.Value("float32")),
            "numbers":
            datasets.Value("float32")
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100, )},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
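
The helpers read, read_batch, read_formatted and read_formatted_batch are not shown in this excerpt. A plausible sketch follows, assuming each one iterates over the dataset and returns its wall-clock duration via a timing decorator (the get_duration name is an assumption); read_formatted_batch would follow the same pattern with sliced access.

import time

import datasets


def get_duration(func):
    # Assumed timing decorator: runs the function and returns its wall-clock duration.
    def wrapper(*args, **kwargs):
        start = time.time()
        func(*args, **kwargs)
        return time.time() - start

    wrapper.__name__ = func.__name__
    return wrapper


@get_duration
def read(dataset: datasets.Dataset, length):
    # Row-by-row access.
    for i in range(length):
        _ = dataset[i]


@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
    # Sliced access, batch_size rows at a time.
    for i in range(0, length, batch_size):
        _ = dataset[i : i + batch_size]


@get_duration
def read_formatted(dataset: datasets.Dataset, length, type):
    # Same row-by-row access, but with an output format (numpy, pandas, torch, ...) applied.
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]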
Example #4
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id":
             datasets.Value("string"),
             "context_id":
             datasets.Value("string"),
             "question_id":
             datasets.Value("string"),
             "domain":
             datasets.Value("string"),
             "metadata": {
                 "author": datasets.Value("string"),
                 "title": datasets.Value("string"),
                 "url": datasets.Value("string"),
             },
             "context":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "question_type":
             datasets.Value("string"),
             "answers":
             datasets.features.Sequence(datasets.Value("string"), ),
             "correct_answer_id":
             datasets.Value("int32"),
         }),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage="https://text-machine-lab.github.io/blog/2020/quail/",
         citation=_CITATION,
     )
Example #5
    def _info(self):
        features = datasets.Features({
            "id":
            datasets.Value("string"),
            "text":
            datasets.Value("string"),
            "tokens":
            datasets.Sequence(datasets.Value("string")),
            "nps": [{
                "text": datasets.Value("string"),
                "first_char": datasets.Value("int32"),
                "last_char": datasets.Value("int32"),
                "first_token": datasets.Value("int32"),
                "last_token": datasets.Value("int32"),
                "id": datasets.Value("string"),
            }],
            "np_relations": [{
                "anchor":
                datasets.Value("string"),
                "complement":
                datasets.Value("string"),
                "preposition":
                datasets.features.ClassLabel(names=[
                    "about",
                    "for",
                    "with",
                    "from",
                    "among",
                    "by",
                    "on",
                    "at",
                    "during",
                    "of",
                    "member(s) of",
                    "in",
                    "after",
                    "under",
                    "to",
                    "into",
                    "before",
                    "near",
                    "outside",
                    "around",
                    "between",
                    "against",
                    "over",
                    "inside",
                ]),
                "complement_coref_cluster_id":
                datasets.Value("string"),
            }],
            "coref": [{
                "id":
                datasets.Value("string"),
                "members":
                datasets.Sequence(datasets.Value("string")),
                "np_type":
                datasets.features.ClassLabel(names=[
                    "standard",
                    "time/date/measurement",
                    "idiomatic",
                ]),
            }],
            "metadata": {
                "annotators": {
                    "coref_worker":
                    datasets.Value("int32"),
                    "consolidator_worker":
                    datasets.Value("int32"),
                    "np-relations_worker":
                    datasets.Sequence(datasets.Value("int32")),
                },
                "url": datasets.Value("string"),
                "source": datasets.Value("string"),
            },
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and
            # specify them. They'll be used if as_supervised=True in builder.as_dataset.
            # supervised_keys=("sentence", "label"),
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
Example #6
 def _get_feature_types(self):
     if self.config_name == "record":
         return {
             "predictions": {
                 "idx": {
                     "passage": datasets.Value("int64"),
                     "query": datasets.Value("int64"),
                 },
                 "prediction_text": datasets.Value("string"),
             },
             "references": {
                 "idx": {
                     "passage": datasets.Value("int64"),
                     "query": datasets.Value("int64"),
                 },
                 "answers": datasets.Sequence(datasets.Value("string")),
             },
         }
     elif self.config_name == "multirc":
         return {
             "predictions": {
                 "idx": {
                     "answer": datasets.Value("int64"),
                     "paragraph": datasets.Value("int64"),
                     "question": datasets.Value("int64"),
                 },
                 "prediction": datasets.Value("int64"),
             },
             "references": datasets.Value("int64"),
         }
     else:
         return {
             "predictions": datasets.Value("int64"),
             "references": datasets.Value("int64"),
         }
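
These nested dicts describe the prediction/reference format the metric expects for each config. Below is a hedged usage sketch for the record config; it relies on the legacy datasets.load_metric API (newer versions moved metrics to the evaluate library), and the exact keys returned depend on the metric implementation.

import datasets

# ReCoRD predictions/references must follow the nested types this method returns for "record".
metric = datasets.load_metric("super_glue", "record")

predictions = [{"idx": {"passage": 0, "query": 0}, "prediction_text": "Paris"}]
references = [{"idx": {"passage": 0, "query": 0}, "answers": ["Paris", "paris"]}]

print(metric.compute(predictions=predictions, references=references))  # e.g. exact match and F1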
Example #7
    def _info(self):
        # TODO(xtreme): Specifies the datasets.DatasetInfo object
        features = {text_feature: datasets.Value("string") for text_feature in six.iterkeys(self.config.text_features)}
        if "answers" in features.keys():
            features["answers"] = datasets.features.Sequence(
                {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")}
            )
        if self.config.name.startswith("PAWS-X"):
            features["label"] = datasets.Value("string")
        if self.config.name == "XNLI":
            features["gold_label"] = datasets.Value("string")

        if self.config.name.startswith("udpos"):
            features = datasets.Features(
                {
                    "token": datasets.Value("string"),
                    "pos_tag": datasets.features.ClassLabel(
                        names=[
                            "ADJ",
                            "ADP",
                            "ADV",
                            "AUX",
                            "CCONJ",
                            "DET",
                            "INTJ",
                            "NOUN",
                            "NUM",
                            "PART",
                            "PRON",
                            "PROPN",
                            "PUNCT",
                            "SCONJ",
                            "SYM",
                            "VERB",
                            "X",
                        ]
                    ),
                }
            )

        if self.config.name.startswith("PAN-X"):
            features = datasets.Features(
                {
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "O",
                                "B-PER",
                                "I-PER",
                                "B-ORG",
                                "I-ORG",
                                "B-LOC",
                                "I-LOC",
                            ]
                        )
                    ),
                    "langs": datasets.Sequence(datasets.Value("string")),
                }
            )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=self.config.description + "\n" + _DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                features
                # These are the features of your dataset like images, labels ...
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
            citation=self.config.citation + "\n" + _CITATION,
        )
Example #8
 def _info(self):
     features = datasets.Features({
         "event":
         datasets.Value("string"),
         "oEffect":
         datasets.Sequence(datasets.Value("string")),
         "oReact":
         datasets.Sequence(datasets.Value("string")),
         "oWant":
         datasets.Sequence(datasets.Value("string")),
         "xAttr":
         datasets.Sequence(datasets.Value("string")),
         "xEffect":
         datasets.Sequence(datasets.Value("string")),
         "xIntent":
         datasets.Sequence(datasets.Value("string")),
         "xNeed":
         datasets.Sequence(datasets.Value("string")),
         "xReact":
         datasets.Sequence(datasets.Value("string")),
         "xWant":
         datasets.Sequence(datasets.Value("string")),
         "prefix":
         datasets.Sequence(datasets.Value("string")),
         "split":
         datasets.Value("string"),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #9
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "qid": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "answer": datasets.features.Sequence(
                     {
                         "answer_type": datasets.Value("string"),
                         "answer_argument": datasets.Value("string"),
                         "entity_name": datasets.Value("string"),
                     }
                 ),
                 "function": datasets.Value("string"),
                 "num_node": datasets.Value("int32"),
                 "num_edge": datasets.Value("int32"),
                 "graph_query": {
                     "nodes": datasets.features.Sequence(
                         {
                             "nid": datasets.Value("int32"),
                             "node_type": datasets.Value("string"),
                             "id": datasets.Value("string"),
                             "class": datasets.Value("string"),
                             "friendly_name": datasets.Value("string"),
                             "question_node": datasets.Value("int32"),
                             "function": datasets.Value("string"),
                         }
                     ),
                     "edges": datasets.features.Sequence(
                         {
                             "start": datasets.Value("int32"),
                             "end": datasets.Value("int32"),
                             "relation": datasets.Value("string"),
                             "friendly_name": datasets.Value("string"),
                         }
                     ),
                 },
                 "sparql_query": datasets.Value("string"),
                 "domains": datasets.features.Sequence(datasets.Value("string")),
                 "level": datasets.Value("string"),
                 "s_expression": datasets.Value("string"),
             }
         ),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage="https://dki-lab.github.io/GrailQA/",
         citation=_CITATION,
     )
Example #10
class CodeXGlueTcNLCodeSearchAdvImpl(CodeXGlueCtCodeToTextBaseImpl):
    LANGUAGE = "python"
    SINGLE_LANGUAGE = True

    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
        "docstring_summary": datasets.Value("string"),  # Summary of the docstring
        "parameters": datasets.Value("string"),  # parameters of the function
        "return_statement": datasets.Value("string"),  # return statement
        "argument_list": datasets.Value("string"),  # list of arguments of the function
        "identifier": datasets.Value("string"),  # identifier
        "nwo": datasets.Value("string"),  # nwo
        "score": datasets.Value("float"),  # score for this search
    }

    def post_process(self, split_name, language, js):
        for suffix in "_tokens", "":
            key = "function" + suffix
            if key in js:
                js["code" + suffix] = js[key]
                del js[key]

        for key in self._FEATURES:
            if key not in js:
                if key == "score":
                    js[key] = -1
                else:
                    js[key] = ""

        return js

    def generate_urls(self, split_name):
        for e in super().generate_urls(split_name, self.LANGUAGE):
            yield e

    def get_data_files(self, split_name, file_paths, language):
        if split_name == "train":
            return super().get_data_files(split_name, file_paths, language)
        else:
            data_set_path = file_paths["dataset"]
            data_file = os.path.join(data_set_path, "dataset", "test_code.jsonl")
            return [data_file]

    def _generate_examples(self, split_name, file_paths):
        for e in super()._generate_examples(split_name, file_paths, self.LANGUAGE):
            yield e
Example #11
class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild):
    _DESCRIPTION = _DESCRIPTION
    _CITATION = _CITATION

    # For each file, each line in the uncompressed file represents one function.
    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language name
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
    }

    _SUPERVISED_KEYS = ["docstring", "docstring_tokens"]

    def generate_urls(self, split_name, language):
        yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip"
        yield "dataset", "dataset.zip"

    def get_data_files(self, split_name, file_paths, language):
        language_specific_path = file_paths["language"]
        final_path = os.path.join(language_specific_path, language, "final")
        # Make some cleanup to save space
        for path in os.listdir(final_path):
            if path.endswith(".pkl"):
                os.unlink(os.path.join(final_path, path))

        data_files = []
        for root, dirs, files in os.walk(final_path):
            for file in files:
                temp = os.path.join(root, file)
                if ".jsonl" in temp:
                    if split_name in temp:
                        data_files.append(temp)
        return data_files

    def post_process(self, split_name, language, js):
        return js

    def _generate_examples(self, split_name, file_paths, language):
        import gzip

        data_set_path = file_paths["dataset"]

        data_files = self.get_data_files(split_name, file_paths, language)

        urls = {}
        f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"]
        if self.SINGLE_LANGUAGE:
            del f1_path_parts[2]

        f1_path = os.path.join(*f1_path_parts)
        with open(f1_path, encoding="utf-8") as f1:
            for line in f1:
                line = line.strip()
                urls[line] = True

        idx = 0
        for file in data_files:
            if ".gz" in file:
                f = gzip.open(file)
            else:
                f = open(file, encoding="utf-8")

            for line in f:
                line = line.strip()
                js = json.loads(line)
                if js["url"] in urls:
                    js["id"] = idx
                    js = self.post_process(split_name, language, js)
                    if "partition" in js:
                        del js["partition"]
                    yield idx, js
                    idx += 1
            f.close()
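
The post_process override in CodeXGlueTcNLCodeSearchAdvImpl above renames CodeSearchNet's function/function_tokens fields to code/code_tokens and back-fills any declared feature the record lacks. A small hedged sanity check on a toy record follows; post_process only reads the class-level _FEATURES, so the class itself can stand in for an instance here.

raw = {
    "function": "def add(a, b):\n    return a + b",
    "function_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
    "repo": "octocat/hello-world",  # hypothetical repo name, for illustration only
}

processed = CodeXGlueTcNLCodeSearchAdvImpl.post_process(
    CodeXGlueTcNLCodeSearchAdvImpl, "test", "python", dict(raw)
)
assert "code" in processed and "function" not in processed  # keys renamed
assert processed["score"] == -1                             # missing score back-filled with -1
assert processed["docstring"] == ""                         # other missing fields default to ""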
Example #12
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "ID": datasets.Value("string"),
                 "Text": datasets.Value("string"),
                 "Pronoun": datasets.Value("string"),
                 "Pronoun-offset": datasets.Value("int32"),
                 "A": datasets.Value("string"),
                 "A-offset": datasets.Value("int32"),
                 "A-coref": datasets.Value("bool"),
                 "B": datasets.Value("string"),
                 "B-offset": datasets.Value("int32"),
                 "B-coref": datasets.Value("bool"),
                 "URL": datasets.Value("string"),
             }
         ),
         supervised_keys=None,
         homepage="https://github.com/google-research-datasets/gap-coreference",
         citation=_CITATION,
     )
Example #13
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id":
             datasets.Value("string"),
             "source":
             datasets.Value("string"),
             "tokens":
             datasets.Sequence(datasets.Value("string")),
             "ner_tags":
             datasets.Sequence(
                 datasets.features.ClassLabel(names=[
                     "O",
                     "B-LOC",
                     "I-LOC",
                     "B-LOCderiv",
                     "I-LOCderiv",
                     "B-LOCpart",
                     "I-LOCpart",
                     "B-ORG",
                     "I-ORG",
                     "B-ORGderiv",
                     "I-ORGderiv",
                     "B-ORGpart",
                     "I-ORGpart",
                     "B-OTH",
                     "I-OTH",
                     "B-OTHderiv",
                     "I-OTHderiv",
                     "B-OTHpart",
                     "I-OTHpart",
                     "B-PER",
                     "I-PER",
                     "B-PERderiv",
                     "I-PERderiv",
                     "B-PERpart",
                     "I-PERpart",
                 ])),
             "nested_ner_tags":
             datasets.Sequence(
                 datasets.features.ClassLabel(names=[
                     "O",
                     "B-LOC",
                     "I-LOC",
                     "B-LOCderiv",
                     "I-LOCderiv",
                     "B-LOCpart",
                     "I-LOCpart",
                     "B-ORG",
                     "I-ORG",
                     "B-ORGderiv",
                     "I-ORGderiv",
                     "B-ORGpart",
                     "I-ORGpart",
                     "B-OTH",
                     "I-OTH",
                     "B-OTHderiv",
                     "I-OTHderiv",
                     "B-OTHpart",
                     "I-OTHpart",
                     "B-PER",
                     "I-PER",
                     "B-PERderiv",
                     "I-PERderiv",
                     "B-PERpart",
                     "I-PERpart",
                 ])),
         }),
         supervised_keys=None,
         homepage="https://sites.google.com/site/germeval2014ner/",
         citation=_CITATION,
     )
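
Because ner_tags is declared as a Sequence of ClassLabel, loaded examples store integer tag ids. A hedged sketch of mapping them back to tag strings, assuming this builder is the one published on the Hub as germeval_14:

from datasets import load_dataset

ds = load_dataset("germeval_14", split="train")
tag_feature = ds.features["ner_tags"].feature  # the inner ClassLabel of the Sequence

example = ds[0]
tags = [tag_feature.int2str(tag_id) for tag_id in example["ner_tags"]]
print(list(zip(example["tokens"], tags)))  # list of (token, tag) pairs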
Example #14
 def _info(self):
     # TODO(quartz): Specifies the datasets.DatasetInfo object
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # datasets.features.FeatureConnectors
         features=datasets.Features({
             # These are the features of your dataset like images, labels ...
             "id":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "choices":
             datasets.features.Sequence({
                 "text": datasets.Value("string"),
                 "label": datasets.Value("string")
             }),
             "answerKey":
             datasets.Value("string"),
             "para":
             datasets.Value("string"),
             "para_id":
             datasets.Value("string"),
             "para_anno": {
                 "effect_prop": datasets.Value("string"),
                 "cause_dir_str": datasets.Value("string"),
                 "effect_dir_str": datasets.Value("string"),
                 "cause_dir_sign": datasets.Value("string"),
                 "effect_dir_sign": datasets.Value("string"),
                 "cause_prop": datasets.Value("string"),
             },
             "question_anno": {
                 "more_effect_dir": datasets.Value("string"),
                 "less_effect_dir": datasets.Value("string"),
                 "less_cause_prop": datasets.Value("string"),
                 "more_effect_prop": datasets.Value("string"),
                 "less_effect_prop": datasets.Value("string"),
                 "less_cause_dir": datasets.Value("string"),
             },
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://allenai.org/data/quartz",
         citation=_CITATION,
     )
Example #15
    def _info(self):
        """
        Specify the datasets.DatasetInfo object which contains information and typings for the dataset.
        """

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types.
            features=datasets.Features({
                "swda_filename":
                datasets.Value("string"),
                "ptb_basename":
                datasets.Value("string"),
                "conversation_no":
                datasets.Value("int64"),
                "transcript_index":
                datasets.Value("int64"),
                "act_tag":
                datasets.ClassLabel(num_classes=217, names=_ACT_TAGS),
                "damsl_act_tag":
                datasets.ClassLabel(num_classes=43, names=_DAMSL_ACT_TAGS),
                "caller":
                datasets.Value("string"),
                "utterance_index":
                datasets.Value("int64"),
                "subutterance_index":
                datasets.Value("int64"),
                "text":
                datasets.Value("string"),
                "pos":
                datasets.Value("string"),
                "trees":
                datasets.Value("string"),
                "ptb_treenumbers":
                datasets.Value("string"),
                "talk_day":
                datasets.Value("string"),
                "length":
                datasets.Value("int64"),
                "topic_description":
                datasets.Value("string"),
                "prompt":
                datasets.Value("string"),
                "from_caller":
                datasets.Value("int64"),
                "from_caller_sex":
                datasets.Value("string"),
                "from_caller_education":
                datasets.Value("int64"),
                "from_caller_birth_year":
                datasets.Value("int64"),
                "from_caller_dialect_area":
                datasets.Value("string"),
                "to_caller":
                datasets.Value("int64"),
                "to_caller_sex":
                datasets.Value("string"),
                "to_caller_education":
                datasets.Value("int64"),
                "to_caller_birth_year":
                datasets.Value("int64"),
                "to_caller_dialect_area":
                datasets.Value("string"),
            }),
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
Example #16
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "id": datasets.Value("int32"),
                 "dialogId": datasets.Value("int32"),
                 "context": datasets.Value("string"),
                 "users": [{"userType": datasets.Value("string"), "id": datasets.Value("string")}],
                 "evaluation": [
                     {
                         "breadth": datasets.Value("int32"),
                         "userId": datasets.Value("string"),
                         "quality": datasets.Value("int32"),
                         "engagement": datasets.Value("int32"),
                     }
                 ],
                 "thread": [
                     {
                         "evaluation": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "userId": datasets.Value("string"),
                         "time": datasets.Value("int32"),
                     }
                 ],
             }
         ),
         supervised_keys=None,
         homepage="https://github.com/DeepPavlov/convai/tree/master/2017",
     )
Example #17
 def _info(self):
     features = datasets.Features(
         {
             "text": datasets.Value("string"),
             "label": datasets.features.ClassLabel(
                 names=[
                     "activate_my_card",
                     "age_limit",
                     "apple_pay_or_google_pay",
                     "atm_support",
                     "automatic_top_up",
                     "balance_not_updated_after_bank_transfer",
                     "balance_not_updated_after_cheque_or_cash_deposit",
                     "beneficiary_not_allowed",
                     "cancel_transfer",
                     "card_about_to_expire",
                     "card_acceptance",
                     "card_arrival",
                     "card_delivery_estimate",
                     "card_linking",
                     "card_not_working",
                     "card_payment_fee_charged",
                     "card_payment_not_recognised",
                     "card_payment_wrong_exchange_rate",
                     "card_swallowed",
                     "cash_withdrawal_charge",
                     "cash_withdrawal_not_recognised",
                     "change_pin",
                     "compromised_card",
                     "contactless_not_working",
                     "country_support",
                     "declined_card_payment",
                     "declined_cash_withdrawal",
                     "declined_transfer",
                     "direct_debit_payment_not_recognised",
                     "disposable_card_limits",
                     "edit_personal_details",
                     "exchange_charge",
                     "exchange_rate",
                     "exchange_via_app",
                     "extra_charge_on_statement",
                     "failed_transfer",
                     "fiat_currency_support",
                     "get_disposable_virtual_card",
                     "get_physical_card",
                     "getting_spare_card",
                     "getting_virtual_card",
                     "lost_or_stolen_card",
                     "lost_or_stolen_phone",
                     "order_physical_card",
                     "passcode_forgotten",
                     "pending_card_payment",
                     "pending_cash_withdrawal",
                     "pending_top_up",
                     "pending_transfer",
                     "pin_blocked",
                     "receiving_money",
                     "Refund_not_showing_up",
                     "request_refund",
                     "reverted_card_payment?",
                     "supported_cards_and_currencies",
                     "terminate_account",
                     "top_up_by_bank_transfer_charge",
                     "top_up_by_card_charge",
                     "top_up_by_cash_or_cheque",
                     "top_up_failed",
                     "top_up_limits",
                     "top_up_reverted",
                     "topping_up_by_card",
                     "transaction_charged_twice",
                     "transfer_fee_charged",
                     "transfer_into_account",
                     "transfer_not_received_by_recipient",
                     "transfer_timing",
                     "unable_to_verify_identity",
                     "verify_my_identity",
                     "verify_source_of_funds",
                     "verify_top_up",
                     "virtual_card_not_working",
                     "visa_or_mastercard",
                     "why_verify_identity",
                     "wrong_amount_of_cash_received",
                     "wrong_exchange_rate_for_cash_withdrawal",
                 ]
             ),
         }
     )
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
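
For a flat ClassLabel feature like the one above, the names list defines the integer-to-string mapping. A hedged usage sketch, assuming this config is the one published on the Hub as banking77:

from datasets import load_dataset

ds = load_dataset("banking77", split="test")
label_feature = ds.features["label"]

print(label_feature.num_classes)              # 77 intent classes
print(label_feature.int2str(ds[0]["label"]))  # string name of the first example's intent

# Filter to a single intent by converting its name to the stored integer id.
pin_id = label_feature.str2int("change_pin")
pin_examples = ds.filter(lambda ex: ex["label"] == pin_id)
print(len(pin_examples))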
Example #18
    def _info(self):
        features = {
            feature: datasets.Value("string")
            for feature in self.config.features
        }
        if self.config.name.startswith("wsc"):
            features["span1_index"] = datasets.Value("int32")
            features["span2_index"] = datasets.Value("int32")
        if self.config.name == "wic":
            features["start1"] = datasets.Value("int32")
            features["start2"] = datasets.Value("int32")
            features["end1"] = datasets.Value("int32")
            features["end2"] = datasets.Value("int32")
        if self.config.name == "multirc":
            features["idx"] = dict({
                "paragraph": datasets.Value("int32"),
                "question": datasets.Value("int32"),
                "answer": datasets.Value("int32"),
            })
        elif self.config.name == "record":
            features["idx"] = dict({
                "passage": datasets.Value("int32"),
                "query": datasets.Value("int32"),
            })
        else:
            features["idx"] = datasets.Value("int32")

        if self.config.name == "record":
            # Entities are the set of possible choices for the placeholder.
            features["entities"] = datasets.features.Sequence(
                datasets.Value("string"))
            # Answers are the subset of entities that are correct.
            features["answers"] = datasets.features.Sequence(
                datasets.Value("string"))
        else:
            features["label"] = datasets.features.ClassLabel(
                names=self.config.label_classes)

        return datasets.DatasetInfo(
            description=_GLUE_DESCRIPTION + self.config.description,
            features=datasets.Features(features),
            homepage=self.config.url,
            citation=self.config.citation + "\n" + _SUPER_GLUE_CITATION,
        )
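
A hedged sketch of how the config-dependent idx feature above looks once loaded, using the public super_glue dataset (printed values are illustrative):

from datasets import load_dataset

boolq = load_dataset("super_glue", "boolq", split="validation")
print(boolq.features["idx"])    # plain Value("int32") for most configs
print(boolq[0]["idx"])          # a single integer

multirc = load_dataset("super_glue", "multirc", split="validation")
print(multirc.features["idx"])  # nested dict of paragraph / question / answer indices
print(multirc[0]["idx"])        # e.g. {"paragraph": 0, "question": 0, "answer": 0}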
Example #19
    def _info(self):
        if self.config.name == "evaluation_dataset":
            features = datasets.Features({
                "stackoverflow_id":
                datasets.Value("int32"),
                "question":
                datasets.Value("string"),
                "question_url":
                datasets.Value("string"),
                "question_author":
                datasets.Value("string"),
                "question_author_url":
                datasets.Value("string"),
                "answer":
                datasets.Value("string"),
                "answer_url":
                datasets.Value("string"),
                "answer_author":
                datasets.Value("string"),
                "answer_author_url":
                datasets.Value("string"),
                "examples":
                datasets.features.Sequence(datasets.Value("int32")),
                "examples_url":
                datasets.features.Sequence(datasets.Value("string")),
            })
        else:
            features = datasets.Features({
                "id":
                datasets.Value("int32"),
                "filepath":
                datasets.Value("string"),
                "method_name":
                datasets.Value("string"),
                "start_line":
                datasets.Value("int32"),
                "end_line":
                datasets.Value("int32"),
                "url":
                datasets.Value("string"),
            })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Example #20
 def _info(self):
     span_features = {
         "start": datasets.Value("int32"),
         "end": datasets.Value("int32"),
         "string": datasets.Value("string"),
     }
     reference_features = {
         "start": datasets.Value("int32"),
         "end": datasets.Value("int32"),
         "bridge": datasets.Value("bool_"),
         "string": datasets.Value("string"),
     }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "example_id": datasets.Value("int64"),
                 "title_text": datasets.Value("string"),
                 "url": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "paragraph_text": datasets.Value("string"),
                 "sentence_starts": datasets.Sequence(datasets.Value("int32")),
                 "original_nq_answers": [span_features],
                 "annotation": {
                     "referential_equalities": [
                         {
                             "question_reference": span_features,
                             "sentence_reference": reference_features,
                         }
                     ],
                     "answer": [
                         {
                             "sentence_reference": reference_features,
                             "paragraph_reference": span_features,
                         }
                     ],
                     "explanation_type": datasets.Value("string"),
                     "selected_sentence": span_features,
                 },
             }
         ),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )
Example #21
}
"""

_DESCRIPTION = """\
A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.
It is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace("[n]", "\n")`) and then use them for your purposes.
"""

_HOMEPAGE = "https://github.com/hooshvare/pn-summary"
_LICENSE = "MIT License"

_URLs = {
    "1.0.0": {
        "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
        "features": [
            {"name": "id", "type": datasets.Value("string")},
            {"name": "title", "type": datasets.Value("string")},
            {"name": "article", "type": datasets.Value("string")},
            {"name": "summary", "type": datasets.Value("string")},
            {
                "name": "category",
                "type": datasets.ClassLabel(
                    names=[
                        "Economy",
                        "Roads-Urban",
                        "Banking-Insurance",
                        "Agriculture",
                        "International",
                        "Oil-Energy",
                        "Industry",
                        "Transportation",
Example #22
    def _info(self):

        if self.config.name == "dialogue_domain":
            features = datasets.Features({
                "dial_id":
                datasets.Value("string"),
                "doc_id":
                datasets.Value("string"),
                "domain":
                datasets.Value("string"),
                "turns": [{
                    "turn_id":
                    datasets.Value("int32"),
                    "role":
                    datasets.Value("string"),
                    "da":
                    datasets.Value("string"),
                    "references": [{
                        "sp_id": datasets.Value("string"),
                        "label": datasets.Value("string"),
                    }],
                    "utterance":
                    datasets.Value("string"),
                }],
            })
        elif self.config.name == "document_domain":
            features = datasets.Features({
                "domain":
                datasets.Value("string"),
                "doc_id":
                datasets.Value("string"),
                "title":
                datasets.Value("string"),
                "doc_text":
                datasets.Value("string"),
                "spans": [{
                    "id_sp": datasets.Value("string"),
                    "tag": datasets.Value("string"),
                    "start_sp": datasets.Value("int32"),
                    "end_sp": datasets.Value("int32"),
                    "text_sp": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "parent_titles": datasets.Value("string"),
                    "id_sec": datasets.Value("string"),
                    "start_sec": datasets.Value("int32"),
                    "text_sec": datasets.Value("string"),
                    "end_sec": datasets.Value("int32"),
                }],
                "doc_html_ts":
                datasets.Value("string"),
                "doc_html_raw":
                datasets.Value("string"),
            })
        elif self.config.name == "doc2dial_rc":
            features = datasets.Features({
                "id":
                datasets.Value("string"),
                "title":
                datasets.Value("string"),
                "context":
                datasets.Value("string"),
                "question":
                datasets.Value("string"),
                "answers":
                datasets.features.Sequence({
                    "text":
                    datasets.Value("string"),
                    "answer_start":
                    datasets.Value("int32"),
                }),
                "domain":
                datasets.Value("string"),
            })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )
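
Because the features differ per config, the config name must be passed explicitly when loading. A hedged sketch, assuming this builder is the doc2dial dataset on the Hub:

from datasets import load_dataset

# The doc2dial_rc config exposes a SQuAD-style reading-comprehension view, while
# dialogue_domain / document_domain expose the raw dialogue and document records.
rc = load_dataset("doc2dial", "doc2dial_rc", split="validation")
print(rc.features["answers"])  # Sequence of {"text", "answer_start"}
print(rc[0]["question"])

dialogues = load_dataset("doc2dial", "dialogue_domain", split="train")
print(dialogues[0]["turns"][0]["utterance"])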
Example #23
 def _info(self):
     if self.config.name == "trex":
         features = datasets.Features({
             "uuid":
             datasets.Value("string"),
             "obj_uri":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "sub_uri":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "predicate_id":
             datasets.Value("string"),
             "sub_surface":
             datasets.Value("string"),
             "obj_surface":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "template":
             datasets.Value("string"),
             "template_negated":
             datasets.Value("string"),
             "label":
             datasets.Value("string"),
             "description":
             datasets.Value("string"),
             "type":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "conceptnet":
         features = datasets.Features({
             "uuid":
             datasets.Value("string"),
             "sub":
             datasets.Value("string"),
             "obj":
             datasets.Value("string"),
             "pred":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "negated":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "squad":
         features = datasets.Features({
             "id":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "negated":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "google_re":
         features = datasets.Features({
             "pred":
             datasets.Value("string"),
             "sub":
             datasets.Value("string"),
             "obj":
             datasets.Value("string"),
             "evidences":
             datasets.Value("string"),
             "judgments":
             datasets.Value("string"),
             "sub_w":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "sub_aliases":
             datasets.Value("string"),
             "obj_w":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "obj_aliases":
             datasets.Value("string"),
             "uuid":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "template":
             datasets.Value("string"),
             "template_negated":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
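The branches on self.config.name above assume the builder declares one BuilderConfig per subset. A minimal sketch of such a declaration follows; the version string is an assumption, not taken from the original loader.

# Hedged sketch: one BuilderConfig per subset name used in the branches above.
import datasets

LAMA_CONFIGS = [
    datasets.BuilderConfig(name=name, version=datasets.Version("1.1.0"))
    for name in ("trex", "conceptnet", "squad", "google_re")
]
print([config.name for config in LAMA_CONFIGS])  # ['trex', 'conceptnet', 'squad', 'google_re']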
Example #24
            f"https://the-eye.eu/public/AI/pile/train/{i:0>2}.jsonl.zst"
            for i in range(30)
        ],
        "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
        "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
    },
    "free_law":
    "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    "pubmed_central":
    "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz",
}

_FEATURES = {
    "all":
    datasets.Features({
        "text": datasets.Value("string"),
        "meta": {
            "pile_set_name": datasets.Value("string")
        },
    }),
    "free_law":
    datasets.Features({
        "text": datasets.Value("string"),
        "meta": {
            "case_ID": datasets.Value("string"),
            "case_jurisdiction": datasets.Value("string"),
            "date_created": datasets.Value("string"),
        },
    }),
    "pubmed_central":
    datasets.Features({
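A mapping from config name to datasets.Features, like the _FEATURES dict shown above, is typically consumed inside _info() by indexing it with self.config.name. Below is a reduced, self-contained sketch of that pattern; the names and values are illustrative only, not the Pile loader's actual code.

# Illustrative sketch of the config-name -> Features lookup pattern.
import datasets

_EXAMPLE_FEATURES = {
    "free_law": datasets.Features({
        "text": datasets.Value("string"),
        "meta": {"case_ID": datasets.Value("string")},
    }),
}

config_name = "free_law"  # inside the builder this would be self.config.name
info = datasets.DatasetInfo(
    description="sketch only",
    features=_EXAMPLE_FEATURES[config_name],
)
print(info.features)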
Example #25
 def features(self):
     if self.name == "simplified":
         return {
             "text":
             datasets.Value("string"),
             "labels":
             datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)),
             "id":
             datasets.Value("string"),
         }
     elif self.name == "raw":
         d = {
             "text": datasets.Value("string"),
             "id": datasets.Value("string"),
             "author": datasets.Value("string"),
             "subreddit": datasets.Value("string"),
             "link_id": datasets.Value("string"),
             "parent_id": datasets.Value("string"),
             "created_utc": datasets.Value("float"),
             "rater_id": datasets.Value("int32"),
             "example_very_unclear": datasets.Value("bool"),
         }
         d.update(
             {label: datasets.Value("int32")
              for label in _CLASS_NAMES})
         return d
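This features property returns a plain dict, so the builder's _info() would normally wrap it, e.g. datasets.Features(self.config.features). The sketch below, with a deliberately shortened _CLASS_NAMES list (an assumption for brevity), shows how the "simplified" schema stores labels as ClassLabel indices.

# Hedged sketch: the "simplified" schema with a reduced label list.
import datasets

_CLASS_NAMES = ["admiration", "amusement", "neutral"]  # reduced for illustration

features = datasets.Features({
    "text": datasets.Value("string"),
    "labels": datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)),
    "id": datasets.Value("string"),
})

ds = datasets.Dataset.from_dict(
    {"text": ["that was great!"], "labels": [[1]], "id": ["eg1"]},
    features=features,
)
# ClassLabel stores integer indices; int2str maps them back to names.
print(ds.features["labels"].feature.int2str(ds[0]["labels"]))  # ['amusement']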
Example #26
 def _info(self):
     # TODO: This method specifies the datasets.DatasetInfo object, which contains the information and typings for the dataset
     features = datasets.Features(
         {
             "id": datasets.Value("int32"),
             "category": datasets.Value("string"),
             "text": datasets.Value("string"),
             "ner": datasets.features.Sequence(
                 {
                     "source": {
                         "from": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "to": datasets.Value("int32"),
                         "type": datasets.features.ClassLabel(
                             names=[
                                 "PRODUCT_NAME",
                                 "PRODUCT_NAME_IMP",
                                 "PRODUCT_NO_BRAND",
                                 "BRAND_NAME",
                                 "BRAND_NAME_IMP",
                                 "VERSION",
                                 "PRODUCT_ADJ",
                                 "BRAND_ADJ",
                                 "LOCATION",
                                 "LOCATION_IMP",
                             ]
                         ),
                     },
                     "target": {
                         "from": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "to": datasets.Value("int32"),
                         "type": datasets.features.ClassLabel(
                             names=[
                                 "PRODUCT_NAME",
                                 "PRODUCT_NAME_IMP",
                                 "PRODUCT_NO_BRAND",
                                 "BRAND_NAME",
                                 "BRAND_NAME_IMP",
                                 "VERSION",
                                 "PRODUCT_ADJ",
                                 "BRAND_ADJ",
                                 "LOCATION",
                                 "LOCATION_IMP",
                             ]
                         ),
                     },
                 }
             ),
         }
     )
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # This defines the different columns of the dataset and their types
         features=features,  # Here we define them above because they are different between the two configurations
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_HOMEPAGE,
         # License for the dataset if available
         license=_LICENSE,
         # Citation for the dataset
         citation=_CITATION,
     )
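The ClassLabel name list above is written out twice, once for "source" and once for "target". A hedged refactor sketch that hoists the list into one constant so the two cannot drift apart; the constant and helper names are invented for illustration, not the original loader's.

# Hedged refactor sketch: share one ClassLabel name list between source and target.
import datasets

_NER_TYPES = [
    "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
    "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
    "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
]

def _mention():
    # One mention span: character offsets, surface text, and its entity type.
    return {
        "from": datasets.Value("int32"),
        "text": datasets.Value("string"),
        "to": datasets.Value("int32"),
        "type": datasets.features.ClassLabel(names=_NER_TYPES),
    }

features = datasets.Features({
    "id": datasets.Value("int32"),
    "category": datasets.Value("string"),
    "text": datasets.Value("string"),
    "ner": datasets.features.Sequence({"source": _mention(), "target": _mention()}),
})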
Example #27
    def _info(self):

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "document": {
                    "id": datasets.Value("string"),
                    "kind": datasets.Value("string"),
                    "url": datasets.Value("string"),
                    "file_size": datasets.Value("int32"),
                    "word_count": datasets.Value("int32"),
                    "start": datasets.Value("string"),
                    "end": datasets.Value("string"),
                    "summary": {
                        "text":
                        datasets.Value("string"),
                        "tokens":
                        datasets.features.Sequence(datasets.Value("string")),
                        "url":
                        datasets.Value("string"),
                        "title":
                        datasets.Value("string"),
                    },
                    "text": datasets.Value("string"),
                },
                "question": {
                    "text": datasets.Value("string"),
                    "tokens":
                    datasets.features.Sequence(datasets.Value("string")),
                },
                "answers": [{
                    "text":
                    datasets.Value("string"),
                    "tokens":
                    datasets.features.Sequence(datasets.Value("string")),
                }],
            }),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
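Note the two nesting styles above: "answers" uses the list-of-dict form [{...}], which keeps a genuine list of dicts per example, while Sequence({...}) flattens into a dict of lists. A standalone sketch of the list-of-dict behaviour with toy data (not the loader above):

# Hedged sketch: a feature written as [{...}] yields a list of dicts per example.
import datasets

features = datasets.Features({
    "answers": [{
        "text": datasets.Value("string"),
        "tokens": datasets.features.Sequence(datasets.Value("string")),
    }],
})

ds = datasets.Dataset.from_dict(
    {"answers": [[{"text": "a ghost", "tokens": ["a", "ghost"]}]]},
    features=features,
)
print(ds[0]["answers"])  # [{'text': 'a ghost', 'tokens': ['a', 'ghost']}]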
Example #28
 def _info(self):
     # TODO: This method specifies the datasets.DatasetInfo object, which contains the information and typings for the dataset
     features = datasets.Features({
         "full_text":
         datasets.Value("string"),
         "text_translation":
         datasets.Value("string"),
         "screen_name":
         datasets.Value("string"),
         "description":
         datasets.Value("string"),
         "desc_translation":
         datasets.Value("string"),
         "location":
         datasets.Value("string"),
         "weekofyear":
         datasets.Value("int64"),
         "weekday":
         datasets.Value("int64"),
         "month":
         datasets.Value("int64"),
         "year":
         datasets.Value("int64"),
         "day":
         datasets.Value("int64"),
         "point_info":
         datasets.Value("string"),
         "point":
         datasets.Value("string"),
         "latitude":
         datasets.Value("float64"),
         "longitude":
         datasets.Value("float64"),
         "altitude":
         datasets.Value("float64"),
         "province":
         datasets.Value("string"),
         "hisco_standard":
         datasets.Value("string"),
         "hisco_code":
         datasets.Value("string"),
         "industry":
         datasets.Value("bool_"),
         "sentiment_pattern":
         datasets.Value("float64"),
         "subjective_pattern":
         datasets.Value("float64"),
         "label":
         datasets.ClassLabel(num_classes=3,
                             names=["neg", "neu", "pos"],
                             names_file=None,
                             id=None),
     })
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # This defines the different columns of the dataset and their types
         features=features,  # Here we define them above because they are different between the two configurations
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_HOMEPAGE,
         # License for the dataset if available
         license=_LICENSE,
         # Citation for the dataset
         citation=_CITATION,
     )
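A small side note on the label feature above: ClassLabel accepts both num_classes and names as long as the two agree, and it provides str2int/int2str for converting between them.

# Hedged sketch of the ClassLabel defined above, outside the builder.
import datasets

label = datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
print(label.str2int("pos"))  # 2
print(label.int2str(0))      # 'neg'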
Example #29
 def _info(self):
     if self.config.name == "schema":
         features = datasets.Features({
             "service_name":
             datasets.Value("string"),
             "description":
             datasets.Value("string"),
             "slots":
             datasets.Sequence({
                 "name":
                 datasets.Value("string"),
                 "description":
                 datasets.Value("string"),
                 "is_categorical":
                 datasets.Value("bool"),
                 "possible_values":
                 datasets.Sequence(datasets.Value("string")),
             }),
             "intents":
             datasets.Sequence(
                 {
                     "name":
                     datasets.Value("string"),
                     "description":
                     datasets.Value("string"),
                     "is_transactional":
                     datasets.Value("bool"),
                     "required_slots":
                     datasets.Sequence(datasets.Value("string")),
                     # optional_slots was originally a dictionary
                     "optional_slots":
                     datasets.Sequence(
                         {
                             "slot_name": datasets.Value("string"),
                             "slot_value": datasets.Value("string"),
                         }),
                     "result_slots":
                     datasets.Sequence(datasets.Value("string")),
                 }, ),
         })
     else:
         features = datasets.Features({
             "dialogue_id":
             datasets.Value("string"),
             "services":
             datasets.Sequence(datasets.Value("string")),
             "turns":
             datasets.Sequence({
                 "speaker":
                 datasets.ClassLabel(names=["USER", "SYSTEM"]),
                 "utterance":
                 datasets.Value("string"),
                 "frames":
                 datasets.Sequence({
                     "service":
                     datasets.Value("string"),
                     "slots":
                     datasets.Sequence({
                         "slot":
                         datasets.Value("string"),
                         "start":
                         datasets.Value("int32"),
                         "exclusive_end":
                         datasets.Value("int32"),
                     }),
                     # optional
                     "state": {
                         "active_intent":
                         datasets.Value("string"),
                         "requested_slots":
                         datasets.Sequence(datasets.Value("string")),
                         # slot_values was originally a dictionary
                         "slot_values":
                         datasets.Sequence({
                             "slot_name":
                             datasets.Value("string"),
                             "slot_value_list":
                             datasets.Sequence(datasets.Value("string")),
                         }),
                     },
                     "actions":
                     datasets.Sequence({
                         "act":
                         datasets.ClassLabel(names=_ALL_ACTS),
                         # optional
                         "slot":
                         datasets.Value("string"),
                         # optional
                         "canonical_values":
                         datasets.Sequence(datasets.Value("string")),
                         # optional
                         "values":
                         datasets.Sequence(datasets.Value("string")),
                     }),
                     # optional
                     "service_results":
                     datasets.Sequence(
                         # Arrow doesn't like Sequences of Sequences for default values so we need a Sequence of Features of Sequences
                         {
                             "service_results_list":
                             datasets.Sequence(
                                 # originally each list item was a dictionary (optional)
                                 {
                                     "service_slot_name":
                                     datasets.Value("string"),
                                     "service_canonical_value":
                                     datasets.Value("string"),
                                 })
                         }),
                     # optional
                     "service_call": {
                         "method":
                         datasets.Value("string"),
                         # parameters was originally a dictionary
                         "parameters":
                         datasets.Sequence({
                             "parameter_slot_name":
                             datasets.Value("string"),
                             "parameter_canonical_value":
                             datasets.Value("string"),
                         }),
                     },
                 }),
             }),
         })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # Here we define them above because they are different between the two configurations
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
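The recurring comments above ("optional_slots was originally a dictionary", "slot_values was originally a dictionary") point at a common workaround: arbitrary-keyed dicts do not fit a fixed Arrow schema, so they are reshaped into parallel name/value lists. A hedged sketch of that conversion; the helper name is invented, not the original script's.

# Hedged sketch of reshaping a dict into parallel name/value lists for a Sequence feature.
def dict_to_slot_values(slot_values):
    # {"restaurant_name": ["Sala Thong"]} -> parallel "slot_name"/"slot_value_list" lists
    return {
        "slot_name": list(slot_values.keys()),
        "slot_value_list": [list(values) for values in slot_values.values()],
    }

print(dict_to_slot_values({"restaurant_name": ["Sala Thong", "Sala"]}))
# {'slot_name': ['restaurant_name'], 'slot_value_list': [['Sala Thong', 'Sala']]}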
Example #30
 def _info(self):
     features = datasets.Features({
         "dialogue_id":
         datasets.Value("string"),
         "services":
         datasets.Sequence(datasets.Value("string")),
         "turns":
         datasets.Sequence({
             "turn_id":
             datasets.Value("string"),
             "speaker":
             datasets.ClassLabel(names=["USER", "SYSTEM"]),
             "utterance":
             datasets.Value("string"),
             "frames":
             datasets.Sequence({
                 "service":
                 datasets.Value("string"),
                 "state": {
                     "active_intent":
                     datasets.Value("string"),
                     "requested_slots":
                     datasets.Sequence(datasets.Value("string")),
                     "slots_values":
                     datasets.Sequence({
                         "slots_values_name":
                         datasets.Value("string"),
                         "slots_values_list":
                         datasets.Sequence(datasets.Value("string")),
                     }),
                 },
                 "slots":
                 datasets.Sequence({
                     "slot":
                     datasets.Value("string"),
                     "value":
                     datasets.Value("string"),
                     "start":
                     datasets.Value("int32"),
                     "exclusive_end":
                     datasets.Value("int32"),
                     "copy_from":
                     datasets.Value("string"),
                     "copy_from_value":
                     datasets.Sequence(datasets.Value("string")),
                 }),
             }),
             "dialogue_acts":
             datasets.Features({
                 "dialog_act":
                 datasets.Sequence({
                     "act_type":
                     datasets.Value("string"),
                     "act_slots":
                     datasets.Sequence(
                         datasets.Features({
                             "slot_name":
                             datasets.Value("string"),
                             "slot_value":
                             datasets.Value("string"),
                         }), ),
                 }),
                 "span_info":
                 datasets.Sequence({
                     "act_type":
                     datasets.Value("string"),
                     "act_slot_name":
                     datasets.Value("string"),
                     "act_slot_value":
                     datasets.Value("string"),
                     "span_start":
                     datasets.Value("int32"),
                     "span_end":
                     datasets.Value("int32"),
                 }),
             }),
         }),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # Here we define them above because they are different between the two configurations
         supervised_keys=None,
         homepage="https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2",
         license=_LICENSE,
         citation=_CITATION,
     )
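One last detail from the schema above: "dialogue_acts" nests a datasets.Features object inside the feature dict. Since Features is itself a dict subclass, a plain dict in the same position describes the same schema, as this small standalone check suggests (the reduced inner schema is an assumption for brevity).

# Hedged sketch: nesting Features vs. a plain dict produces the same Arrow schema.
import datasets

inner = {"dialog_act": datasets.Sequence({"act_type": datasets.Value("string")})}

with_features = datasets.Features({"dialogue_acts": datasets.Features(inner)})
with_plain_dict = datasets.Features({"dialogue_acts": dict(inner)})
print(with_features.arrow_schema == with_plain_dict.arrow_schema)  # expected: True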