Example #1
    def _info(self):
        features = datasets.Features({
            "text":
            datasets.Value("string"),
            "sentence_offsets":
            datasets.features.Sequence({
                "begin_char_offset":
                datasets.Value("int64"),
                "end_char_offset":
                datasets.Value("int64")
            }),
            "sentences":
            datasets.features.Sequence(datasets.Value("string")),
            "sentence_labels":
            datasets.features.Sequence(datasets.Value("int64")),
            "token_offsets":
            datasets.features.Sequence({
                "offsets":
                datasets.features.Sequence({
                    "begin_char_offset":
                    datasets.Value("int64"),
                    "end_char_offset":
                    datasets.Value("int64")
                })
            }),
            "tokens":
            datasets.features.Sequence(
                datasets.features.Sequence(datasets.Value("string"))),
            "entity_labels":
            datasets.features.Sequence(
                datasets.features.Sequence(
                    datasets.features.ClassLabel(names=[
                        "B-DEVICE",
                        "B-EXPERIMENT",
                        "B-MATERIAL",
                        "B-VALUE",
                        "I-DEVICE",
                        "I-EXPERIMENT",
                        "I-MATERIAL",
                        "I-VALUE",
                        "O",
                    ]))),
            "slot_labels":
            datasets.features.Sequence(
                datasets.features.Sequence(
                    datasets.features.ClassLabel(names=[
                        "B-anode_material",
                        "B-cathode_material",
                        "B-conductivity",
                        "B-current_density",
                        "B-degradation_rate",
                        "B-device",
                        "B-electrolyte_material",
                        "B-experiment_evoking_word",
                        "B-fuel_used",
                        "B-interlayer_material",
                        "B-interconnect_material",
                        "B-open_circuit_voltage",
                        "B-power_density",
                        "B-resistance",
                        "B-support_material",
                        "B-thickness",
                        "B-time_of_operation",
                        "B-voltage",
                        "B-working_temperature",
                        "I-anode_material",
                        "I-cathode_material",
                        "I-conductivity",
                        "I-current_density",
                        "I-degradation_rate",
                        "I-device",
                        "I-electrolyte_material",
                        "I-experiment_evoking_word",
                        "I-fuel_used",
                        "I-interlayer_material",
                        "I-interconnect_material",
                        "I-open_circuit_voltage",
                        "I-power_density",
                        "I-resistance",
                        "I-support_material",
                        "I-thickness",
                        "I-time_of_operation",
                        "I-voltage",
                        "I-working_temperature",
                        "O",
                    ]))),
            "links":
            datasets.Sequence({
                "relation_label":
                datasets.features.ClassLabel(names=[
                    "coreference", "experiment_variation", "same_experiment",
                    "thickness"
                ]),
                "start_span_id":
                datasets.Value("int64"),
                "end_span_id":
                datasets.Value("int64"),
            }),
            "slots":
            datasets.features.Sequence({
                "frame_participant_label":
                datasets.features.ClassLabel(names=[
                    "anode_material",
                    "cathode_material",
                    "current_density",
                    "degradation_rate",
                    "device",
                    "electrolyte_material",
                    "fuel_used",
                    "interlayer_material",
                    "open_circuit_voltage",
                    "power_density",
                    "resistance",
                    "support_material",
                    "time_of_operation",
                    "voltage",
                    "working_temperature",
                ]),
                "slot_id":
                datasets.Value("int64"),
            }),
            "spans":
            datasets.features.Sequence({
                "span_id":
                datasets.Value("int64"),
                "entity_label":
                datasets.features.ClassLabel(
                    names=["", "DEVICE", "MATERIAL", "VALUE"]),
                "sentence_id":
                datasets.Value("int64"),
                "experiment_mention_type":
                datasets.features.ClassLabel(names=[
                    "", "current_exp", "future_work", "general_info",
                    "previous_work"
                ]),
                "begin_char_offset":
                datasets.Value("int64"),
                "end_char_offset":
                datasets.Value("int64"),
            }),
            "experiments":
            datasets.features.Sequence({
                "experiment_id":
                datasets.Value("int64"),
                "span_id":
                datasets.Value("int64"),
                "slots":
                datasets.features.Sequence({
                    "frame_participant_label":
                    datasets.features.ClassLabel(names=[
                        "anode_material",
                        "cathode_material",
                        "current_density",
                        "degradation_rate",
                        "conductivity",
                        "device",
                        "electrolyte_material",
                        "fuel_used",
                        "interlayer_material",
                        "open_circuit_voltage",
                        "power_density",
                        "resistance",
                        "support_material",
                        "time_of_operation",
                        "voltage",
                        "working_temperature",
                    ]),
                    "slot_id":
                    datasets.Value("int64"),
                }),
            }),
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
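
The _info() above only declares the schema. In a GeneratorBasedBuilder it is paired with _split_generators() and _generate_examples(); the sketch below shows that minimal skeleton, where the URL, file name, and field handling are placeholders rather than the actual loader for this dataset.

import json
import os

import datasets


class MyDatasetBuilder(datasets.GeneratorBasedBuilder):
    """Minimal builder sketch; _info() would return the Features defined above."""

    def _split_generators(self, dl_manager):
        # Download and extract the data, then hand one file per split to _generate_examples.
        data_dir = dl_manager.download_and_extract("https://example.com/data.zip")  # placeholder URL
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(data_dir, "train.jsonl")},  # placeholder file name
            ),
        ]

    def _generate_examples(self, filepath):
        # Yield (key, example) pairs whose fields match the declared Features.
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, json.loads(line)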
Example #2
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "constituent_dataset":
             datasets.Value("string"),
             "id":
             datasets.Value("string"),
             "context":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "reference":
             datasets.Value("string"),
             "candidate":
             datasets.Value("string"),
             "score":
             datasets.Value("float"),
             "metadata": {
                 "scores":
                 datasets.features.Sequence(datasets.Value("int32")),
                 "source": datasets.Value("string"),
             },
             # features for minimal pairs
             "candidate2":
             datasets.Value("string"),
             "score2":
             datasets.Value("float"),
         }),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #3
def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "pandas",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "torch",
            "length": SMALL_TEST
        }),
        (read_formatted, {
            "type": "tensorflow",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]

    functions_shuffled = [
        (read, {
            "length": SMALL_TEST
        }),
        (read, {
            "length": SPEED_TEST_N_EXAMPLES
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 10
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 100
        }),
        (read_batch, {
            "length": SPEED_TEST_N_EXAMPLES,
            "batch_size": 1_000
        }),
        (read_formatted, {
            "type": "numpy",
            "length": SMALL_TEST
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 10
        }),
        (read_formatted_batch, {
            "type": "numpy",
            "length": SMALL_TEST,
            "batch_size": 1_000
        }),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({
            "list":
            datasets.Sequence(datasets.Value("float32")),
            "numbers":
            datasets.Value("float32")
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100, )},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
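
The helpers read, read_batch, read_formatted and read_formatted_batch are not shown in this excerpt. A plausible sketch follows, assuming each one iterates over the dataset and returns its wall-clock duration via a timing decorator (the get_duration name is an assumption); read_formatted_batch would follow the same pattern with sliced access.

import time

import datasets


def get_duration(func):
    # Assumed timing decorator: runs the function and returns its wall-clock duration.
    def wrapper(*args, **kwargs):
        start = time.time()
        func(*args, **kwargs)
        return time.time() - start

    wrapper.__name__ = func.__name__
    return wrapper


@get_duration
def read(dataset: datasets.Dataset, length):
    # Row-by-row access.
    for i in range(length):
        _ = dataset[i]


@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
    # Sliced access, batch_size rows at a time.
    for i in range(0, length, batch_size):
        _ = dataset[i : i + batch_size]


@get_duration
def read_formatted(dataset: datasets.Dataset, length, type):
    # Same row-by-row access, but with an output format (numpy, pandas, torch, ...) applied.
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]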
Example #4
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id":
             datasets.Value("string"),
             "context_id":
             datasets.Value("string"),
             "question_id":
             datasets.Value("string"),
             "domain":
             datasets.Value("string"),
             "metadata": {
                 "author": datasets.Value("string"),
                 "title": datasets.Value("string"),
                 "url": datasets.Value("string"),
             },
             "context":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "question_type":
             datasets.Value("string"),
             "answers":
             datasets.features.Sequence(datasets.Value("string"), ),
             "correct_answer_id":
             datasets.Value("int32"),
         }),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage="https://text-machine-lab.github.io/blog/2020/quail/",
         citation=_CITATION,
     )
Example #5
    def _info(self):
        features = datasets.Features({
            "id":
            datasets.Value("string"),
            "text":
            datasets.Value("string"),
            "tokens":
            datasets.Sequence(datasets.Value("string")),
            "nps": [{
                "text": datasets.Value("string"),
                "first_char": datasets.Value("int32"),
                "last_char": datasets.Value("int32"),
                "first_token": datasets.Value("int32"),
                "last_token": datasets.Value("int32"),
                "id": datasets.Value("string"),
            }],
            "np_relations": [{
                "anchor":
                datasets.Value("string"),
                "complement":
                datasets.Value("string"),
                "preposition":
                datasets.features.ClassLabel(names=[
                    "about",
                    "for",
                    "with",
                    "from",
                    "among",
                    "by",
                    "on",
                    "at",
                    "during",
                    "of",
                    "member(s) of",
                    "in",
                    "after",
                    "under",
                    "to",
                    "into",
                    "before",
                    "near",
                    "outside",
                    "around",
                    "between",
                    "against",
                    "over",
                    "inside",
                ]),
                "complement_coref_cluster_id":
                datasets.Value("string"),
            }],
            "coref": [{
                "id":
                datasets.Value("string"),
                "members":
                datasets.Sequence(datasets.Value("string")),
                "np_type":
                datasets.features.ClassLabel(names=[
                    "standard",
                    "time/date/measurement",
                    "idiomatic",
                ]),
            }],
            "metadata": {
                "annotators": {
                    "coref_worker":
                    datasets.Value("int32"),
                    "consolidator_worker":
                    datasets.Value("int32"),
                    "np-relations_worker":
                    datasets.Sequence(datasets.Value("int32")),
                },
                "url": datasets.Value("string"),
                "source": datasets.Value("string"),
            },
        })

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=features,  # defined above because they are different between the two configurations
            # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and
            # specify them. They'll be used if as_supervised=True in builder.as_dataset.
            # supervised_keys=("sentence", "label"),
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
Example #6
 def _get_feature_types(self):
     if self.config_name == "record":
         return {
             "predictions": {
                 "idx": {
                     "passage": datasets.Value("int64"),
                     "query": datasets.Value("int64"),
                 },
                 "prediction_text": datasets.Value("string"),
             },
             "references": {
                 "idx": {
                     "passage": datasets.Value("int64"),
                     "query": datasets.Value("int64"),
                 },
                 "answers": datasets.Sequence(datasets.Value("string")),
             },
         }
     elif self.config_name == "multirc":
         return {
             "predictions": {
                 "idx": {
                     "answer": datasets.Value("int64"),
                     "paragraph": datasets.Value("int64"),
                     "question": datasets.Value("int64"),
                 },
                 "prediction": datasets.Value("int64"),
             },
             "references": datasets.Value("int64"),
         }
     else:
         return {
             "predictions": datasets.Value("int64"),
             "references": datasets.Value("int64"),
         }
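
These nested dicts describe the prediction/reference format the metric expects for each config. Below is a hedged usage sketch for the record config; it relies on the legacy datasets.load_metric API (newer versions moved metrics to the evaluate library), and the exact keys returned depend on the metric implementation.

import datasets

# ReCoRD predictions/references must follow the nested types this method returns for "record".
metric = datasets.load_metric("super_glue", "record")

predictions = [{"idx": {"passage": 0, "query": 0}, "prediction_text": "Paris"}]
references = [{"idx": {"passage": 0, "query": 0}, "answers": ["Paris", "paris"]}]

print(metric.compute(predictions=predictions, references=references))  # e.g. exact match and F1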
Example #7
    def _info(self):
        # TODO(xtreme): Specifies the datasets.DatasetInfo object
        features = {text_feature: datasets.Value("string") for text_feature in six.iterkeys(self.config.text_features)}
        if "answers" in features.keys():
            features["answers"] = datasets.features.Sequence(
                {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")}
            )
        if self.config.name.startswith("PAWS-X"):
            features["label"] = datasets.Value("string")
        if self.config.name == "XNLI":
            features["gold_label"] = datasets.Value("string")

        if self.config.name.startswith("udpos"):
            features = datasets.Features(
                {
                    "token": datasets.Value("string"),
                    "pos_tag": datasets.features.ClassLabel(
                        names=[
                            "ADJ",
                            "ADP",
                            "ADV",
                            "AUX",
                            "CCONJ",
                            "DET",
                            "INTJ",
                            "NOUN",
                            "NUM",
                            "PART",
                            "PRON",
                            "PROPN",
                            "PUNCT",
                            "SCONJ",
                            "SYM",
                            "VERB",
                            "X",
                        ]
                    ),
                }
            )

        if self.config.name.startswith("PAN-X"):
            features = datasets.Features(
                {
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=[
                                "O",
                                "B-PER",
                                "I-PER",
                                "B-ORG",
                                "I-ORG",
                                "B-LOC",
                                "I-LOC",
                            ]
                        )
                    ),
                    "langs": datasets.Sequence(datasets.Value("string")),
                }
            )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=self.config.description + "\n" + _DESCRIPTION,
            # datasets.features.FeatureConnectors
            features=datasets.Features(
                features
                # These are the features of your dataset like images, labels ...
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
            citation=self.config.citation + "\n" + _CITATION,
        )
Example #8
 def _info(self):
     features = datasets.Features({
         "event":
         datasets.Value("string"),
         "oEffect":
         datasets.Sequence(datasets.Value("string")),
         "oReact":
         datasets.Sequence(datasets.Value("string")),
         "oWant":
         datasets.Sequence(datasets.Value("string")),
         "xAttr":
         datasets.Sequence(datasets.Value("string")),
         "xEffect":
         datasets.Sequence(datasets.Value("string")),
         "xIntent":
         datasets.Sequence(datasets.Value("string")),
         "xNeed":
         datasets.Sequence(datasets.Value("string")),
         "xReact":
         datasets.Sequence(datasets.Value("string")),
         "xWant":
         datasets.Sequence(datasets.Value("string")),
         "prefix":
         datasets.Sequence(datasets.Value("string")),
         "split":
         datasets.Value("string"),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #9
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "qid": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "answer": datasets.features.Sequence(
                     {
                         "answer_type": datasets.Value("string"),
                         "answer_argument": datasets.Value("string"),
                         "entity_name": datasets.Value("string"),
                     }
                 ),
                 "function": datasets.Value("string"),
                 "num_node": datasets.Value("int32"),
                 "num_edge": datasets.Value("int32"),
                 "graph_query": {
                     "nodes": datasets.features.Sequence(
                         {
                             "nid": datasets.Value("int32"),
                             "node_type": datasets.Value("string"),
                             "id": datasets.Value("string"),
                             "class": datasets.Value("string"),
                             "friendly_name": datasets.Value("string"),
                             "question_node": datasets.Value("int32"),
                             "function": datasets.Value("string"),
                         }
                     ),
                     "edges": datasets.features.Sequence(
                         {
                             "start": datasets.Value("int32"),
                             "end": datasets.Value("int32"),
                             "relation": datasets.Value("string"),
                             "friendly_name": datasets.Value("string"),
                         }
                     ),
                 },
                 "sparql_query": datasets.Value("string"),
                 "domains": datasets.features.Sequence(datasets.Value("string")),
                 "level": datasets.Value("string"),
                 "s_expression": datasets.Value("string"),
             }
         ),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
         supervised_keys=None,
         homepage="https://dki-lab.github.io/GrailQA/",
         citation=_CITATION,
     )
Example #10
class CodeXGlueTcNLCodeSearchAdvImpl(CodeXGlueCtCodeToTextBaseImpl):
    LANGUAGE = "python"
    SINGLE_LANGUAGE = True

    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
        "docstring_summary": datasets.Value("string"),  # Summary of the docstring
        "parameters": datasets.Value("string"),  # parameters of the function
        "return_statement": datasets.Value("string"),  # return statement
        "argument_list": datasets.Value("string"),  # list of arguments of the function
        "identifier": datasets.Value("string"),  # identifier
        "nwo": datasets.Value("string"),  # nwo
        "score": datasets.Value("float"),  # score for this search
    }

    def post_process(self, split_name, language, js):
        for suffix in "_tokens", "":
            key = "function" + suffix
            if key in js:
                js["code" + suffix] = js[key]
                del js[key]

        for key in self._FEATURES:
            if key not in js:
                if key == "score":
                    js[key] = -1
                else:
                    js[key] = ""

        return js

    def generate_urls(self, split_name):
        for e in super().generate_urls(split_name, self.LANGUAGE):
            yield e

    def get_data_files(self, split_name, file_paths, language):
        if split_name == "train":
            return super().get_data_files(split_name, file_paths, language)
        else:
            data_set_path = file_paths["dataset"]
            data_file = os.path.join(data_set_path, "dataset", "test_code.jsonl")
            return [data_file]

    def _generate_examples(self, split_name, file_paths):
        for e in super()._generate_examples(split_name, file_paths, self.LANGUAGE):
            yield e
Example #11
class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild):
    _DESCRIPTION = _DESCRIPTION
    _CITATION = _CITATION

    # For each file, each line in the uncompressed file represents one function.
    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language name
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
    }

    _SUPERVISED_KEYS = ["docstring", "docstring_tokens"]

    def generate_urls(self, split_name, language):
        yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip"
        yield "dataset", "dataset.zip"

    def get_data_files(self, split_name, file_paths, language):
        language_specific_path = file_paths["language"]
        final_path = os.path.join(language_specific_path, language, "final")
        # Make some cleanup to save space
        for path in os.listdir(final_path):
            if path.endswith(".pkl"):
                os.unlink(os.path.join(final_path, path))

        data_files = []
        for root, dirs, files in os.walk(final_path):
            for file in files:
                temp = os.path.join(root, file)
                if ".jsonl" in temp:
                    if split_name in temp:
                        data_files.append(temp)
        return data_files

    def post_process(self, split_name, language, js):
        return js

    def _generate_examples(self, split_name, file_paths, language):
        import gzip

        data_set_path = file_paths["dataset"]

        data_files = self.get_data_files(split_name, file_paths, language)

        urls = {}
        f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"]
        if self.SINGLE_LANGUAGE:
            del f1_path_parts[2]

        f1_path = os.path.join(*f1_path_parts)
        with open(f1_path, encoding="utf-8") as f1:
            for line in f1:
                line = line.strip()
                urls[line] = True

        idx = 0
        for file in data_files:
            if ".gz" in file:
                f = gzip.open(file)
            else:
                f = open(file, encoding="utf-8")

            for line in f:
                line = line.strip()
                js = json.loads(line)
                if js["url"] in urls:
                    js["id"] = idx
                    js = self.post_process(split_name, language, js)
                    if "partition" in js:
                        del js["partition"]
                    yield idx, js
                    idx += 1
            f.close()
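
The post_process override in CodeXGlueTcNLCodeSearchAdvImpl above renames CodeSearchNet's function/function_tokens fields to code/code_tokens and back-fills any declared feature the record lacks. A small hedged sanity check on a toy record follows; post_process only reads the class-level _FEATURES, so the class itself can stand in for an instance here.

raw = {
    "function": "def add(a, b):\n    return a + b",
    "function_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
    "repo": "octocat/hello-world",  # hypothetical repo name, for illustration only
}

processed = CodeXGlueTcNLCodeSearchAdvImpl.post_process(
    CodeXGlueTcNLCodeSearchAdvImpl, "test", "python", dict(raw)
)
assert "code" in processed and "function" not in processed  # keys renamed
assert processed["score"] == -1                             # missing score back-filled with -1
assert processed["docstring"] == ""                         # other missing fields default to ""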
Example #12
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "ID": datasets.Value("string"),
                 "Text": datasets.Value("string"),
                 "Pronoun": datasets.Value("string"),
                 "Pronoun-offset": datasets.Value("int32"),
                 "A": datasets.Value("string"),
                 "A-offset": datasets.Value("int32"),
                 "A-coref": datasets.Value("bool"),
                 "B": datasets.Value("string"),
                 "B-offset": datasets.Value("int32"),
                 "B-coref": datasets.Value("bool"),
                 "URL": datasets.Value("string"),
             }
         ),
         supervised_keys=None,
         homepage="https://github.com/google-research-datasets/gap-coreference",
         citation=_CITATION,
     )
Example #13
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "id":
             datasets.Value("string"),
             "source":
             datasets.Value("string"),
             "tokens":
             datasets.Sequence(datasets.Value("string")),
             "ner_tags":
             datasets.Sequence(
                 datasets.features.ClassLabel(names=[
                     "O",
                     "B-LOC",
                     "I-LOC",
                     "B-LOCderiv",
                     "I-LOCderiv",
                     "B-LOCpart",
                     "I-LOCpart",
                     "B-ORG",
                     "I-ORG",
                     "B-ORGderiv",
                     "I-ORGderiv",
                     "B-ORGpart",
                     "I-ORGpart",
                     "B-OTH",
                     "I-OTH",
                     "B-OTHderiv",
                     "I-OTHderiv",
                     "B-OTHpart",
                     "I-OTHpart",
                     "B-PER",
                     "I-PER",
                     "B-PERderiv",
                     "I-PERderiv",
                     "B-PERpart",
                     "I-PERpart",
                 ])),
             "nested_ner_tags":
             datasets.Sequence(
                 datasets.features.ClassLabel(names=[
                     "O",
                     "B-LOC",
                     "I-LOC",
                     "B-LOCderiv",
                     "I-LOCderiv",
                     "B-LOCpart",
                     "I-LOCpart",
                     "B-ORG",
                     "I-ORG",
                     "B-ORGderiv",
                     "I-ORGderiv",
                     "B-ORGpart",
                     "I-ORGpart",
                     "B-OTH",
                     "I-OTH",
                     "B-OTHderiv",
                     "I-OTHderiv",
                     "B-OTHpart",
                     "I-OTHpart",
                     "B-PER",
                     "I-PER",
                     "B-PERderiv",
                     "I-PERderiv",
                     "B-PERpart",
                     "I-PERpart",
                 ])),
         }),
         supervised_keys=None,
         homepage="https://sites.google.com/site/germeval2014ner/",
         citation=_CITATION,
     )
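
Because ner_tags is declared as a Sequence of ClassLabel, loaded examples store integer tag ids. A hedged sketch of mapping them back to tag strings, assuming this builder is the one published on the Hub as germeval_14:

from datasets import load_dataset

ds = load_dataset("germeval_14", split="train")
tag_feature = ds.features["ner_tags"].feature  # the inner ClassLabel of the Sequence

example = ds[0]
tags = [tag_feature.int2str(tag_id) for tag_id in example["ner_tags"]]
print(list(zip(example["tokens"], tags)))  # list of (token, tag) pairs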
Example #14
 def _info(self):
     # TODO(quartz): Specifies the datasets.DatasetInfo object
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # datasets.features.FeatureConnectors
         features=datasets.Features({
             # These are the features of your dataset like images, labels ...
             "id":
             datasets.Value("string"),
             "question":
             datasets.Value("string"),
             "choices":
             datasets.features.Sequence({
                 "text": datasets.Value("string"),
                 "label": datasets.Value("string")
             }),
             "answerKey":
             datasets.Value("string"),
             "para":
             datasets.Value("string"),
             "para_id":
             datasets.Value("string"),
             "para_anno": {
                 "effect_prop": datasets.Value("string"),
                 "cause_dir_str": datasets.Value("string"),
                 "effect_dir_str": datasets.Value("string"),
                 "cause_dir_sign": datasets.Value("string"),
                 "effect_dir_sign": datasets.Value("string"),
                 "cause_prop": datasets.Value("string"),
             },
             "question_anno": {
                 "more_effect_dir": datasets.Value("string"),
                 "less_effect_dir": datasets.Value("string"),
                 "less_cause_prop": datasets.Value("string"),
                 "more_effect_prop": datasets.Value("string"),
                 "less_effect_prop": datasets.Value("string"),
                 "less_cause_dir": datasets.Value("string"),
             },
         }),
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage="https://allenai.org/data/quartz",
         citation=_CITATION,
     )
Example #15
    def _info(self):
        """
        Specify the datasets.DatasetInfo object which contains information and typings for the dataset.
        """

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types.
            features=datasets.Features({
                "swda_filename":
                datasets.Value("string"),
                "ptb_basename":
                datasets.Value("string"),
                "conversation_no":
                datasets.Value("int64"),
                "transcript_index":
                datasets.Value("int64"),
                "act_tag":
                datasets.ClassLabel(num_classes=217, names=_ACT_TAGS),
                "damsl_act_tag":
                datasets.ClassLabel(num_classes=43, names=_DAMSL_ACT_TAGS),
                "caller":
                datasets.Value("string"),
                "utterance_index":
                datasets.Value("int64"),
                "subutterance_index":
                datasets.Value("int64"),
                "text":
                datasets.Value("string"),
                "pos":
                datasets.Value("string"),
                "trees":
                datasets.Value("string"),
                "ptb_treenumbers":
                datasets.Value("string"),
                "talk_day":
                datasets.Value("string"),
                "length":
                datasets.Value("int64"),
                "topic_description":
                datasets.Value("string"),
                "prompt":
                datasets.Value("string"),
                "from_caller":
                datasets.Value("int64"),
                "from_caller_sex":
                datasets.Value("string"),
                "from_caller_education":
                datasets.Value("int64"),
                "from_caller_birth_year":
                datasets.Value("int64"),
                "from_caller_dialect_area":
                datasets.Value("string"),
                "to_caller":
                datasets.Value("int64"),
                "to_caller_sex":
                datasets.Value("string"),
                "to_caller_education":
                datasets.Value("int64"),
                "to_caller_birth_year":
                datasets.Value("int64"),
                "to_caller_dialect_area":
                datasets.Value("string"),
            }),
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )
Example #16
 def _info(self):
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "id": datasets.Value("int32"),
                 "dialogId": datasets.Value("int32"),
                 "context": datasets.Value("string"),
                 "users": [{"userType": datasets.Value("string"), "id": datasets.Value("string")}],
                 "evaluation": [
                     {
                         "breadth": datasets.Value("int32"),
                         "userId": datasets.Value("string"),
                         "quality": datasets.Value("int32"),
                         "engagement": datasets.Value("int32"),
                     }
                 ],
                 "thread": [
                     {
                         "evaluation": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "userId": datasets.Value("string"),
                         "time": datasets.Value("int32"),
                     }
                 ],
             }
         ),
         supervised_keys=None,
         homepage="https://github.com/DeepPavlov/convai/tree/master/2017",
     )
Example #17
 def _info(self):
     features = datasets.Features(
         {
             "text": datasets.Value("string"),
             "label": datasets.features.ClassLabel(
                 names=[
                     "activate_my_card",
                     "age_limit",
                     "apple_pay_or_google_pay",
                     "atm_support",
                     "automatic_top_up",
                     "balance_not_updated_after_bank_transfer",
                     "balance_not_updated_after_cheque_or_cash_deposit",
                     "beneficiary_not_allowed",
                     "cancel_transfer",
                     "card_about_to_expire",
                     "card_acceptance",
                     "card_arrival",
                     "card_delivery_estimate",
                     "card_linking",
                     "card_not_working",
                     "card_payment_fee_charged",
                     "card_payment_not_recognised",
                     "card_payment_wrong_exchange_rate",
                     "card_swallowed",
                     "cash_withdrawal_charge",
                     "cash_withdrawal_not_recognised",
                     "change_pin",
                     "compromised_card",
                     "contactless_not_working",
                     "country_support",
                     "declined_card_payment",
                     "declined_cash_withdrawal",
                     "declined_transfer",
                     "direct_debit_payment_not_recognised",
                     "disposable_card_limits",
                     "edit_personal_details",
                     "exchange_charge",
                     "exchange_rate",
                     "exchange_via_app",
                     "extra_charge_on_statement",
                     "failed_transfer",
                     "fiat_currency_support",
                     "get_disposable_virtual_card",
                     "get_physical_card",
                     "getting_spare_card",
                     "getting_virtual_card",
                     "lost_or_stolen_card",
                     "lost_or_stolen_phone",
                     "order_physical_card",
                     "passcode_forgotten",
                     "pending_card_payment",
                     "pending_cash_withdrawal",
                     "pending_top_up",
                     "pending_transfer",
                     "pin_blocked",
                     "receiving_money",
                     "Refund_not_showing_up",
                     "request_refund",
                     "reverted_card_payment?",
                     "supported_cards_and_currencies",
                     "terminate_account",
                     "top_up_by_bank_transfer_charge",
                     "top_up_by_card_charge",
                     "top_up_by_cash_or_cheque",
                     "top_up_failed",
                     "top_up_limits",
                     "top_up_reverted",
                     "topping_up_by_card",
                     "transaction_charged_twice",
                     "transfer_fee_charged",
                     "transfer_into_account",
                     "transfer_not_received_by_recipient",
                     "transfer_timing",
                     "unable_to_verify_identity",
                     "verify_my_identity",
                     "verify_source_of_funds",
                     "verify_top_up",
                     "virtual_card_not_working",
                     "visa_or_mastercard",
                     "why_verify_identity",
                     "wrong_amount_of_cash_received",
                     "wrong_exchange_rate_for_cash_withdrawal",
                 ]
             ),
         }
     )
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
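
For a flat ClassLabel feature like the one above, the names list defines the integer-to-string mapping. A hedged usage sketch, assuming this config is the one published on the Hub as banking77:

from datasets import load_dataset

ds = load_dataset("banking77", split="test")
label_feature = ds.features["label"]

print(label_feature.num_classes)              # 77 intent classes
print(label_feature.int2str(ds[0]["label"]))  # string name of the first example's intent

# Filter to a single intent by converting its name to the stored integer id.
pin_id = label_feature.str2int("change_pin")
pin_examples = ds.filter(lambda ex: ex["label"] == pin_id)
print(len(pin_examples))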
Example #18
    def _info(self):
        features = {
            feature: datasets.Value("string")
            for feature in self.config.features
        }
        if self.config.name.startswith("wsc"):
            features["span1_index"] = datasets.Value("int32")
            features["span2_index"] = datasets.Value("int32")
        if self.config.name == "wic":
            features["start1"] = datasets.Value("int32")
            features["start2"] = datasets.Value("int32")
            features["end1"] = datasets.Value("int32")
            features["end2"] = datasets.Value("int32")
        if self.config.name == "multirc":
            features["idx"] = dict({
                "paragraph": datasets.Value("int32"),
                "question": datasets.Value("int32"),
                "answer": datasets.Value("int32"),
            })
        elif self.config.name == "record":
            features["idx"] = dict({
                "passage": datasets.Value("int32"),
                "query": datasets.Value("int32"),
            })
        else:
            features["idx"] = datasets.Value("int32")

        if self.config.name == "record":
            # Entities are the set of possible choices for the placeholder.
            features["entities"] = datasets.features.Sequence(
                datasets.Value("string"))
            # Answers are the subset of entities that are correct.
            features["answers"] = datasets.features.Sequence(
                datasets.Value("string"))
        else:
            features["label"] = datasets.features.ClassLabel(
                names=self.config.label_classes)

        return datasets.DatasetInfo(
            description=_GLUE_DESCRIPTION + self.config.description,
            features=datasets.Features(features),
            homepage=self.config.url,
            citation=self.config.citation + "\n" + _SUPER_GLUE_CITATION,
        )
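
A hedged sketch of how the config-dependent idx feature above looks once loaded, using the public super_glue dataset (printed values are illustrative):

from datasets import load_dataset

boolq = load_dataset("super_glue", "boolq", split="validation")
print(boolq.features["idx"])    # plain Value("int32") for most configs
print(boolq[0]["idx"])          # a single integer

multirc = load_dataset("super_glue", "multirc", split="validation")
print(multirc.features["idx"])  # nested dict of paragraph / question / answer indices
print(multirc[0]["idx"])        # e.g. {"paragraph": 0, "question": 0, "answer": 0}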
Example #19
    def _info(self):
        if self.config.name == "evaluation_dataset":
            features = datasets.Features({
                "stackoverflow_id":
                datasets.Value("int32"),
                "question":
                datasets.Value("string"),
                "question_url":
                datasets.Value("string"),
                "question_author":
                datasets.Value("string"),
                "question_author_url":
                datasets.Value("string"),
                "answer":
                datasets.Value("string"),
                "answer_url":
                datasets.Value("string"),
                "answer_author":
                datasets.Value("string"),
                "answer_author_url":
                datasets.Value("string"),
                "examples":
                datasets.features.Sequence(datasets.Value("int32")),
                "examples_url":
                datasets.features.Sequence(datasets.Value("string")),
            })
        else:
            features = datasets.Features({
                "id":
                datasets.Value("int32"),
                "filepath":
                datasets.Value("string"),
                "method_name":
                datasets.Value("string"),
                "start_line":
                datasets.Value("int32"),
                "end_line":
                datasets.Value("int32"),
                "url":
                datasets.Value("string"),
            })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Example #20
 def _info(self):
     span_features = {
         "start": datasets.Value("int32"),
         "end": datasets.Value("int32"),
         "string": datasets.Value("string"),
     }
     reference_features = {
         "start": datasets.Value("int32"),
         "end": datasets.Value("int32"),
         "bridge": datasets.Value("bool_"),
         "string": datasets.Value("string"),
     }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(
             {
                 "example_id": datasets.Value("int64"),
                 "title_text": datasets.Value("string"),
                 "url": datasets.Value("string"),
                 "question": datasets.Value("string"),
                 "paragraph_text": datasets.Value("string"),
                 "sentence_starts": datasets.Sequence(datasets.Value("int32")),
                 "original_nq_answers": [span_features],
                 "annotation": {
                     "referential_equalities": [
                         {
                             "question_reference": span_features,
                             "sentence_reference": reference_features,
                         }
                     ],
                     "answer": [
                         {
                             "sentence_reference": reference_features,
                             "paragraph_reference": span_features,
                         }
                     ],
                     "explanation_type": datasets.Value("string"),
                     "selected_sentence": span_features,
                 },
             }
         ),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )
Example #21
}
"""

_DESCRIPTION = """\
A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.
It is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace("[n]", "\n")`) and then use them for your purposes.
"""

_HOMEPAGE = "https://github.com/hooshvare/pn-summary"
_LICENSE = "MIT License"

_URLs = {
    "1.0.0": {
        "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
        "features": [
            {"name": "id", "type": datasets.Value("string")},
            {"name": "title", "type": datasets.Value("string")},
            {"name": "article", "type": datasets.Value("string")},
            {"name": "summary", "type": datasets.Value("string")},
            {
                "name": "category",
                "type": datasets.ClassLabel(
                    names=[
                        "Economy",
                        "Roads-Urban",
                        "Banking-Insurance",
                        "Agriculture",
                        "International",
                        "Oil-Energy",
                        "Industry",
                        "Transportation",
Example #22
    def _info(self):

        if self.config.name == "dialogue_domain":
            features = datasets.Features({
                "dial_id":
                datasets.Value("string"),
                "doc_id":
                datasets.Value("string"),
                "domain":
                datasets.Value("string"),
                "turns": [{
                    "turn_id":
                    datasets.Value("int32"),
                    "role":
                    datasets.Value("string"),
                    "da":
                    datasets.Value("string"),
                    "references": [{
                        "sp_id": datasets.Value("string"),
                        "label": datasets.Value("string"),
                    }],
                    "utterance":
                    datasets.Value("string"),
                }],
            })
        elif self.config.name == "document_domain":
            features = datasets.Features({
                "domain":
                datasets.Value("string"),
                "doc_id":
                datasets.Value("string"),
                "title":
                datasets.Value("string"),
                "doc_text":
                datasets.Value("string"),
                "spans": [{
                    "id_sp": datasets.Value("string"),
                    "tag": datasets.Value("string"),
                    "start_sp": datasets.Value("int32"),
                    "end_sp": datasets.Value("int32"),
                    "text_sp": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "parent_titles": datasets.Value("string"),
                    "id_sec": datasets.Value("string"),
                    "start_sec": datasets.Value("int32"),
                    "text_sec": datasets.Value("string"),
                    "end_sec": datasets.Value("int32"),
                }],
                "doc_html_ts":
                datasets.Value("string"),
                "doc_html_raw":
                datasets.Value("string"),
            })
        elif self.config.name == "doc2dial_rc":
            features = datasets.Features({
                "id":
                datasets.Value("string"),
                "title":
                datasets.Value("string"),
                "context":
                datasets.Value("string"),
                "question":
                datasets.Value("string"),
                "answers":
                datasets.features.Sequence({
                    "text":
                    datasets.Value("string"),
                    "answer_start":
                    datasets.Value("int32"),
                }),
                "domain":
                datasets.Value("string"),
            })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )
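
Because the features differ per config, the config name must be passed explicitly when loading. A hedged sketch, assuming this builder is the doc2dial dataset on the Hub:

from datasets import load_dataset

# The doc2dial_rc config exposes a SQuAD-style reading-comprehension view, while
# dialogue_domain / document_domain expose the raw dialogue and document records.
rc = load_dataset("doc2dial", "doc2dial_rc", split="validation")
print(rc.features["answers"])  # Sequence of {"text", "answer_start"}
print(rc[0]["question"])

dialogues = load_dataset("doc2dial", "dialogue_domain", split="train")
print(dialogues[0]["turns"][0]["utterance"])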
Example #23
 def _info(self):
     if self.config.name == "trex":
         features = datasets.Features({
             "uuid":
             datasets.Value("string"),
             "obj_uri":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "sub_uri":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "predicate_id":
             datasets.Value("string"),
             "sub_surface":
             datasets.Value("string"),
             "obj_surface":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "template":
             datasets.Value("string"),
             "template_negated":
             datasets.Value("string"),
             "label":
             datasets.Value("string"),
             "description":
             datasets.Value("string"),
             "type":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "conceptnet":
         features = datasets.Features({
             "uuid":
             datasets.Value("string"),
             "sub":
             datasets.Value("string"),
             "obj":
             datasets.Value("string"),
             "pred":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "negated":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "squad":
         features = datasets.Features({
             "id":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "negated":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
     elif self.config.name == "google_re":
         features = datasets.Features({
             "pred":
             datasets.Value("string"),
             "sub":
             datasets.Value("string"),
             "obj":
             datasets.Value("string"),
             "evidences":
             datasets.Value("string"),
             "judgments":
             datasets.Value("string"),
             "sub_w":
             datasets.Value("string"),
             "sub_label":
             datasets.Value("string"),
             "sub_aliases":
             datasets.Value("string"),
             "obj_w":
             datasets.Value("string"),
             "obj_label":
             datasets.Value("string"),
             "obj_aliases":
             datasets.Value("string"),
             "uuid":
             datasets.Value("string"),
             "masked_sentence":
             datasets.Value("string"),
             "template":
             datasets.Value("string"),
             "template_negated":
             datasets.Value("string"),
         })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
             supervised_keys=None,
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
         )
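The branches on self.config.name above assume the builder declares one BuilderConfig per subset. A minimal sketch of such a declaration follows; the version string is an assumption, not taken from the original loader.

# Hedged sketch: one BuilderConfig per subset name used in the branches above.
import datasets

LAMA_CONFIGS = [
    datasets.BuilderConfig(name=name, version=datasets.Version("1.1.0"))
    for name in ("trex", "conceptnet", "squad", "google_re")
]
print([config.name for config in LAMA_CONFIGS])  # ['trex', 'conceptnet', 'squad', 'google_re']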
Example #24
            f"https://the-eye.eu/public/AI/pile/train/{i:0>2}.jsonl.zst"
            for i in range(30)
        ],
        "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
        "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
    },
    "free_law":
    "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    "pubmed_central":
    "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz",
}

_FEATURES = {
    "all":
    datasets.Features({
        "text": datasets.Value("string"),
        "meta": {
            "pile_set_name": datasets.Value("string")
        },
    }),
    "free_law":
    datasets.Features({
        "text": datasets.Value("string"),
        "meta": {
            "case_ID": datasets.Value("string"),
            "case_jurisdiction": datasets.Value("string"),
            "date_created": datasets.Value("string"),
        },
    }),
    "pubmed_central":
    datasets.Features({
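A mapping from config name to datasets.Features, like the _FEATURES dict shown above, is typically consumed inside _info() by indexing it with self.config.name. Below is a reduced, self-contained sketch of that pattern; the names and values are illustrative only, not the Pile loader's actual code.

# Illustrative sketch of the config-name -> Features lookup pattern.
import datasets

_EXAMPLE_FEATURES = {
    "free_law": datasets.Features({
        "text": datasets.Value("string"),
        "meta": {"case_ID": datasets.Value("string")},
    }),
}

config_name = "free_law"  # inside the builder this would be self.config.name
info = datasets.DatasetInfo(
    description="sketch only",
    features=_EXAMPLE_FEATURES[config_name],
)
print(info.features)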
Example #25
 def features(self):
     if self.name == "simplified":
         return {
             "text":
             datasets.Value("string"),
             "labels":
             datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)),
             "id":
             datasets.Value("string"),
         }
     elif self.name == "raw":
         d = {
             "text": datasets.Value("string"),
             "id": datasets.Value("string"),
             "author": datasets.Value("string"),
             "subreddit": datasets.Value("string"),
             "link_id": datasets.Value("string"),
             "parent_id": datasets.Value("string"),
             "created_utc": datasets.Value("float"),
             "rater_id": datasets.Value("int32"),
             "example_very_unclear": datasets.Value("bool"),
         }
         d.update(
             {label: datasets.Value("int32")
              for label in _CLASS_NAMES})
         return d
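This features property returns a plain dict, so the builder's _info() would normally wrap it, e.g. datasets.Features(self.config.features). The sketch below, with a deliberately shortened _CLASS_NAMES list (an assumption for brevity), shows how the "simplified" schema stores labels as ClassLabel indices.

# Hedged sketch: the "simplified" schema with a reduced label list.
import datasets

_CLASS_NAMES = ["admiration", "amusement", "neutral"]  # reduced for illustration

features = datasets.Features({
    "text": datasets.Value("string"),
    "labels": datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)),
    "id": datasets.Value("string"),
})

ds = datasets.Dataset.from_dict(
    {"text": ["that was great!"], "labels": [[1]], "id": ["eg1"]},
    features=features,
)
# ClassLabel stores integer indices; int2str maps them back to names.
print(ds.features["labels"].feature.int2str(ds[0]["labels"]))  # ['amusement']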
Example #26
 def _info(self):
     # TODO: This method specifies the datasets.DatasetInfo object, which contains the information and typings for the dataset
     features = datasets.Features(
         {
             "id": datasets.Value("int32"),
             "category": datasets.Value("string"),
             "text": datasets.Value("string"),
             "ner": datasets.features.Sequence(
                 {
                     "source": {
                         "from": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "to": datasets.Value("int32"),
                         "type": datasets.features.ClassLabel(
                             names=[
                                 "PRODUCT_NAME",
                                 "PRODUCT_NAME_IMP",
                                 "PRODUCT_NO_BRAND",
                                 "BRAND_NAME",
                                 "BRAND_NAME_IMP",
                                 "VERSION",
                                 "PRODUCT_ADJ",
                                 "BRAND_ADJ",
                                 "LOCATION",
                                 "LOCATION_IMP",
                             ]
                         ),
                     },
                     "target": {
                         "from": datasets.Value("int32"),
                         "text": datasets.Value("string"),
                         "to": datasets.Value("int32"),
                         "type": datasets.features.ClassLabel(
                             names=[
                                 "PRODUCT_NAME",
                                 "PRODUCT_NAME_IMP",
                                 "PRODUCT_NO_BRAND",
                                 "BRAND_NAME",
                                 "BRAND_NAME_IMP",
                                 "VERSION",
                                 "PRODUCT_ADJ",
                                 "BRAND_ADJ",
                                 "LOCATION",
                                 "LOCATION_IMP",
                             ]
                         ),
                     },
                 }
             ),
         }
     )
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # This defines the different columns of the dataset and their types
         features=features,  # Here we define them above because they are different between the two configurations
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_HOMEPAGE,
         # License for the dataset if available
         license=_LICENSE,
         # Citation for the dataset
         citation=_CITATION,
     )
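The ClassLabel name list above is written out twice, once for "source" and once for "target". A hedged refactor sketch that hoists the list into one constant so the two cannot drift apart; the constant and helper names are invented for illustration, not the original loader's.

# Hedged refactor sketch: share one ClassLabel name list between source and target.
import datasets

_NER_TYPES = [
    "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
    "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
    "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
]

def _mention():
    # One mention span: character offsets, surface text, and its entity type.
    return {
        "from": datasets.Value("int32"),
        "text": datasets.Value("string"),
        "to": datasets.Value("int32"),
        "type": datasets.features.ClassLabel(names=_NER_TYPES),
    }

features = datasets.Features({
    "id": datasets.Value("int32"),
    "category": datasets.Value("string"),
    "text": datasets.Value("string"),
    "ner": datasets.features.Sequence({"source": _mention(), "target": _mention()}),
})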
Example #27
    def _info(self):

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "document": {
                    "id": datasets.Value("string"),
                    "kind": datasets.Value("string"),
                    "url": datasets.Value("string"),
                    "file_size": datasets.Value("int32"),
                    "word_count": datasets.Value("int32"),
                    "start": datasets.Value("string"),
                    "end": datasets.Value("string"),
                    "summary": {
                        "text":
                        datasets.Value("string"),
                        "tokens":
                        datasets.features.Sequence(datasets.Value("string")),
                        "url":
                        datasets.Value("string"),
                        "title":
                        datasets.Value("string"),
                    },
                    "text": datasets.Value("string"),
                },
                "question": {
                    "text": datasets.Value("string"),
                    "tokens":
                    datasets.features.Sequence(datasets.Value("string")),
                },
                "answers": [{
                    "text":
                    datasets.Value("string"),
                    "tokens":
                    datasets.features.Sequence(datasets.Value("string")),
                }],
            }),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
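Note the two nesting styles above: "answers" uses the list-of-dict form [{...}], which keeps a genuine list of dicts per example, while Sequence({...}) flattens into a dict of lists. A standalone sketch of the list-of-dict behaviour with toy data (not the loader above):

# Hedged sketch: a feature written as [{...}] yields a list of dicts per example.
import datasets

features = datasets.Features({
    "answers": [{
        "text": datasets.Value("string"),
        "tokens": datasets.features.Sequence(datasets.Value("string")),
    }],
})

ds = datasets.Dataset.from_dict(
    {"answers": [[{"text": "a ghost", "tokens": ["a", "ghost"]}]]},
    features=features,
)
print(ds[0]["answers"])  # [{'text': 'a ghost', 'tokens': ['a', 'ghost']}]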
Example #28
 def _info(self):
     # TODO: This method specifies the datasets.DatasetInfo object, which contains the information and typings for the dataset
     features = datasets.Features({
         "full_text":
         datasets.Value("string"),
         "text_translation":
         datasets.Value("string"),
         "screen_name":
         datasets.Value("string"),
         "description":
         datasets.Value("string"),
         "desc_translation":
         datasets.Value("string"),
         "location":
         datasets.Value("string"),
         "weekofyear":
         datasets.Value("int64"),
         "weekday":
         datasets.Value("int64"),
         "month":
         datasets.Value("int64"),
         "year":
         datasets.Value("int64"),
         "day":
         datasets.Value("int64"),
         "point_info":
         datasets.Value("string"),
         "point":
         datasets.Value("string"),
         "latitude":
         datasets.Value("float64"),
         "longitude":
         datasets.Value("float64"),
         "altitude":
         datasets.Value("float64"),
         "province":
         datasets.Value("string"),
         "hisco_standard":
         datasets.Value("string"),
         "hisco_code":
         datasets.Value("string"),
         "industry":
         datasets.Value("bool_"),
         "sentiment_pattern":
         datasets.Value("float64"),
         "subjective_pattern":
         datasets.Value("float64"),
         "label":
         datasets.ClassLabel(num_classes=3,
                             names=["neg", "neu", "pos"],
                             names_file=None,
                             id=None),
     })
     return datasets.DatasetInfo(
         # This is the description that will appear on the datasets page.
         description=_DESCRIPTION,
         # This defines the different columns of the dataset and their types
         features=features,  # Here we define them above because they are different between the two configurations
         # If there's a common (input, target) tuple from the features,
         # specify them here. They'll be used if as_supervised=True in
         # builder.as_dataset.
         supervised_keys=None,
         # Homepage of the dataset for documentation
         homepage=_HOMEPAGE,
         # License for the dataset if available
         license=_LICENSE,
         # Citation for the dataset
         citation=_CITATION,
     )
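A small side note on the label feature above: ClassLabel accepts both num_classes and names as long as the two agree, and it provides str2int/int2str for converting between them.

# Hedged sketch of the ClassLabel defined above, outside the builder.
import datasets

label = datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
print(label.str2int("pos"))  # 2
print(label.int2str(0))      # 'neg'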
Example #29
 def _info(self):
     if self.config.name == "schema":
         features = datasets.Features({
             "service_name":
             datasets.Value("string"),
             "description":
             datasets.Value("string"),
             "slots":
             datasets.Sequence({
                 "name":
                 datasets.Value("string"),
                 "description":
                 datasets.Value("string"),
                 "is_categorical":
                 datasets.Value("bool"),
                 "possible_values":
                 datasets.Sequence(datasets.Value("string")),
             }),
             "intents":
             datasets.Sequence(
                 {
                     "name":
                     datasets.Value("string"),
                     "description":
                     datasets.Value("string"),
                     "is_transactional":
                     datasets.Value("bool"),
                     "required_slots":
                     datasets.Sequence(datasets.Value("string")),
                     # optional_slots was originally a dictionary
                     "optional_slots":
                     datasets.Sequence(
                         {
                             "slot_name": datasets.Value("string"),
                             "slot_value": datasets.Value("string"),
                         }),
                     "result_slots":
                     datasets.Sequence(datasets.Value("string")),
                 }, ),
         })
     else:
         features = datasets.Features({
             "dialogue_id":
             datasets.Value("string"),
             "services":
             datasets.Sequence(datasets.Value("string")),
             "turns":
             datasets.Sequence({
                 "speaker":
                 datasets.ClassLabel(names=["USER", "SYSTEM"]),
                 "utterance":
                 datasets.Value("string"),
                 "frames":
                 datasets.Sequence({
                     "service":
                     datasets.Value("string"),
                     "slots":
                     datasets.Sequence({
                         "slot":
                         datasets.Value("string"),
                         "start":
                         datasets.Value("int32"),
                         "exclusive_end":
                         datasets.Value("int32"),
                     }),
                     # optional
                     "state": {
                         "active_intent":
                         datasets.Value("string"),
                         "requested_slots":
                         datasets.Sequence(datasets.Value("string")),
                         # slot_values was originally a dictionary
                         "slot_values":
                         datasets.Sequence({
                             "slot_name":
                             datasets.Value("string"),
                             "slot_value_list":
                             datasets.Sequence(datasets.Value("string")),
                         }),
                     },
                     "actions":
                     datasets.Sequence({
                         "act":
                         datasets.ClassLabel(names=_ALL_ACTS),
                         # optional
                         "slot":
                         datasets.Value("string"),
                         # optional
                         "canonical_values":
                         datasets.Sequence(datasets.Value("string")),
                         # optional
                         "values":
                         datasets.Sequence(datasets.Value("string")),
                     }),
                     # optional
                     "service_results":
                     datasets.Sequence(
                         # Arrow doesn't like Sequences of Sequences for default values so we need a Sequence of Features of Sequences
                         {
                             "service_results_list":
                             datasets.Sequence(
                                 # originally each list item was a dictionary (optional)
                                 {
                                     "service_slot_name":
                                     datasets.Value("string"),
                                     "service_canonical_value":
                                     datasets.Value("string"),
                                 })
                         }),
                     # optional
                     "service_call": {
                         "method":
                         datasets.Value("string"),
                         # parameters was originally a dictionary
                         "parameters":
                         datasets.Sequence({
                             "parameter_slot_name":
                             datasets.Value("string"),
                             "parameter_canonical_value":
                             datasets.Value("string"),
                         }),
                     },
                 }),
             }),
         })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # Here we define them above because they are different between the two configurations
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
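The recurring comments above ("optional_slots was originally a dictionary", "slot_values was originally a dictionary") point at a common workaround: arbitrary-keyed dicts do not fit a fixed Arrow schema, so they are reshaped into parallel name/value lists. A hedged sketch of that conversion; the helper name is invented, not the original script's.

# Hedged sketch of reshaping a dict into parallel name/value lists for a Sequence feature.
def dict_to_slot_values(slot_values):
    # {"restaurant_name": ["Sala Thong"]} -> parallel "slot_name"/"slot_value_list" lists
    return {
        "slot_name": list(slot_values.keys()),
        "slot_value_list": [list(values) for values in slot_values.values()],
    }

print(dict_to_slot_values({"restaurant_name": ["Sala Thong", "Sala"]}))
# {'slot_name': ['restaurant_name'], 'slot_value_list': [['Sala Thong', 'Sala']]}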
Example #30
 def _info(self):
     features = datasets.Features({
         "dialogue_id":
         datasets.Value("string"),
         "services":
         datasets.Sequence(datasets.Value("string")),
         "turns":
         datasets.Sequence({
             "turn_id":
             datasets.Value("string"),
             "speaker":
             datasets.ClassLabel(names=["USER", "SYSTEM"]),
             "utterance":
             datasets.Value("string"),
             "frames":
             datasets.Sequence({
                 "service":
                 datasets.Value("string"),
                 "state": {
                     "active_intent":
                     datasets.Value("string"),
                     "requested_slots":
                     datasets.Sequence(datasets.Value("string")),
                     "slots_values":
                     datasets.Sequence({
                         "slots_values_name":
                         datasets.Value("string"),
                         "slots_values_list":
                         datasets.Sequence(datasets.Value("string")),
                     }),
                 },
                 "slots":
                 datasets.Sequence({
                     "slot":
                     datasets.Value("string"),
                     "value":
                     datasets.Value("string"),
                     "start":
                     datasets.Value("int32"),
                     "exclusive_end":
                     datasets.Value("int32"),
                     "copy_from":
                     datasets.Value("string"),
                     "copy_from_value":
                     datasets.Sequence(datasets.Value("string")),
                 }),
             }),
             "dialogue_acts":
             datasets.Features({
                 "dialog_act":
                 datasets.Sequence({
                     "act_type":
                     datasets.Value("string"),
                     "act_slots":
                     datasets.Sequence(
                         datasets.Features({
                             "slot_name":
                             datasets.Value("string"),
                             "slot_value":
                             datasets.Value("string"),
                         }), ),
                 }),
                 "span_info":
                 datasets.Sequence({
                     "act_type":
                     datasets.Value("string"),
                     "act_slot_name":
                     datasets.Value("string"),
                     "act_slot_value":
                     datasets.Value("string"),
                     "span_start":
                     datasets.Value("int32"),
                     "span_end":
                     datasets.Value("int32"),
                 }),
             }),
         }),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # Here we define them above because they are different between the two configurations
         supervised_keys=None,
         homepage="https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2",
         license=_LICENSE,
         citation=_CITATION,
     )
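One last detail from the schema above: "dialogue_acts" nests a datasets.Features object inside the feature dict. Since Features is itself a dict subclass, a plain dict in the same position describes the same schema, as this small standalone check suggests (the reduced inner schema is an assumption for brevity).

# Hedged sketch: nesting Features vs. a plain dict produces the same Arrow schema.
import datasets

inner = {"dialog_act": datasets.Sequence({"act_type": datasets.Value("string")})}

with_features = datasets.Features({"dialogue_acts": datasets.Features(inner)})
with_plain_dict = datasets.Features({"dialogue_acts": dict(inner)})
print(with_features.arrow_schema == with_plain_dict.arrow_schema)  # expected: True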