def _info(self): features = datasets.Features({ "text": datasets.Value("string"), "sentence_offsets": datasets.features.Sequence({ "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64") }), "sentences": datasets.features.Sequence(datasets.Value("string")), "sentence_labels": datasets.features.Sequence(datasets.Value("int64")), "token_offsets": datasets.features.Sequence({ "offsets": datasets.features.Sequence({ "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64") }) }), "tokens": datasets.features.Sequence( datasets.features.Sequence(datasets.Value("string"))), "entity_labels": datasets.features.Sequence( datasets.features.Sequence( datasets.features.ClassLabel(names=[ "B-DEVICE", "B-EXPERIMENT", "B-MATERIAL", "B-VALUE", "I-DEVICE", "I-EXPERIMENT", "I-MATERIAL", "I-VALUE", "O", ]))), "slot_labels": datasets.features.Sequence( datasets.features.Sequence( datasets.features.ClassLabel(names=[ "B-anode_material", "B-cathode_material", "B-conductivity", "B-current_density", "B-degradation_rate", "B-device", "B-electrolyte_material", "B-experiment_evoking_word", "B-fuel_used", "B-interlayer_material", "B-interconnect_material", "B-open_circuit_voltage", "B-power_density", "B-resistance", "B-support_material", "B-thickness", "B-time_of_operation", "B-voltage", "B-working_temperature", "I-anode_material", "I-cathode_material", "I-conductivity", "I-current_density", "I-degradation_rate", "I-device", "I-electrolyte_material", "I-experiment_evoking_word", "I-fuel_used", "I-interlayer_material", "I-interconnect_material", "I-open_circuit_voltage", "I-power_density", "I-resistance", "I-support_material", "I-thickness", "I-time_of_operation", "I-voltage", "I-working_temperature", "O", ]))), "links": datasets.Sequence({ "relation_label": datasets.features.ClassLabel(names=[ "coreference", "experiment_variation", "same_experiment", "thickness" ]), "start_span_id": datasets.Value("int64"), "end_span_id": datasets.Value("int64"), }), "slots": datasets.features.Sequence({ "frame_participant_label": datasets.features.ClassLabel(names=[ "anode_material", "cathode_material", "current_density", "degradation_rate", "device", "electrolyte_material", "fuel_used", "interlayer_material", "open_circuit_voltage", "power_density", "resistance", "support_material", "time_of_operation", "voltage", "working_temperature", ]), "slot_id": datasets.Value("int64"), }), "spans": datasets.features.Sequence({ "span_id": datasets.Value("int64"), "entity_label": datasets.features.ClassLabel( names=["", "DEVICE", "MATERIAL", "VALUE"]), "sentence_id": datasets.Value("int64"), "experiment_mention_type": datasets.features.ClassLabel(names=[ "", "current_exp", "future_work", "general_info", "previous_work" ]), "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64"), }), "experiments": datasets.features.Sequence({ "experiment_id": datasets.Value("int64"), "span_id": datasets.Value("int64"), "slots": datasets.features.Sequence({ "frame_participant_label": datasets.features.ClassLabel(names=[ "anode_material", "cathode_material", "current_density", "degradation_rate", "conductivity", "device", "electrolyte_material", "fuel_used", "interlayer_material", "open_circuit_voltage", "power_density", "resistance", "support_material", "time_of_operation", "voltage", "working_temperature", ]), "slot_id": datasets.Value("int64"), }), }), }) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. 
description=_DESCRIPTION, # This defines the different columns of the dataset and their types features= features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "constituent_dataset": datasets.Value("string"), "id": datasets.Value("string"), "context": datasets.Value("string"), "question": datasets.Value("string"), "reference": datasets.Value("string"), "candidate": datasets.Value("string"), "score": datasets.Value("float"), "metadata": { "scores": datasets.features.Sequence(datasets.Value("int32")), "source": datasets.Value("string"), }, # features for minimal pairs "candidate2": datasets.Value("string"), "score2": datasets.Value("float"), }), supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({
            "list": datasets.Sequence(datasets.Value("float32")),
            "numbers": datasets.Value("float32"),
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("second set of iterations (after shuffling)")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
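# A small sketch (not part of the original benchmark) showing how the timings written above
# could be read back; it assumes RESULTS_FILE_PATH still points at the JSON file produced by
# benchmark_iterating().
def report_benchmark_results():
    with open(RESULTS_FILE_PATH, "rb") as f:
        times = json.loads(f.read().decode("utf-8"))
    # "num examples" is metadata rather than a timing, so it is skipped in the report.
    for name, seconds in sorted(times.items(), key=lambda kv: kv[1], reverse=True):
        if name != "num examples":
            print(f"{name}: {seconds:.3f}s")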
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "id": datasets.Value("string"), "context_id": datasets.Value("string"), "question_id": datasets.Value("string"), "domain": datasets.Value("string"), "metadata": { "author": datasets.Value("string"), "title": datasets.Value("string"), "url": datasets.Value("string"), }, "context": datasets.Value("string"), "question": datasets.Value("string"), "question_type": datasets.Value("string"), "answers": datasets.features.Sequence(datasets.Value("string"), ), "correct_answer_id": datasets.Value("int32"), }), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage="https://text-machine-lab.github.io/blog/2020/quail/", citation=_CITATION, )
def _info(self): features = datasets.Features({ "id": datasets.Value("string"), "text": datasets.Value("string"), "tokens": datasets.Sequence(datasets.Value("string")), "nps": [{ "text": datasets.Value("string"), "first_char": datasets.Value("int32"), "last_char": datasets.Value("int32"), "first_token": datasets.Value("int32"), "last_token": datasets.Value("int32"), "id": datasets.Value("string"), }], "np_relations": [{ "anchor": datasets.Value("string"), "complement": datasets.Value("string"), "preposition": datasets.features.ClassLabel(names=[ "about", "for", "with", "from", "among", "by", "on", "at", "during", "of", "member(s) of", "in", "after", "under", "to", "into", "before", "near", "outside", "around", "between", "against", "over", "inside", ]), "complement_coref_cluster_id": datasets.Value("string"), }], "coref": [{ "id": datasets.Value("string"), "members": datasets.Sequence(datasets.Value("string")), "np_type": datasets.features.ClassLabel(names=[ "standard", "time/date/measurement", "idiomatic", ]), }], "metadata": { "annotators": { "coref_worker": datasets.Value("int32"), "consolidator_worker": datasets.Value("int32"), "np-relations_worker": datasets.Sequence(datasets.Value("int32")), }, "url": datasets.Value("string"), "source": datasets.Value("string"), }, }) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types features= features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and # specify them. They'll be used if as_supervised=True in builder.as_dataset. # supervised_keys=("sentence", "label"), # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _get_feature_types(self): if self.config_name == "record": return { "predictions": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "prediction_text": datasets.Value("string"), }, "references": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "answers": datasets.Sequence(datasets.Value("string")), }, } elif self.config_name == "multirc": return { "predictions": { "idx": { "answer": datasets.Value("int64"), "paragraph": datasets.Value("int64"), "question": datasets.Value("int64"), }, "prediction": datasets.Value("int64"), }, "references": datasets.Value("int64"), } else: return { "predictions": datasets.Value("int64"), "references": datasets.Value("int64"), }
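# Hedged illustration (not from the original script): the shape of one prediction/reference
# pair that would satisfy the "record" feature types above; the ids and answer strings are
# invented purely for the example.
_example_record_prediction = {
    "idx": {"passage": 0, "query": 0},
    "prediction_text": "Paris",
}
_example_record_reference = {
    "idx": {"passage": 0, "query": 0},
    "answers": ["Paris", "the French capital"],
}
# For "multirc" the prediction carries an idx dict plus an int64 "prediction", and the
# reference is a plain int64 label; every other config uses int64 values on both sides.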
def _info(self): # TODO(xtreme): Specifies the datasets.DatasetInfo object features = {text_feature: datasets.Value("string") for text_feature in six.iterkeys(self.config.text_features)} if "answers" in features.keys(): features["answers"] = datasets.features.Sequence( {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")} ) if self.config.name.startswith("PAWS-X"): features["label"] = datasets.Value("string") if self.config.name == "XNLI": features["gold_label"] = datasets.Value("string") if self.config.name.startswith("udpos"): features = datasets.Features( { "token": datasets.Value("string"), "pos_tag": datasets.features.ClassLabel( names=[ "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", ] ), } ) if self.config.name.startswith("PAN-X"): features = datasets.Features( { "tokens": datasets.Sequence(datasets.Value("string")), "ner_tags": datasets.Sequence( datasets.features.ClassLabel( names=[ "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", ] ) ), "langs": datasets.Sequence(datasets.Value("string")), } ) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=self.config.description + "\n" + _DESCRIPTION, # datasets.features.FeatureConnectors features=datasets.Features( features # These are the features of your dataset like images, labels ... ), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url, citation=self.config.citation + "\n" + _CITATION, )
def _info(self): features = datasets.Features({ "event": datasets.Value("string"), "oEffect": datasets.Sequence(datasets.Value("string")), "oReact": datasets.Sequence(datasets.Value("string")), "oWant": datasets.Sequence(datasets.Value("string")), "xAttr": datasets.Sequence(datasets.Value("string")), "xEffect": datasets.Sequence(datasets.Value("string")), "xIntent": datasets.Sequence(datasets.Value("string")), "xNeed": datasets.Sequence(datasets.Value("string")), "xReact": datasets.Sequence(datasets.Value("string")), "xWant": datasets.Sequence(datasets.Value("string")), "prefix": datasets.Sequence(datasets.Value("string")), "split": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "qid": datasets.Value("string"), "question": datasets.Value("string"), "answer": datasets.features.Sequence( { "answer_type": datasets.Value("string"), "answer_argument": datasets.Value("string"), "entity_name": datasets.Value("string"), } ), "function": datasets.Value("string"), "num_node": datasets.Value("int32"), "num_edge": datasets.Value("int32"), "graph_query": { "nodes": datasets.features.Sequence( { "nid": datasets.Value("int32"), "node_type": datasets.Value("string"), "id": datasets.Value("string"), "class": datasets.Value("string"), "friendly_name": datasets.Value("string"), "question_node": datasets.Value("int32"), "function": datasets.Value("string"), } ), "edges": datasets.features.Sequence( { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "relation": datasets.Value("string"), "friendly_name": datasets.Value("string"), } ), }, "sparql_query": datasets.Value("string"), "domains": datasets.features.Sequence(datasets.Value("string")), "level": datasets.Value("string"), "s_expression": datasets.Value("string"), } ), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage="https://dki-lab.github.io/GrailQA/", citation=_CITATION, )
class CodeXGlueTcNLCodeSearchAdvImpl(CodeXGlueCtCodeToTextBaseImpl): LANGUAGE = "python" SINGLE_LANGUAGE = True _FEATURES = { "id": datasets.Value("int32"), # Index of the sample "repo": datasets.Value("string"), # repo: the owner/repo "path": datasets.Value("string"), # path: the full path to the original file "func_name": datasets.Value("string"), # func_name: the function or method name "original_string": datasets.Value("string"), # original_string: the raw string before tokenization or parsing "language": datasets.Value("string"), # language: the programming language "code": datasets.Value("string"), # code/function: the part of the original_string that is code "code_tokens": datasets.features.Sequence( datasets.Value("string") ), # code_tokens/function_tokens: tokenized version of code "docstring": datasets.Value( "string" ), # docstring: the top-level comment or docstring, if it exists in the original string "docstring_tokens": datasets.features.Sequence( datasets.Value("string") ), # docstring_tokens: tokenized version of docstring "sha": datasets.Value("string"), # sha of the file "url": datasets.Value("string"), # url of the file "docstring_summary": datasets.Value("string"), # Summary of the docstring "parameters": datasets.Value("string"), # parameters of the function "return_statement": datasets.Value("string"), # return statement "argument_list": datasets.Value("string"), # list of arguments of the function "identifier": datasets.Value("string"), # identifier "nwo": datasets.Value("string"), # nwo "score": datasets.Value("float"), # score for this search } def post_process(self, split_name, language, js): for suffix in "_tokens", "": key = "function" + suffix if key in js: js["code" + suffix] = js[key] del js[key] for key in self._FEATURES: if key not in js: if key == "score": js[key] = -1 else: js[key] = "" return js def generate_urls(self, split_name): for e in super().generate_urls(split_name, self.LANGUAGE): yield e def get_data_files(self, split_name, file_paths, language): if split_name == "train": return super().get_data_files(split_name, file_paths, language) else: data_set_path = file_paths["dataset"] data_file = os.path.join(data_set_path, "dataset", "test_code.jsonl") return [data_file] def _generate_examples(self, split_name, file_paths): for e in super()._generate_examples(split_name, file_paths, self.LANGUAGE): yield e
class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild):
    _DESCRIPTION = _DESCRIPTION
    _CITATION = _CITATION

    # For each file, each line in the uncompressed file represents one function.
    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language name
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
    }

    _SUPERVISED_KEYS = ["docstring", "docstring_tokens"]

    def generate_urls(self, split_name, language):
        yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip"
        yield "dataset", "dataset.zip"

    def get_data_files(self, split_name, file_paths, language):
        language_specific_path = file_paths["language"]
        final_path = os.path.join(language_specific_path, language, "final")
        # Make some cleanup to save space: drop the pickled files shipped with the archive.
        for path in os.listdir(final_path):
            if path.endswith(".pkl"):
                os.unlink(os.path.join(final_path, path))

        data_files = []
        for root, dirs, files in os.walk(final_path):
            for file in files:
                temp = os.path.join(root, file)
                if ".jsonl" in temp:
                    if split_name in temp:
                        data_files.append(temp)
        return data_files

    def post_process(self, split_name, language, js):
        return js

    def _generate_examples(self, split_name, file_paths, language):
        import gzip

        data_set_path = file_paths["dataset"]
        data_files = self.get_data_files(split_name, file_paths, language)

        urls = {}
        f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"]
        if self.SINGLE_LANGUAGE:
            del f1_path_parts[2]

        f1_path = os.path.join(*f1_path_parts)
        with open(f1_path, encoding="utf-8") as f1:
            for line in f1:
                line = line.strip()
                urls[line] = True

        idx = 0
        for file in data_files:
            if ".gz" in file:
                f = gzip.open(file)
            else:
                f = open(file, encoding="utf-8")

            for line in f:
                line = line.strip()
                js = json.loads(line)
                if js["url"] in urls:
                    js["id"] = idx
                    js = self.post_process(split_name, language, js)
                    if "partition" in js:
                        del js["partition"]
                    yield idx, js
                    idx += 1
            f.close()
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "ID": datasets.Value("string"), "Text": datasets.Value("string"), "Pronoun": datasets.Value("string"), "Pronoun-offset": datasets.Value("int32"), "A": datasets.Value("string"), "A-offset": datasets.Value("int32"), "A-coref": datasets.Value("bool"), "B": datasets.Value("string"), "B-offset": datasets.Value("int32"), "B-coref": datasets.Value("bool"), "URL": datasets.Value("string"), } ), supervised_keys=None, homepage="https://github.com/google-research-datasets/gap-coreference", citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "id": datasets.Value("string"), "source": datasets.Value("string"), "tokens": datasets.Sequence(datasets.Value("string")), "ner_tags": datasets.Sequence( datasets.features.ClassLabel(names=[ "O", "B-LOC", "I-LOC", "B-LOCderiv", "I-LOCderiv", "B-LOCpart", "I-LOCpart", "B-ORG", "I-ORG", "B-ORGderiv", "I-ORGderiv", "B-ORGpart", "I-ORGpart", "B-OTH", "I-OTH", "B-OTHderiv", "I-OTHderiv", "B-OTHpart", "I-OTHpart", "B-PER", "I-PER", "B-PERderiv", "I-PERderiv", "B-PERpart", "I-PERpart", ])), "nested_ner_tags": datasets.Sequence( datasets.features.ClassLabel(names=[ "O", "B-LOC", "I-LOC", "B-LOCderiv", "I-LOCderiv", "B-LOCpart", "I-LOCpart", "B-ORG", "I-ORG", "B-ORGderiv", "I-ORGderiv", "B-ORGpart", "I-ORGpart", "B-OTH", "I-OTH", "B-OTHderiv", "I-OTHderiv", "B-OTHpart", "I-OTHpart", "B-PER", "I-PER", "B-PERderiv", "I-PERderiv", "B-PERpart", "I-PERpart", ])), }), supervised_keys=None, homepage="https://sites.google.com/site/germeval2014ner/", citation=_CITATION, )
def _info(self): # TODO(quartz): Specifies the datasets.DatasetInfo object return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # datasets.features.FeatureConnectors features=datasets.Features({ # These are the features of your dataset like images, labels ... "id": datasets.Value("string"), "question": datasets.Value("string"), "choices": datasets.features.Sequence({ "text": datasets.Value("string"), "label": datasets.Value("string") }), "answerKey": datasets.Value("string"), "para": datasets.Value("string"), "para_id": datasets.Value("string"), "para_anno": { "effect_prop": datasets.Value("string"), "cause_dir_str": datasets.Value("string"), "effect_dir_str": datasets.Value("string"), "cause_dir_sign": datasets.Value("string"), "effect_dir_sign": datasets.Value("string"), "cause_prop": datasets.Value("string"), }, "question_anno": { "more_effect_dir": datasets.Value("string"), "less_effect_dir": datasets.Value("string"), "less_cause_prop": datasets.Value("string"), "more_effect_prop": datasets.Value("string"), "less_effect_prop": datasets.Value("string"), "less_cause_dir": datasets.Value("string"), }, }), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage="https://allenai.org/data/quartz", citation=_CITATION, )
def _info(self): """ Specify the datasets.DatasetInfo object which contains information and typings for the dataset. """ return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types. features=datasets.Features({ "swda_filename": datasets.Value("string"), "ptb_basename": datasets.Value("string"), "conversation_no": datasets.Value("int64"), "transcript_index": datasets.Value("int64"), "act_tag": datasets.ClassLabel(num_classes=217, names=_ACT_TAGS), "damsl_act_tag": datasets.ClassLabel(num_classes=43, names=_DAMSL_ACT_TAGS), "caller": datasets.Value("string"), "utterance_index": datasets.Value("int64"), "subutterance_index": datasets.Value("int64"), "text": datasets.Value("string"), "pos": datasets.Value("string"), "trees": datasets.Value("string"), "ptb_treenumbers": datasets.Value("string"), "talk_day": datasets.Value("string"), "length": datasets.Value("int64"), "topic_description": datasets.Value("string"), "prompt": datasets.Value("string"), "from_caller": datasets.Value("int64"), "from_caller_sex": datasets.Value("string"), "from_caller_education": datasets.Value("int64"), "from_caller_birth_year": datasets.Value("int64"), "from_caller_dialect_area": datasets.Value("string"), "to_caller": datasets.Value("int64"), "to_caller_sex": datasets.Value("string"), "to_caller_education": datasets.Value("int64"), "to_caller_birth_year": datasets.Value("int64"), "to_caller_dialect_area": datasets.Value("string"), }), supervised_keys=None, # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "id": datasets.Value("int32"), "dialogId": datasets.Value("int32"), "context": datasets.Value("string"), "users": [{"userType": datasets.Value("string"), "id": datasets.Value("string")}], "evaluation": [ { "breadth": datasets.Value("int32"), "userId": datasets.Value("string"), "quality": datasets.Value("int32"), "engagement": datasets.Value("int32"), } ], "thread": [ { "evaluation": datasets.Value("int32"), "text": datasets.Value("string"), "userId": datasets.Value("string"), "time": datasets.Value("int32"), } ], } ), supervised_keys=None, homepage="https://github.com/DeepPavlov/convai/tree/master/2017", )
def _info(self): features = datasets.Features( { "text": datasets.Value("string"), "label": datasets.features.ClassLabel( names=[ "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up", "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit", "beneficiary_not_allowed", "cancel_transfer", "card_about_to_expire", "card_acceptance", "card_arrival", "card_delivery_estimate", "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised", "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised", "change_pin", "compromised_card", "contactless_not_working", "country_support", "declined_card_payment", "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised", "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app", "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card", "get_physical_card", "getting_spare_card", "getting_virtual_card", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card", "passcode_forgotten", "pending_card_payment", "pending_cash_withdrawal", "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "Refund_not_showing_up", "request_refund", "reverted_card_payment?", "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge", "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted", "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account", "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity", "verify_source_of_funds", "verify_top_up", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity", "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal", ] ), } ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): features = { feature: datasets.Value("string") for feature in self.config.features } if self.config.name.startswith("wsc"): features["span1_index"] = datasets.Value("int32") features["span2_index"] = datasets.Value("int32") if self.config.name == "wic": features["start1"] = datasets.Value("int32") features["start2"] = datasets.Value("int32") features["end1"] = datasets.Value("int32") features["end2"] = datasets.Value("int32") if self.config.name == "multirc": features["idx"] = dict({ "paragraph": datasets.Value("int32"), "question": datasets.Value("int32"), "answer": datasets.Value("int32"), }) elif self.config.name == "record": features["idx"] = dict({ "passage": datasets.Value("int32"), "query": datasets.Value("int32"), }) else: features["idx"] = datasets.Value("int32") if self.config.name == "record": # Entities are the set of possible choices for the placeholder. features["entities"] = datasets.features.Sequence( datasets.Value("string")) # Answers are the subset of entities that are correct. features["answers"] = datasets.features.Sequence( datasets.Value("string")) else: features["label"] = datasets.features.ClassLabel( names=self.config.label_classes) return datasets.DatasetInfo( description=_GLUE_DESCRIPTION + self.config.description, features=datasets.Features(features), homepage=self.config.url, citation=self.config.citation + "\n" + _SUPER_GLUE_CITATION, )
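# Illustrative sketch (not part of the original loader): ClassLabel stores labels as integer
# ids, and str2int / int2str convert between the ids and the names declared above. The label
# names below are placeholders, not a real config's label_classes.
_example_label_feature = datasets.features.ClassLabel(names=["entailment", "not_entailment"])
assert _example_label_feature.str2int("entailment") == 0
assert _example_label_feature.int2str(1) == "not_entailment"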
def _info(self): if self.config.name == "evaluation_dataset": features = datasets.Features({ "stackoverflow_id": datasets.Value("int32"), "question": datasets.Value("string"), "question_url": datasets.Value("string"), "question_author": datasets.Value("string"), "question_author_url": datasets.Value("string"), "answer": datasets.Value("string"), "answer_url": datasets.Value("string"), "answer_author": datasets.Value("string"), "answer_author_url": datasets.Value("string"), "examples": datasets.features.Sequence(datasets.Value("int32")), "examples_url": datasets.features.Sequence(datasets.Value("string")), }) else: features = datasets.Features({ "id": datasets.Value("int32"), "filepath": datasets.Value("string"), "method_name": datasets.Value("string"), "start_line": datasets.Value("int32"), "end_line": datasets.Value("int32"), "url": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): span_features = { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "string": datasets.Value("string"), } reference_features = { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "bridge": datasets.Value("bool_"), "string": datasets.Value("string"), } return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "example_id": datasets.Value("int64"), "title_text": datasets.Value("string"), "url": datasets.Value("string"), "question": datasets.Value("string"), "paragraph_text": datasets.Value("string"), "sentence_starts": datasets.Sequence(datasets.Value("int32")), "original_nq_answers": [span_features], "annotation": { "referential_equalities": [ { "question_reference": span_features, "sentence_reference": reference_features, } ], "answer": [ { "sentence_reference": reference_features, "paragraph_reference": span_features, } ], "explanation_type": datasets.Value("string"), "selected_sentence": span_features, }, } ), supervised_keys=None, homepage=_HOMEPAGE, citation=_CITATION, )
} """ _DESCRIPTION = """\ A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification. It is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace("[n]", "\n")`) and then use them for your purposes. """ _HOMEPAGE = "https://github.com/hooshvare/pn-summary" _LICENSE = "MIT License" _URLs = { "1.0.0": { "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download", "features": [ {"name": "id", "type": datasets.Value("string")}, {"name": "title", "type": datasets.Value("string")}, {"name": "article", "type": datasets.Value("string")}, {"name": "summary", "type": datasets.Value("string")}, { "name": "category", "type": datasets.ClassLabel( names=[ "Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation",
def _info(self): if self.config.name == "dialogue_domain": features = datasets.Features({ "dial_id": datasets.Value("string"), "doc_id": datasets.Value("string"), "domain": datasets.Value("string"), "turns": [{ "turn_id": datasets.Value("int32"), "role": datasets.Value("string"), "da": datasets.Value("string"), "references": [{ "sp_id": datasets.Value("string"), "label": datasets.Value("string"), }], "utterance": datasets.Value("string"), }], }) elif self.config.name == "document_domain": features = datasets.Features({ "domain": datasets.Value("string"), "doc_id": datasets.Value("string"), "title": datasets.Value("string"), "doc_text": datasets.Value("string"), "spans": [{ "id_sp": datasets.Value("string"), "tag": datasets.Value("string"), "start_sp": datasets.Value("int32"), "end_sp": datasets.Value("int32"), "text_sp": datasets.Value("string"), "title": datasets.Value("string"), "parent_titles": datasets.Value("string"), "id_sec": datasets.Value("string"), "start_sec": datasets.Value("int32"), "text_sec": datasets.Value("string"), "end_sec": datasets.Value("int32"), }], "doc_html_ts": datasets.Value("string"), "doc_html_raw": datasets.Value("string"), }) elif self.config.name == "doc2dial_rc": features = datasets.Features({ "id": datasets.Value("string"), "title": datasets.Value("string"), "context": datasets.Value("string"), "question": datasets.Value("string"), "answers": datasets.features.Sequence({ "text": datasets.Value("string"), "answer_start": datasets.Value("int32"), }), "domain": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, citation=_CITATION, )
def _info(self): if self.config.name == "trex": features = datasets.Features({ "uuid": datasets.Value("string"), "obj_uri": datasets.Value("string"), "obj_label": datasets.Value("string"), "sub_uri": datasets.Value("string"), "sub_label": datasets.Value("string"), "predicate_id": datasets.Value("string"), "sub_surface": datasets.Value("string"), "obj_surface": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "template": datasets.Value("string"), "template_negated": datasets.Value("string"), "label": datasets.Value("string"), "description": datasets.Value("string"), "type": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "conceptnet": features = datasets.Features({ "uuid": datasets.Value("string"), "sub": datasets.Value("string"), "obj": datasets.Value("string"), "pred": datasets.Value("string"), "obj_label": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "negated": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "squad": features = datasets.Features({ "id": datasets.Value("string"), "sub_label": datasets.Value("string"), "obj_label": datasets.Value("string"), "negated": datasets.Value("string"), "masked_sentence": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "google_re": features = datasets.Features({ "pred": datasets.Value("string"), "sub": datasets.Value("string"), "obj": datasets.Value("string"), "evidences": datasets.Value("string"), "judgments": datasets.Value("string"), "sub_w": datasets.Value("string"), "sub_label": datasets.Value("string"), "sub_aliases": datasets.Value("string"), "obj_w": datasets.Value("string"), "obj_label": datasets.Value("string"), "obj_aliases": datasets.Value("string"), "uuid": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "template": datasets.Value("string"), "template_negated": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
f"https://the-eye.eu/public/AI/pile/train/{i:0>2}.jsonl.zst" for i in range(30) ], "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"], "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"], }, "free_law": "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst", "pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz", } _FEATURES = { "all": datasets.Features({ "text": datasets.Value("string"), "meta": { "pile_set_name": datasets.Value("string") }, }), "free_law": datasets.Features({ "text": datasets.Value("string"), "meta": { "case_ID": datasets.Value("string"), "case_jurisdiction": datasets.Value("string"), "date_created": datasets.Value("string"), }, }), "pubmed_central": datasets.Features({
def features(self): if self.name == "simplified": return { "text": datasets.Value("string"), "labels": datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)), "id": datasets.Value("string"), } elif self.name == "raw": d = { "text": datasets.Value("string"), "id": datasets.Value("string"), "author": datasets.Value("string"), "subreddit": datasets.Value("string"), "link_id": datasets.Value("string"), "parent_id": datasets.Value("string"), "created_utc": datasets.Value("float"), "rater_id": datasets.Value("int32"), "example_very_unclear": datasets.Value("bool"), } d.update( {label: datasets.Value("int32") for label in _CLASS_NAMES}) return d
def _info(self):
    # TODO: This method specifies the datasets.DatasetInfo object which contains the information and typings for the dataset
    features = datasets.Features(
        {
            "id": datasets.Value("int32"),
            "category": datasets.Value("string"),
            "text": datasets.Value("string"),
            "ner": datasets.features.Sequence(
                {
                    "source": {
                        "from": datasets.Value("int32"),
                        "text": datasets.Value("string"),
                        "to": datasets.Value("int32"),
                        "type": datasets.features.ClassLabel(
                            names=[
                                "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
                                "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
                                "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
                            ]
                        ),
                    },
                    "target": {
                        "from": datasets.Value("int32"),
                        "text": datasets.Value("string"),
                        "to": datasets.Value("int32"),
                        "type": datasets.features.ClassLabel(
                            names=[
                                "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
                                "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
                                "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
                            ]
                        ),
                    },
                }
            ),
        }
    )
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types.
        features=features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "document": { "id": datasets.Value("string"), "kind": datasets.Value("string"), "url": datasets.Value("string"), "file_size": datasets.Value("int32"), "word_count": datasets.Value("int32"), "start": datasets.Value("string"), "end": datasets.Value("string"), "summary": { "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), "url": datasets.Value("string"), "title": datasets.Value("string"), }, "text": datasets.Value("string"), }, "question": { "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), }, "answers": [{ "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), }], }), supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self):
    # TODO: This method specifies the datasets.DatasetInfo object which contains the information and typings for the dataset
    features = datasets.Features({
        "full_text": datasets.Value("string"),
        "text_translation": datasets.Value("string"),
        "screen_name": datasets.Value("string"),
        "description": datasets.Value("string"),
        "desc_translation": datasets.Value("string"),
        "location": datasets.Value("string"),
        "weekofyear": datasets.Value("int64"),
        "weekday": datasets.Value("int64"),
        "month": datasets.Value("int64"),
        "year": datasets.Value("int64"),
        "day": datasets.Value("int64"),
        "point_info": datasets.Value("string"),
        "point": datasets.Value("string"),
        "latitude": datasets.Value("float64"),
        "longitude": datasets.Value("float64"),
        "altitude": datasets.Value("float64"),
        "province": datasets.Value("string"),
        "hisco_standard": datasets.Value("string"),
        "hisco_code": datasets.Value("string"),
        "industry": datasets.Value("bool_"),
        "sentiment_pattern": datasets.Value("float64"),
        "subjective_pattern": datasets.Value("float64"),
        "label": datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"], names_file=None, id=None),
    })
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types.
        features=features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
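# Hedged aside (illustration only, not part of the loader): when both num_classes and names
# are passed to ClassLabel, as for the "label" feature above, they must describe the same set
# of classes; the integer ids simply follow the order of the names.
_label_demo = datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
assert _label_demo.num_classes == len(_label_demo.names) == 3
assert _label_demo.int2str(2) == "pos"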
def _info(self): if self.config.name == "schema": features = datasets.Features({ "service_name": datasets.Value("string"), "description": datasets.Value("string"), "slots": datasets.Sequence({ "name": datasets.Value("string"), "description": datasets.Value("string"), "is_categorical": datasets.Value("bool"), "possible_values": datasets.Sequence(datasets.Value("string")), }), "intents": datasets.Sequence( { "name": datasets.Value("string"), "description": datasets.Value("string"), "is_transactional": datasets.Value("bool"), "required_slots": datasets.Sequence(datasets.Value("string")), # optional_slots was originally a dictionary "optional_slots": datasets.Sequence( { "slot_name": datasets.Value("string"), "slot_value": datasets.Value("string"), }), "result_slots": datasets.Sequence(datasets.Value("string")), }, ), }) else: features = datasets.Features({ "dialogue_id": datasets.Value("string"), "services": datasets.Sequence(datasets.Value("string")), "turns": datasets.Sequence({ "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]), "utterance": datasets.Value("string"), "frames": datasets.Sequence({ "service": datasets.Value("string"), "slots": datasets.Sequence({ "slot": datasets.Value("string"), "start": datasets.Value("int32"), "exclusive_end": datasets.Value("int32"), }), # optional "state": { "active_intent": datasets.Value("string"), "requested_slots": datasets.Sequence(datasets.Value("string")), # slot_values was originally a dictionary "slot_values": datasets.Sequence({ "slot_name": datasets.Value("string"), "slot_value_list": datasets.Sequence(datasets.Value("string")), }), }, "actions": datasets.Sequence({ "act": datasets.ClassLabel(names=_ALL_ACTS), # optional "slot": datasets.Value("string"), # optional "canonical_values": datasets.Sequence(datasets.Value("string")), # optional "values": datasets.Sequence(datasets.Value("string")), }), # optional "service_results": datasets.Sequence( # Arrow doesn't like Sequences of Sequences for default values so we need a Sequence of Features of Sequences { "service_results_list": datasets.Sequence( # originally each list item was a dictionary (optional) { "service_slot_name": datasets.Value("string"), "service_canonical_value": datasets.Value("string"), }) }), # optional "service_call": { "method": datasets.Value("string"), # parameters was originally a dictionary "parameters": datasets.Sequence({ "parameter_slot_name": datasets.Value("string"), "parameter_canonical_value": datasets.Value("string"), }), }, }), }), }) return datasets.DatasetInfo( description=_DESCRIPTION, features= features, # Here we define them above because they are different between the two configurations supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): features = datasets.Features({ "dialogue_id": datasets.Value("string"), "services": datasets.Sequence(datasets.Value("string")), "turns": datasets.Sequence({ "turn_id": datasets.Value("string"), "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]), "utterance": datasets.Value("string"), "frames": datasets.Sequence({ "service": datasets.Value("string"), "state": { "active_intent": datasets.Value("string"), "requested_slots": datasets.Sequence(datasets.Value("string")), "slots_values": datasets.Sequence({ "slots_values_name": datasets.Value("string"), "slots_values_list": datasets.Sequence(datasets.Value("string")), }), }, "slots": datasets.Sequence({ "slot": datasets.Value("string"), "value": datasets.Value("string"), "start": datasets.Value("int32"), "exclusive_end": datasets.Value("int32"), "copy_from": datasets.Value("string"), "copy_from_value": datasets.Sequence(datasets.Value("string")), }), }), "dialogue_acts": datasets.Features({ "dialog_act": datasets.Sequence({ "act_type": datasets.Value("string"), "act_slots": datasets.Sequence( datasets.Features({ "slot_name": datasets.Value("string"), "slot_value": datasets.Value("string"), }), ), }), "span_info": datasets.Sequence({ "act_type": datasets.Value("string"), "act_slot_name": datasets.Value("string"), "act_slot_value": datasets.Value("string"), "span_start": datasets.Value("int32"), "span_end": datasets.Value("int32"), }), }), }), }) return datasets.DatasetInfo( description=_DESCRIPTION, features= features, # Here we define them above because they are different between the two configurations supervised_keys=None, homepage= "https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2", license=_LICENSE, citation=_CITATION, )