Python Sequenceの例、datasets.Sequence Pythonの例

コード例 #1

0

ファイルを表示

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for this builder.

        The feature schema holds plain ``id``/``text`` strings, a ``tokens``
        string sequence, three list-of-struct columns (``nps``,
        ``np_relations``, ``coref``) and a nested ``metadata`` struct.
        """
        preposition_names = [
            "about", "for", "with", "from", "among", "by", "on", "at",
            "during", "of", "member(s) of", "in", "after", "under", "to",
            "into", "before", "near", "outside", "around", "between",
            "against", "over", "inside",
        ]
        np_type_names = ["standard", "time/date/measurement", "idiomatic"]

        # Element schema of each list-of-struct column.
        nps_item = {
            "text": datasets.Value("string"),
            "first_char": datasets.Value("int32"),
            "last_char": datasets.Value("int32"),
            "first_token": datasets.Value("int32"),
            "last_token": datasets.Value("int32"),
            "id": datasets.Value("string"),
        }
        np_relations_item = {
            "anchor": datasets.Value("string"),
            "complement": datasets.Value("string"),
            "preposition": datasets.features.ClassLabel(names=preposition_names),
            "complement_coref_cluster_id": datasets.Value("string"),
        }
        coref_item = {
            "id": datasets.Value("string"),
            "members": datasets.Sequence(datasets.Value("string")),
            "np_type": datasets.features.ClassLabel(names=np_type_names),
        }

        # Nested (non-list) struct column.
        metadata = {
            "annotators": {
                "coref_worker": datasets.Value("int32"),
                "consolidator_worker": datasets.Value("int32"),
                "np-relations_worker": datasets.Sequence(datasets.Value("int32")),
            },
            "url": datasets.Value("string"),
            "source": datasets.Value("string"),
        }

        schema = datasets.Features({
            "id": datasets.Value("string"),
            "text": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "nps": [nps_item],
            "np_relations": [np_relations_item],
            "coref": [coref_item],
            "metadata": metadata,
        })

        # supervised_keys is intentionally not passed here.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,  # shown on the datasets page
            features=schema,  # columns of the dataset and their types
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #2

0

ファイルを表示

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for this builder.

        Columns: an int32 ``id`` plus four parallel sequences (``tokens``,
        ``ner_ids``, ``space_after``, ``ner_tags``); ``ner_tags`` is a
        sequence of class labels using the tag names listed below.
        """
        ner_tag_names = [
            "O",
            "B-PERSON", "I-PERSON",
            "B-ORG", "I-ORG",
            "B-GPE", "I-GPE",
            "B-LOC", "I-LOC",
            "B-NAT_REL_POL", "I-NAT_REL_POL",
            "B-EVENT", "I-EVENT",
            "B-LANGUAGE", "I-LANGUAGE",
            "B-WORK_OF_ART", "I-WORK_OF_ART",
            "B-DATETIME", "I-DATETIME",
            "B-PERIOD", "I-PERIOD",
            "B-MONEY", "I-MONEY",
            "B-QUANTITY", "I-QUANTITY",
            "B-NUMERIC", "I-NUMERIC",
            "B-ORDINAL", "I-ORDINAL",
            "B-FACILITY", "I-FACILITY",
        ]
        ner_tag_label = datasets.features.ClassLabel(names=ner_tag_names)

        schema = datasets.Features({
            "id": datasets.Value("int32"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_ids": datasets.Sequence(datasets.Value("int32")),
            "space_after": datasets.Sequence(datasets.Value("bool")),
            "ner_tags": datasets.Sequence(ner_tag_label),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,  # shown on the datasets page
            features=schema,  # columns of the dataset and their types
            supervised_keys=None,  # no (input, target) pair is declared
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #3

0

ファイルを表示

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this builder.

     Only two columns are declared: ``tokens`` (sequence of strings) and
     ``pos_tags`` (sequence of class labels over the tag names below).
     """
     pos_tag_names = [
         '"', "''", "#", "$", "(", ")", ",", ".", ":", "``",
         "CC", "CD", "DT", "EX", "FW", "HYPH", "IN",
         "JJ", "JJR", "JJS", "LS", "-LRB-", "MD",
         "NN", "NNP", "NNPS", "NNS", "NN|SYM",
         "PDT", "POS", "PRP", "PRP$", "-RRB-",
         "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
         "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
         "WDT", "WP", "WP$", "WRB",
     ]
     # NOTE: "position" and "sid" columns are not part of the schema.
     schema = datasets.Features({
         "tokens": datasets.Sequence(datasets.Value("string")),
         "pos_tags": datasets.Sequence(
             datasets.features.ClassLabel(names=pos_tag_names)
         ),
     })
     return datasets.DatasetInfo(features=schema, supervised_keys=None)

コード例 #4

0

ファイルを表示

ファイル: diplomacy_detection.py プロジェクト: albertvillanova/huggingface_datasets

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this builder.

     Every column except the scalar ``game_id`` is a sequence: either of
     plain values or of class labels drawn from a module-level name list.
     """

     def value_seq(dtype):
         # Sequence of plain values of the given dtype.
         return datasets.Sequence(datasets.Value(dtype))

     def label_seq(names):
         # Sequence of class labels over the given names.
         return datasets.Sequence(datasets.ClassLabel(names=names))

     schema = datasets.Features({
         "messages": value_seq("string"),
         "sender_labels": label_seq(["false", "true"]),
         "receiver_labels": label_seq(["false", "true", "noannotation"]),
         "speakers": label_seq(_PLAYABLE_COUNTRIES),
         "receivers": label_seq(_PLAYABLE_COUNTRIES),
         "absolute_message_index": value_seq("int64"),
         "relative_message_index": value_seq("int64"),
         "seasons": label_seq(_SEASONS),
         "years": label_seq(_YEARS),
         "game_score": label_seq(_GAME_SCORE),
         "game_score_delta": label_seq(_GAME_SCORE_DELTA),
         "players": label_seq(_PLAYABLE_COUNTRIES),
         "game_id": datasets.Value("int64"),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )

コード例 #5

0

ファイルを表示

ファイル: kor_ner.py プロジェクト: albertvillanova/huggingface_datasets

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this builder.

     Columns: ``text`` and ``annot_text`` strings, a ``tokens`` string
     sequence, and two class-label sequences (``pos_tags``, ``ner_tags``).
     """
     pos_tag_names = [
         "SO", "SS", "VV", "XR", "VCP", "JC", "VCN", "JKB", "MM", "SP",
         "XSN", "SL", "NNP", "NP", "EP", "JKQ", "IC", "XSA", "EC", "EF",
         "SE", "XPN", "ETN", "SH", "XSV", "MAG", "SW", "ETM", "JKO", "NNB",
         "MAJ", "NNG", "JKV", "JKC", "VA", "NR", "JKG", "VX", "SF", "JX",
         "JKS", "SN",
     ]
     ner_tag_names = ["I", "O", "B_OG", "B_TI", "B_LC", "B_DT", "B_PS"]

     schema = datasets.Features({
         "text": datasets.Value("string"),
         "annot_text": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "pos_tags": datasets.Sequence(
             datasets.features.ClassLabel(names=pos_tag_names)
         ),
         "ner_tags": datasets.Sequence(
             datasets.features.ClassLabel(names=ner_tag_names)
         ),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )

コード例 #6

0

ファイルを表示

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this builder.

     Columns: a string ``id``, a ``tokens`` string sequence, and two
     class-label sequences (``pos_tags``, ``chunk_tags``).
     """
     pos_tag_names = [
         "''", "#", "$", "(", ")", ",", ".", ":", "``",
         "CC", "CD", "DT", "EX", "FW", "IN",
         "JJ", "JJR", "JJS", "MD",
         "NN", "NNP", "NNPS", "NNS",
         "PDT", "POS", "PRP", "PRP$",
         "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
         "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
         "WDT", "WP", "WP$", "WRB",
     ]
     chunk_tag_names = [
         "O",
         "B-ADJP", "I-ADJP",
         "B-ADVP", "I-ADVP",
         "B-CONJP", "I-CONJP",
         "B-INTJ", "I-INTJ",
         "B-LST", "I-LST",
         "B-NP", "I-NP",
         "B-PP", "I-PP",
         "B-PRT", "I-PRT",
         "B-SBAR", "I-SBAR",
         "B-UCP", "I-UCP",
         "B-VP", "I-VP",
     ]

     schema = datasets.Features({
         "id": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "pos_tags": datasets.Sequence(
             datasets.features.ClassLabel(names=pos_tag_names)
         ),
         "chunk_tags": datasets.Sequence(
             datasets.features.ClassLabel(names=chunk_tag_names)
         ),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage="https://www.clips.uantwerpen.be/conll2000/chunking/",
         citation=_CITATION,
     )

コード例 #7

0

ファイルを表示

ファイル: wmt20_mlqe_task3.py プロジェクト: ruch798/datasets

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for this builder.

        The schema has a ``document_id``, four string sequences holding the
        source/MT segments and their tokenized forms, two sequence-of-struct
        columns (``annotations``, ``token_annotations``), a triply nested
        int32 ``token_index`` and an int32 ``total_words``.
        """
        annotations_item = {
            "segment_id": datasets.Sequence(datasets.Value("int32")),
            "annotation_start": datasets.Sequence(datasets.Value("int32")),
            "annotation_length": datasets.Sequence(datasets.Value("int32")),
            "severity": datasets.ClassLabel(names=["minor", "major", "critical"]),
            "severity_weight": datasets.Value("float32"),
            "category": datasets.ClassLabel(names=_ANNOTATION_CATEGORIES),
        }
        token_annotations_item = {
            "segment_id": datasets.Sequence(datasets.Value("int32")),
            "first_token": datasets.Sequence(datasets.Value("int32")),
            "last_token": datasets.Sequence(datasets.Value("int32")),
            "token_after_gap": datasets.Sequence(datasets.Value("int32")),
            "severity": datasets.ClassLabel(names=["minor", "major", "critical"]),
            "category": datasets.ClassLabel(names=_ANNOTATION_CATEGORIES),
        }
        # Three levels of nesting around an int32 value.
        token_index = datasets.Sequence(
            datasets.Sequence(datasets.Sequence(datasets.Value("int32")))
        )

        schema = datasets.Features({
            "document_id": datasets.Value("string"),
            "source_segments": datasets.Sequence(datasets.Value("string")),
            "source_tokenized": datasets.Sequence(datasets.Value("string")),
            "mt_segments": datasets.Sequence(datasets.Value("string")),
            "mt_tokenized": datasets.Sequence(datasets.Value("string")),
            "annotations": datasets.Sequence(annotations_item),
            "token_annotations": datasets.Sequence(token_annotations_item),
            "token_index": token_index,
            "total_words": datasets.Value("int32"),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=schema,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #8

0

ファイルを表示

ファイル: enriched_web_nlg.py プロジェクト: albertvillanova/huggingface_datasets

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for the active configuration.

     All configurations share one schema; the ``"en"`` configuration adds
     a trailing ``lexicalization`` string to each ``lex`` entry.
     """
     lex_item = {
         "comment": datasets.Value("string"),
         "lid": datasets.Value("string"),
         "text": datasets.Value("string"),
         "template": datasets.Value("string"),
         "sorted_triple_sets": datasets.Sequence(datasets.Value("string")),
     }
     if self.config.name == "en":
         # only present in the en version; appended last to keep key order
         lex_item["lexicalization"] = datasets.Value("string")

     original_sets = datasets.Sequence({
         "otriple_set": datasets.Sequence(datasets.Value("string")),
     })
     modified_sets = datasets.Sequence({
         "mtriple_set": datasets.Sequence(datasets.Value("string")),
     })

     schema = datasets.Features({
         "category": datasets.Value("string"),
         "size": datasets.Value("int32"),
         "eid": datasets.Value("string"),
         "original_triple_sets": original_sets,
         "modified_triple_sets": modified_sets,
         "shape": datasets.Value("string"),
         "shape_type": datasets.Value("string"),
         "lex": datasets.Sequence(lex_item),
     })

     return datasets.DatasetInfo(
         description=_DESCRIPTION,  # shown on the datasets page
         features=schema,  # differs between configurations (see above)
         supervised_keys=None,  # no (input, target) pair is declared
         homepage=_HOMEPAGE,
         citation=_CITATION,
         license=_LICENSE,
     )

コード例 #9

0

ファイルを表示

ファイル: schema_guided_dstc8.py プロジェクト: adamlin120/cross-ontology-dst

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for the active configuration.

     The feature schema is selected by ``self.config.name``, which must be
     one of ``"slot_description"``, ``"dialogues"``, ``"turns"`` or
     ``"slots"``.

     Raises:
         ValueError: if ``self.config.name`` is none of the names above
             (previously this surfaced as an ``UnboundLocalError``).
     """

     def frames_feature():
         # Schema of the "frames" column, shared verbatim by the
         # "dialogues" and "turns" configurations.
         return datasets.Sequence({
             "service": datasets.Value("string"),
             "slots": datasets.Sequence({
                 "slot": datasets.Value("string"),
                 "start": datasets.Value("int32"),
                 "exclusive_end": datasets.Value("int32"),
             }),
             # optional
             "state": {
                 "active_intent": datasets.Value("string"),
                 "requested_slots": datasets.Sequence(datasets.Value("string")),
                 # slot_values was originally a dictionary
                 "slot_values": datasets.Sequence({
                     "slot_name": datasets.Value("string"),
                     "slot_value_list": datasets.Sequence(datasets.Value("string")),
                 }),
             },
             "actions": datasets.Sequence({
                 "act": datasets.ClassLabel(names=_ALL_ACTS),
                 # optional
                 "slot": datasets.Value("string"),
                 # optional
                 "canonical_values": datasets.Sequence(datasets.Value("string")),
                 # optional
                 "values": datasets.Sequence(datasets.Value("string")),
             }),
             # optional
             "service_results": datasets.Sequence(
                 # Arrow doesn't like Sequences of Sequences for default
                 # values so we need a Sequence of Features of Sequences
                 {
                     "service_results_list": datasets.Sequence(
                         # originally each list item was a dictionary (optional)
                         {
                             "service_slot_name": datasets.Value("string"),
                             "service_canonical_value": datasets.Value("string"),
                         }
                     ),
                 }
             ),
             # optional
             "service_call": {
                 "method": datasets.Value("string"),
                 # parameters was originally a dictionary
                 "parameters": datasets.Sequence({
                     "parameter_slot_name": datasets.Value("string"),
                     "parameter_canonical_value": datasets.Value("string"),
                 }),
             },
         })

     config_name = self.config.name
     if config_name == "slot_description":
         features = datasets.Features({
             "service_name": datasets.Value("string"),
             "description": datasets.Value("string"),
             "slots": datasets.Sequence({
                 "name": datasets.Value("string"),
                 "description": datasets.Value("string"),
                 "is_categorical": datasets.Value("bool"),
                 "possible_values": datasets.Sequence(datasets.Value("string")),
             }),
             "intents": datasets.Sequence({
                 "name": datasets.Value("string"),
                 "description": datasets.Value("string"),
                 "is_transactional": datasets.Value("bool"),
                 "required_slots": datasets.Sequence(datasets.Value("string")),
                 # optional_slots was originally a dictionary
                 "optional_slots": datasets.Sequence({
                     "slot_name": datasets.Value("string"),
                     "slot_value": datasets.Value("string"),
                 }),
                 "result_slots": datasets.Sequence(datasets.Value("string")),
             }),
         })
     elif config_name == "dialogues":
         features = datasets.Features({
             "dialogue_id": datasets.Value("string"),
             "services": datasets.Sequence(datasets.Value("string")),
             "turns": datasets.Sequence({
                 "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
                 "utterance": datasets.Value("string"),
                 "frames": frames_feature(),
             }),
         })
     elif config_name == "turns":
         features = datasets.Features({
             "dialogue_id": datasets.Value("string"),
             "services": datasets.Sequence(datasets.Value("string")),
             "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
             "utterance": datasets.Value("string"),
             "frames": frames_feature(),
         })
     elif config_name == "slots":
         features = datasets.Features({
             "dialogue_id": datasets.Value("string"),
             "services": datasets.Sequence(datasets.Value("string")),
             "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
             "utterance": datasets.Value("string"),
             "history": datasets.Value("string"),
             "name": datasets.Value("string"),
             "description": datasets.Value("string"),
             "value": datasets.Value("string"),
             "service+description+history": datasets.Value("string"),
         })
     else:
         # Fail loudly instead of hitting an unbound `features` below.
         raise ValueError(
             "Unknown configuration name {!r}; expected one of "
             "'slot_description', 'dialogues', 'turns', 'slots'.".format(config_name)
         )

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,  # differs between configurations (see above)
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )

コード例 #10

0

ファイルを表示

ファイル: xglue.py プロジェクト: albertvillanova/huggingface_datasets

    def _info(self):
        if self.config.name == "ner":
            features = {
                "words": datasets.Sequence(datasets.Value("string")),
                "ner": datasets.Sequence(
                    datasets.features.ClassLabel(
                        names=[
                            "O",
                            "B-PER",
                            "I-PER",
                            "B-ORG",
                            "I-ORG",
                            "B-LOC",
                            "I-LOC",
                            "B-MISC",
                            "I-MISC",
                        ]
                    )
                ),
            }
        elif self.config.name == "pos":
            features = {
                "words": datasets.Sequence(datasets.Value("string")),
                "pos": datasets.Sequence(
                    datasets.features.ClassLabel(
                        names=[
                            "ADJ",
                            "ADP",
                            "ADV",
                            "AUX",
                            "CCONJ",
                            "DET",
                            "INTJ",
                            "NOUN",
                            "NUM",
                            "PART",
                            "PRON",
                            "PROPN",
                            "PUNCT",
                            "SCONJ",
                            "SYM",
                            "VERB",
                            "X",
                        ]
                    )
                ),
            }
        elif self.config.name == "mlqa":
            features = {
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answers": datasets.features.Sequence(
                    {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")}
                ),
                # These are the features of your dataset like images, labels ...
            }
        elif self.config.name == "nc":
            features = {
                "news_title": datasets.Value("string"),
                "news_body": datasets.Value("string"),
                "news_category": datasets.ClassLabel(
                    names=[
                        "foodanddrink",
                        "sports",
                        "travel",
                        "finance",
                        "lifestyle",
                        "news",
                        "entertainment",
                        "health",
                        "video",
                        "autos",
                    ]
                ),
            }
        elif self.config.name == "xnli":
            features = {
                "premise": datasets.Value("string"),
                "hypothesis": datasets.Value("string"),
                "label": datasets.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
            }
        elif self.config.name == "paws-x":
            features = {
                "sentence1": datasets.Value("string"),
                "sentence2": datasets.Value("string"),
                "label": datasets.features.ClassLabel(names=["different", "same"]),
            }
        elif self.config.name == "qadsm":
            features = {
                "query": datasets.Value("string"),
                "ad_title": datasets.Value("string"),
                "ad_description": datasets.Value("string"),
                "relevance_label": datasets.features.ClassLabel(names=["Bad", "Good"]),
            }
        elif self.config.name == "wpr":
            features = {
                "query": datasets.Value("string"),
                "web_page_title": datasets.Value("string"),
                "web_page_snippet": datasets.Value("string"),
                "relavance_label": datasets.features.ClassLabel(names=["Bad", "Fair", "Good", "Excellent", "Perfect"]),
            }
        elif self.config.name == "qam":
            features = {
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
                "label": datasets.features.ClassLabel(names=["False", "True"]),
            }
        elif self.config.name == "qg":
            features = {
                "answer_passage": datasets.Value("string"),
                "question": datasets.Value("string"),
            }
        elif self.config.name == "ntg":
            features = {
                "news_body": datasets.Value("string"),
                "news_title": datasets.Value("string"),
            }

        return datasets.DatasetInfo(
            description=_XGLUE_DESCRIPTION,
            features=datasets.Features(features),
            homepage=self.config.url,
            citation=self.config.citation + "\n" + _XGLUE_CITATION,
        )

コード例 #11

0

ファイルを表示

    def _info(self):
        """Assemble the ``datasets.DatasetInfo`` describing this dataset.

        The label inventories are named first and then plugged into the
        column schema, which is finally combined with the module-level
        description, homepage, license and citation.

        Returns:
            datasets.DatasetInfo: metadata object carrying the feature schema.
        """
        Value = datasets.Value
        Sequence = datasets.features.Sequence
        ClassLabel = datasets.features.ClassLabel

        def char_span():
            # Build new feature objects on every call so that each column
            # owns its own instances.
            return {"begin_char_offset": Value("int64"), "end_char_offset": Value("int64")}

        def bio_tags(kinds):
            # All "B-" tags first, then all "I-" tags, then the lone "O" tag.
            return ["B-" + kind for kind in kinds] + ["I-" + kind for kind in kinds] + ["O"]

        entity_kinds = ["DEVICE", "EXPERIMENT", "MATERIAL", "VALUE"]
        slot_kinds = [
            "anode_material",
            "cathode_material",
            "conductivity",
            "current_density",
            "degradation_rate",
            "device",
            "electrolyte_material",
            "experiment_evoking_word",
            "fuel_used",
            "interlayer_material",
            "interconnect_material",
            "open_circuit_voltage",
            "power_density",
            "resistance",
            "support_material",
            "thickness",
            "time_of_operation",
            "voltage",
            "working_temperature",
        ]
        # Participant labels used by the top-level "slots" column.
        slot_participants = [
            "anode_material",
            "cathode_material",
            "current_density",
            "degradation_rate",
            "device",
            "electrolyte_material",
            "fuel_used",
            "interlayer_material",
            "open_circuit_voltage",
            "power_density",
            "resistance",
            "support_material",
            "time_of_operation",
            "voltage",
            "working_temperature",
        ]
        # Participant labels used inside "experiments"; unlike the list above
        # this one also contains "conductivity".
        experiment_participants = [
            "anode_material",
            "cathode_material",
            "current_density",
            "degradation_rate",
            "conductivity",
            "device",
            "electrolyte_material",
            "fuel_used",
            "interlayer_material",
            "open_circuit_voltage",
            "power_density",
            "resistance",
            "support_material",
            "time_of_operation",
            "voltage",
            "working_temperature",
        ]
        relation_names = ["coreference", "experiment_variation", "same_experiment", "thickness"]
        span_entity_names = ["", "DEVICE", "MATERIAL", "VALUE"]
        mention_type_names = ["", "current_exp", "future_work", "general_info", "previous_work"]

        link_record = {
            "relation_label": ClassLabel(names=relation_names),
            "start_span_id": Value("int64"),
            "end_span_id": Value("int64"),
        }
        slot_record = {
            "frame_participant_label": ClassLabel(names=slot_participants),
            "slot_id": Value("int64"),
        }
        span_record = {
            "span_id": Value("int64"),
            "entity_label": ClassLabel(names=span_entity_names),
            "sentence_id": Value("int64"),
            "experiment_mention_type": ClassLabel(names=mention_type_names),
            "begin_char_offset": Value("int64"),
            "end_char_offset": Value("int64"),
        }
        experiment_record = {
            "experiment_id": Value("int64"),
            "span_id": Value("int64"),
            "slots": Sequence(
                {
                    "frame_participant_label": ClassLabel(names=experiment_participants),
                    "slot_id": Value("int64"),
                }
            ),
        }

        features = datasets.Features(
            {
                "text": Value("string"),
                "sentence_offsets": Sequence(char_span()),
                "sentences": Sequence(Value("string")),
                "sentence_labels": Sequence(Value("int64")),
                "token_offsets": Sequence({"offsets": Sequence(char_span())}),
                "tokens": Sequence(Sequence(Value("string"))),
                "entity_labels": Sequence(Sequence(ClassLabel(names=bio_tags(entity_kinds)))),
                "slot_labels": Sequence(Sequence(ClassLabel(names=bio_tags(slot_kinds)))),
                "links": datasets.Sequence(link_record),
                "slots": Sequence(slot_record),
                "spans": Sequence(span_record),
                "experiments": Sequence(experiment_record),
            }
        )

        return datasets.DatasetInfo(
            # Text shown on the dataset page.
            description=_DESCRIPTION,
            # Column names and types, as assembled above.
            features=features,
            # No (input, target) pair is declared for as_supervised=True.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #12

0

ファイルを表示

ファイル: germeval_14.py プロジェクト: iamvarol/datasets-1

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this dataset.

     Both ``ner_tags`` and ``nested_ner_tags`` use the same 25-entry tag
     inventory: ``"O"`` followed by a ``B-``/``I-`` pair for every
     combination of entity class and suffix.
     """
     # "O" first, then B-/I- pairs grouped by entity class and, within each
     # class, ordered as plain / "deriv" / "part".
     tag_names = ["O"]
     for entity in ("LOC", "ORG", "OTH", "PER"):
         for suffix in ("", "deriv", "part"):
             tag_names.append("B-" + entity + suffix)
             tag_names.append("I-" + entity + suffix)

     def tag_sequence():
         # Each column gets its own ClassLabel built from its own list copy.
         return datasets.Sequence(datasets.features.ClassLabel(names=list(tag_names)))

     schema = datasets.Features({
         "id": datasets.Value("string"),
         "source": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "ner_tags": tag_sequence(),
         "nested_ner_tags": tag_sequence(),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage="https://sites.google.com/site/germeval2014ner/",
         citation=_CITATION,
     )

コード例 #13

0

ファイルを表示

ファイル: selqa.py プロジェクト: ruch798/datasets

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for the selected configuration.

     Three schemas are possible:

     * ``self.config.mode == "experiments"``: ``question``, ``candidate``
       and a ``"0"``/``"1"`` ``label``.
     * otherwise, with ``self.config.type_ == "answer_selection"``: the
       shared question/article columns plus ``answers``, ``candidates``
       and ``q_types``.
     * otherwise: the shared columns plus ``q_types`` and a
       ``candidate_list`` of per-section records.
     """
     topic_names = [
         "MUSIC",
         "TV",
         "TRAVEL",
         "ART",
         "SPORT",
         "COUNTRY",
         "MOVIES",
         "HISTORICAL EVENTS",
         "SCIENCE",
         "FOOD",
     ]
     question_type_names = ["what", "why", "when", "who", "where", "how", ""]

     if self.config.mode == "experiments":
         columns = {
             "question": datasets.Value("string"),
             "candidate": datasets.Value("string"),
             "label": datasets.ClassLabel(names=["0", "1"]),
         }
     else:
         # Columns common to both non-experiment schemas, in their final order.
         columns = {
             "section": datasets.Value("string"),
             "question": datasets.Value("string"),
             "article": datasets.Value("string"),
             "is_paraphrase": datasets.Value("bool"),
             "topic": datasets.ClassLabel(names=topic_names),
         }
         question_types = datasets.Sequence(datasets.ClassLabel(names=question_type_names))
         if self.config.type_ == "answer_selection":
             columns["answers"] = datasets.Sequence(datasets.Value("int32"))
             columns["candidates"] = datasets.Sequence(datasets.Value("string"))
             columns["q_types"] = question_types
         else:
             columns["q_types"] = question_types
             columns["candidate_list"] = datasets.Sequence({
                 "article": datasets.Value("string"),
                 "section": datasets.Value("string"),
                 "candidates": datasets.Sequence(datasets.Value("string")),
                 "answers": datasets.Sequence(datasets.Value("int32")),
             })

     features = datasets.Features(columns)
     return datasets.DatasetInfo(
         # Text shown on the dataset page.
         description=_DESCRIPTION,
         # Column schema chosen above; it depends on the configuration.
         features=features,
         # No (input, target) pair is declared for as_supervised=True.
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )

コード例 #14

0

ファイルを表示

def benchmark_iterating():
    """Run the read benchmarks on a generated dataset and save the results.

    A dataset is generated inside a temporary directory, every benchmark case
    is run on it, the dataset is shuffled, and a smaller set of cases is run
    again. The value returned by each case is stored in a dict under a key
    made of the function name and its keyword values (prefixed with
    ``"shuffled "`` for the second pass), and the dict is written as
    UTF-8 encoded JSON to ``RESULTS_FILE_PATH``.
    """
    # Building blocks shared by both passes; keyword order matters because the
    # result keys are derived from ``kwargs.values()``.
    plain_reads = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
    ]
    batched_reads = [
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": size})
        for size in (10, 100, 1_000)
    ]
    numpy_batched_reads = [
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": size})
        for size in (10, 1_000)
    ]

    def formatted_read(kind):
        return (read_formatted, {"type": kind, "length": SMALL_TEST})

    functions = (
        plain_reads
        + batched_reads
        + [formatted_read(kind) for kind in ("numpy", "pandas", "torch", "tensorflow")]
        + numpy_batched_reads
    )
    # After shuffling only the numpy format is exercised.
    functions_shuffled = plain_reads + batched_reads + [formatted_read("numpy")] + numpy_batched_reads

    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        schema = datasets.Features({
            "list": datasets.Sequence(datasets.Value("float32")),
            "numbers": datasets.Value("float32"),
        })
        arrow_path = os.path.join(tmp_dir, "dataset.arrow")
        dataset = generate_example_dataset(
            arrow_path,
            schema,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100, )},
        )

        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            arg_text = " ".join(str(v) for v in kwargs.values())
            times[func.__name__ + " " + arg_text] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            arg_text = " ".join(str(v) for v in kwargs.values())
            times["shuffled " + func.__name__ + " " + arg_text] = func(dataset, **kwargs)

    payload = json.dumps(times).encode("utf-8")
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(payload)

コード例 #15

0

ファイルを表示

ファイル: xtreme.py プロジェクト: Nasrin-Akter-88/datasets-1

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for the selected configuration.

        By default every key of ``self.config.text_features`` becomes a
        string column. An ``answers`` column, if present, is replaced by a
        sequence of ``answer_start``/``text`` records. Configurations whose
        name starts with ``"PAWS-X"`` or equals ``"XNLI"`` get an extra
        string label column, while names starting with ``"udpos"`` or
        ``"PAN-X"`` replace the schema entirely with a tagging schema.
        """
        config = self.config
        name = config.name

        features = {column: datasets.Value("string") for column in six.iterkeys(config.text_features)}
        if "answers" in features:
            features["answers"] = datasets.features.Sequence(
                {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")}
            )

        # The four name tests below cannot hold simultaneously, so a single
        # chain is equivalent to testing each one independently.
        if name.startswith("PAWS-X"):
            features["label"] = datasets.Value("string")
        elif name == "XNLI":
            features["gold_label"] = datasets.Value("string")
        elif name.startswith("udpos"):
            pos_names = [
                "ADJ",
                "ADP",
                "ADV",
                "AUX",
                "CCONJ",
                "DET",
                "INTJ",
                "NOUN",
                "NUM",
                "PART",
                "PRON",
                "PROPN",
                "PUNCT",
                "SCONJ",
                "SYM",
                "VERB",
                "X",
            ]
            features = datasets.Features(
                {
                    "token": datasets.Value("string"),
                    "pos_tag": datasets.features.ClassLabel(names=pos_names),
                }
            )
        elif name.startswith("PAN-X"):
            ner_names = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
            features = datasets.Features(
                {
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
                    "langs": datasets.Sequence(datasets.Value("string")),
                }
            )

        return datasets.DatasetInfo(
            # Configuration-specific text followed by the shared description.
            description=config.description + "\n" + _DESCRIPTION,
            # Column schema selected above.
            features=datasets.Features(features),
            # No (input, target) pair is declared for as_supervised=True.
            supervised_keys=None,
            # Project page and configuration URL, separated by a tab.
            homepage="https://github.com/google-research/xtreme" + "\t" + config.url,
            citation=config.citation + "\n" + _CITATION,
        )

コード例 #16

0

ファイルを表示

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for the selected configuration.

        ``self.config.name`` is compared against entries of the module-level
        ``configs`` mapping to pick the schema:

        * ``configs["classification"]``: ``text`` and a
          ``"Not-Related"``/``"Related"`` ``label``.
        * ``configs["RE_ade"]``: ``text``, ``drug``, ``effect`` and, under
          ``indexes``, a sequence of ``start_char``/``end_char`` records for
          each of ``drug`` and ``effect``.
        * ``configs["RE_dosage"]``: same layout with ``dosage`` in place of
          ``effect``.

        Raises:
            ValueError: if ``self.config.name`` matches none of the entries
                above. Without this check the method would fail with an
                ``UnboundLocalError`` when returning.
        """

        def _span_sequence():
            # A new Sequence per call so the two entries under "indexes" never
            # share a single feature object.
            return datasets.Sequence(
                {
                    "start_char": datasets.Value("int32"),
                    "end_char": datasets.Value("int32"),
                }
            )

        def _relation_features(target):
            # Schema shared by the two relation configurations; ``target`` is
            # the name of the second argument column.
            return datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "drug": datasets.Value("string"),
                    target: datasets.Value("string"),
                    "indexes": {
                        "drug": _span_sequence(),
                        target: _span_sequence(),
                    },
                }
            )

        name = self.config.name
        if name == configs["classification"]:
            features = datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.features.ClassLabel(names=["Not-Related", "Related"]),
                }
            )
        elif name == configs["RE_ade"]:
            features = _relation_features("effect")
        elif name == configs["RE_dosage"]:
            features = _relation_features("dosage")
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(f"Unknown configuration name: {name!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage="https://www.sciencedirect.com/science/article/pii/S1532046412000615",
            citation=_CITATION,
        )

コード例 #17

0

ファイルを表示

ファイル: dane.py プロジェクト: ruch798/datasets

    def _info(self):
        """Return the ``datasets.DatasetInfo`` for this dataset.

        The schema holds a sentence id and text plus parallel sequences of
        token ids, tokens, lemmas, POS tags, morphological tags, dependency
        ids, dependency labels and NER tags.
        """
        pos_names = [
            "NUM",
            "CCONJ",
            "PRON",
            "VERB",
            "INTJ",
            "AUX",
            "ADJ",
            "PROPN",
            "PART",
            "ADV",
            "PUNCT",
            "ADP",
            "NOUN",
            "X",
            "DET",
            "SYM",
            "SCONJ",
        ]
        dep_names = [
            "parataxis",
            "mark",
            "nummod",
            "discourse",
            "compound:prt",
            "reparandum",
            "vocative",
            "list",
            "obj",
            "dep",
            "det",
            "obl:loc",
            "flat",
            "iobj",
            "cop",
            "expl",
            "obl",
            "conj",
            "nmod",
            "root",
            "acl:relcl",
            "goeswith",
            "appos",
            "fixed",
            "obl:tmod",
            "xcomp",
            "advmod",
            "nmod:poss",
            "aux",
            "ccomp",
            "amod",
            "cc",
            "advcl",
            "nsubj",
            "punct",
            "case",
        ]
        ner_names = [
            "O",
            "B-PER",
            "I-PER",
            "B-ORG",
            "I-ORG",
            "B-LOC",
            "I-LOC",
            "B-MISC",
            "I-MISC",
        ]

        columns = {
            "sent_id": datasets.Value("string"),
            "text": datasets.Value("string"),
            "tok_ids": datasets.Sequence(datasets.Value("int64")),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "lemmas": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(datasets.features.ClassLabel(names=pos_names)),
            "morph_tags": datasets.Sequence(datasets.Value("string")),
            "dep_ids": datasets.Sequence(datasets.Value("int64")),
            "dep_labels": datasets.Sequence(datasets.ClassLabel(names=dep_names)),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
        }
        features = datasets.Features(columns)

        return datasets.DatasetInfo(
            # Text shown on the dataset page.
            description=_DESCRIPTION,
            # Column names and types, as assembled above.
            features=features,
            # No (input, target) pair is declared for as_supervised=True.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #18

0

ファイルを表示

 def _info(self):
     """Return the ``datasets.DatasetInfo`` for this dataset.

     Each example has an ``id`` and four parallel sequences: ``tokens``,
     ``pos_tags``, ``chunk_tags`` and ``ner_tags``.
     """
     # POS inventory; see
     # https://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/germantagsets/
     pos_names = [
         "ADJA", "ADJD", "ADV", "APPR", "APPRART", "APPO", "APZR", "ART",
         "CARD", "FM", "ITJ", "KOUI", "KOUS", "KON", "KOKOM", "NN", "NE",
         "PDS", "PDAT", "PIS", "PIAT", "PIDAT", "PPER", "PPOSS", "PPOSAT",
         "PRELS", "PRELAT", "PRF", "PWS", "PWAT", "PWAV", "PAV", "PTKZU",
         "PTKNEG", "PTKVZ", "PTKANT", "PTKA", "TRUNC", "VVFIN", "VVIMP",
         "VVINF", "VVIZU", "VVPP", "VAFIN", "VAIMP", "VAINF", "VAPP",
         "VMFIN", "VMINF", "VMPP", "XY", "$,", "$.", "$(",
     ]
     chunk_names = [
         "O",
         "B-ADJP", "I-ADJP",
         "B-ADVP", "I-ADVP",
         "B-CONJP", "I-CONJP",
         "B-INTJ", "I-INTJ",
         "B-LST", "I-LST",
         "B-NP", "I-NP",
         "B-PP", "I-PP",
         "B-PRT", "I-PRT",
         "B-SBAR", "I-SBAR",
         "B-UCP", "I-UCP",
         "B-VP", "I-VP",
         "I-NC", "B-NC",
         "I-PC", "B-PC",
         "I-VC", "B-VC",
     ]
     ner_names = [
         "[PAD]", "X", "O",
         "B-MISC", "I-MISC",
         "B-PER", "I-PER",
         "B-ORG", "I-ORG",
         "B-LOC", "I-LOC",
         "B-OTH", "I-OTH",
     ]

     schema = datasets.Features({
         "id": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "pos_tags": datasets.Sequence(datasets.features.ClassLabel(names=pos_names)),
         "chunk_tags": datasets.Sequence(datasets.features.ClassLabel(names=chunk_names)),
         "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage="https://www.aclweb.org/anthology/W03-0419/",
         citation=_CITATION,
     )

コード例 #19

0

ファイルを表示

 def _info(self):
     """Build the ``DatasetInfo`` for the selected FLUE configuration.

     The feature schema depends on ``self.config.name``:

     * ``"CLS"`` / ``"XNLI"``: one string column per key of
       ``config.text_features``, a ``ClassLabel`` column named by
       ``config.label_column`` (classes from ``config.label_classes``) and an
       int32 ``idx``.
     * ``"WSD-V"``: one string-sequence column per text feature, the
       ``fine_pos_tags`` and ``pos_tags`` label sequences, the
       ``disambiguate_tokens_ids`` / ``disambiguate_labels`` sequences and a
       string ``idx``.
     * any other configuration: string text columns, an int32 label column
       and an int32 ``idx``.

     Returns:
         datasets.DatasetInfo: description, features, homepage and citation
         for the active configuration.
     """
     config = self.config
     # Iterating the mapping yields its keys directly; the six.iterkeys
     # Python 2 shim is not needed.
     if config.name in ("CLS", "XNLI"):
         features = {text_feature: datasets.Value("string") for text_feature in config.text_features}
         features[config.label_column] = datasets.features.ClassLabel(names=config.label_classes)
         features["idx"] = datasets.Value("int32")
     elif config.name == "WSD-V":
         features = {
             text_feature: datasets.Sequence(datasets.Value("string")) for text_feature in config.text_features
         }
         features["fine_pos_tags"] = datasets.Sequence(
             datasets.features.ClassLabel(
                 names=[
                     "DET", "P+D", "CC", "VS", "P", "CS", "NC", "NPP", "ADJWH", "VINF",
                     "VPP", "ADVWH", "PRO", "V", "CLO", "PREF", "VPR", "PROREL", "ADV", "PROWH",
                     "N", "DETWH", "ADJ", "P+PRO", "ET", "VIMP", "CLS", "PONCT", "I", "CLR",
                 ]
             )
         )
         features["pos_tags"] = datasets.Sequence(
             datasets.features.ClassLabel(
                 names=[
                     "V", "PREF", "P+D", "I", "A", "P+PRO", "PRO", "P",
                     "anonyme", "D", "C", "CL", "ET", "PONCT", "ADV", "N",
                 ]
             )
         )
         features["disambiguate_tokens_ids"] = datasets.Sequence(datasets.Value("int32"))
         features["disambiguate_labels"] = datasets.Sequence(datasets.Value("string"))
         # Unlike the other configurations, WSD-V uses a string idx.
         features["idx"] = datasets.Value("string")
     else:
         features = {text_feature: datasets.Value("string") for text_feature in config.text_features}
         features[config.label_column] = datasets.Value("int32")
         features["idx"] = datasets.Value("int32")
     return datasets.DatasetInfo(
         description=_FLUE_DESCRIPTION,
         features=datasets.Features(features),
         homepage=config.url,
         citation=config.citation + "\n" + _FLUE_CITATION,
     )

コード例 #20

0

ファイルを表示

ファイル: pubmed.py プロジェクト: ruch798/datasets

    def _info(self):
        """Build the ``DatasetInfo`` describing the nested PubMed record schema.

        The schema has two top-level groups, ``MedlineCitation`` and
        ``PubmedData``, assembled from the small sub-schemas defined below.
        ``self.fill_keys_from_features`` is called with the finished features
        before the info object is returned.
        """
        string = "string"
        int32 = "int32"

        date_schema = {
            "Year": datasets.Value(int32),
            "Month": datasets.Value(int32),
            "Day": datasets.Value(int32),
        }
        author_schema = {
            "LastName": datasets.Value(string),
            "ForeName": datasets.Value(string),
            "Initials": datasets.Value(string),
            "CollectiveName": datasets.Value(string),
        }
        grant_schema = {
            "GrantID": datasets.Value(string),
            "Agency": datasets.Value(string),
            "Country": datasets.Value(string),
        }
        chemical_schema = {
            "RegistryNumber": datasets.Value(string),
            "NameOfSubstance": datasets.Value(string),
        }
        mesh_heading_schema = {
            "DescriptorName": datasets.Value(string),
            "QualifierName": datasets.Value(string),
        }
        reference_schema = {
            "Citation": datasets.Value(string),
            "CitationId": datasets.Value(int32),
        }
        # Only Country is kept here: MedlineTA, NlmUniqueID and ISSNLinking
        # were left out as too inconsistent in the data.
        journal_info_schema = {"Country": datasets.Value(string)}

        # The Journal sub-record (ISSN, JournalIssue, Title, ISOAbbreviation)
        # and Pagination are likewise omitted as too inconsistent to be used.
        article_schema = {
            "Abstract": {"AbstractText": datasets.Value(string)},
            "ArticleTitle": datasets.Value(string),
            "AuthorList": {"Author": datasets.Sequence(author_schema)},
            "Language": datasets.Value(string),
            "GrantList": {"Grant": datasets.Sequence(grant_schema)},
            "PublicationTypeList": {"PublicationType": datasets.Sequence(datasets.Value(string))},
        }

        medline_citation = {
            "PMID": datasets.Value(int32),
            "DateCompleted": date_schema,
            "NumberOfReferences": datasets.Value(int32),
            "DateRevised": date_schema,
            "Article": article_schema,
            "MedlineJournalInfo": journal_info_schema,
            "ChemicalList": {"Chemical": datasets.Sequence(chemical_schema)},
            "CitationSubset": datasets.Value(string),
            "MeshHeadingList": {"MeshHeading": datasets.Sequence(mesh_heading_schema)},
        }
        pubmed_data = {
            "ArticleIdList": datasets.Sequence({"ArticleId": datasets.Sequence(datasets.Value(string))}),
            "PublicationStatus": datasets.Value(string),
            "History": {"PubMedPubDate": datasets.Sequence(date_schema)},
            "ReferenceList": datasets.Sequence(reference_schema),
        }

        features = datasets.Features({
            "MedlineCitation": medline_citation,
            "PubmedData": pubmed_data,
        })
        self.fill_keys_from_features(features)

        return datasets.DatasetInfo(
            description=_DESCRIPTION,  # shown on the datasets page
            features=features,  # column names and types
            supervised_keys=None,  # no (input, target) pair for as_supervised
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #21

0

ファイルを表示

ファイル: norne.py プロジェクト: albertvillanova/huggingface_datasets

 def _info(self):
     """Build the ``DatasetInfo``, choosing the NER label set from the config name.

     Configurations whose name ends in ``"-7"`` or ``"-8"`` use reduced label
     sets; every other configuration uses the full set, which includes the
     ``GPE_LOC`` / ``GPE_ORG`` labels. All other columns are the same for
     every configuration.
     """
     config_name = self.config.name
     if config_name.endswith("-7"):
         ner_names = [
             "O",
             "B-PER", "I-PER",
             "B-ORG", "I-ORG",
             "B-PROD", "I-PROD",
             "B-LOC", "I-LOC",
             "B-DRV", "I-DRV",
             "B-EVT", "I-EVT",
             "B-MISC", "I-MISC",
         ]
     elif config_name.endswith("-8"):
         ner_names = [
             "O",
             "B-PER", "I-PER",
             "B-ORG", "I-ORG",
             "B-PROD", "I-PROD",
             "B-LOC", "I-LOC",
             "B-GPE", "I-GPE",
             "B-DRV", "I-DRV",
             "B-EVT", "I-EVT",
             "B-MISC", "I-MISC",
         ]
     else:
         ner_names = [
             "O",
             "B-PER", "I-PER",
             "B-ORG", "I-ORG",
             "B-GPE_LOC", "I-GPE_LOC",
             "B-PROD", "I-PROD",
             "B-LOC", "I-LOC",
             "B-GPE_ORG", "I-GPE_ORG",
             "B-DRV", "I-DRV",
             "B-EVT", "I-EVT",
             "B-MISC", "I-MISC",
         ]
     # The Sequence wrapper is identical for all three label sets.
     ner_tags = datasets.Sequence(datasets.features.ClassLabel(names=ner_names))

     pos_names = [
         "NOUN", "PUNCT", "ADP", "NUM", "SYM", "SCONJ", "ADJ", "PART", "DET",
         "CCONJ", "PROPN", "PRON", "X", "ADV", "INTJ", "VERB", "AUX",
     ]
     schema = datasets.Features({
         "idx": datasets.Value("string"),
         "lang": datasets.Value("string"),
         "text": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "lemmas": datasets.Sequence(datasets.Value("string")),
         "pos_tags": datasets.Sequence(datasets.features.ClassLabel(names=pos_names)),
         "ner_tags": ner_tags,
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         citation=_CITATION,
     )

コード例 #22

0

ファイルを表示

ファイル: smartdata.py プロジェクト: ruch798/datasets

    def _info(self):
        """Build the ``DatasetInfo`` for this NER dataset.

        Each example has a string ``id``, its ``tokens`` and a matching
        ``ner_tags`` column of ``ClassLabel`` values: ``"O"`` followed by a
        ``B-``/``I-`` pair per entity type.
        """
        ner_names = [
            "O",
            "B-DATE", "I-DATE",
            "B-DISASTER_TYPE", "I-DISASTER_TYPE",
            "B-DISTANCE", "I-DISTANCE",
            "B-DURATION", "I-DURATION",
            "B-LOCATION", "I-LOCATION",
            "B-LOCATION_CITY", "I-LOCATION_CITY",
            "B-LOCATION_ROUTE", "I-LOCATION_ROUTE",
            "B-LOCATION_STOP", "I-LOCATION_STOP",
            "B-LOCATION_STREET", "I-LOCATION_STREET",
            "B-NUMBER", "I-NUMBER",
            "B-ORGANIZATION", "I-ORGANIZATION",
            "B-ORGANIZATION_COMPANY", "I-ORGANIZATION_COMPANY",
            "B-ORG_POSITION", "I-ORG_POSITION",
            "B-PERSON", "I-PERSON",
            "B-TIME", "I-TIME",
            "B-TRIGGER", "I-TRIGGER",
        ]
        features = datasets.Features({
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,  # shown on the datasets page
            features=features,  # column names and types
            supervised_keys=None,  # no (input, target) pair for as_supervised
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

コード例 #23

0

ファイルを表示

ファイル: wiki_auto.py プロジェクト: Priyansh2/nlp

 def _info(self):
     """Build the ``DatasetInfo`` for the selected configuration.

     * ``"manual"``: labelled sentence pairs (``alignment_label`` plus
       sentence ids and texts).
     * ``"auto_acl"``: plain ``normal_sentence`` / ``simple_sentence`` pairs.
     * any other configuration: article-level records with the ``normal``
       and ``simple`` articles and their paragraph and sentence alignments.
     """
     config_name = self.config.name  # one of the names in BUILDER_CONFIGS
     if config_name == "manual":
         schema = {
             "alignment_label": datasets.ClassLabel(names=["notAligned", "aligned"]),
             "normal_sentence_id": datasets.Value("string"),
             "simple_sentence_id": datasets.Value("string"),
             "normal_sentence": datasets.Value("string"),
             "simple_sentence": datasets.Value("string"),
         }
     elif config_name == "auto_acl":
         schema = {
             "normal_sentence": datasets.Value("string"),
             "simple_sentence": datasets.Value("string"),
         }
     else:
         normal_article = {
             "normal_article_id": datasets.Value("int32"),
             "normal_article_title": datasets.Value("string"),
             "normal_article_url": datasets.Value("string"),
             "normal_article_content": datasets.Sequence({
                 "normal_sentence_id": datasets.Value("string"),
                 "normal_sentence": datasets.Value("string"),
             }),
         }
         simple_article = {
             "simple_article_id": datasets.Value("int32"),
             "simple_article_title": datasets.Value("string"),
             "simple_article_url": datasets.Value("string"),
             "simple_article_content": datasets.Sequence({
                 "simple_sentence_id": datasets.Value("string"),
                 "simple_sentence": datasets.Value("string"),
             }),
         }
         schema = {
             "example_id": datasets.Value("string"),
             "normal": normal_article,
             "simple": simple_article,
             "paragraph_alignment": datasets.Sequence({
                 "normal_paragraph_id": datasets.Value("string"),
                 "simple_paragraph_id": datasets.Value("string"),
             }),
             "sentence_alignment": datasets.Sequence({
                 "normal_sentence_id": datasets.Value("string"),
                 "simple_sentence_id": datasets.Value("string"),
             }),
         }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(schema),
         supervised_keys=None,
         homepage="https://github.com/chaojiang06/wiki-auto",
         license=_LICENSE,
         citation=_CITATION,
     )

コード例 #24

0

ファイルを表示

ファイル: test_beam.py プロジェクト: Priyansh2/nlp

 def _info(self):
     """Return a minimal ``DatasetInfo``: column ``a`` is a sequence of ``{"b": string}`` records."""
     nested_record = {"b": datasets.Value("string")}
     schema = datasets.Features({"a": datasets.Sequence(nested_record)})
     # No default supervised_keys.
     return datasets.DatasetInfo(features=schema, supervised_keys=None)

コード例 #25

0

ファイルを表示

ファイル: turkish_shrinked_ner.py プロジェクト: ruch798/datasets

 def _info(self):
     """Build the ``DatasetInfo`` for this NER dataset.

     Each example has a string ``id``, its ``tokens`` and a ``ner_tags``
     column of ``ClassLabel`` values: ``"O"`` followed by a ``B-``/``I-``
     pair per entity type, in the original order.
     """
     ner_names = [
         "O",
         "B-academic", "I-academic",
         "B-academic_person", "I-academic_person",
         "B-aircraft", "I-aircraft",
         "B-album_person", "I-album_person",
         "B-anatomy", "I-anatomy",
         "B-animal", "I-animal",
         "B-architect_person", "I-architect_person",
         "B-capital", "I-capital",
         "B-chemical", "I-chemical",
         "B-clothes", "I-clothes",
         "B-country", "I-country",
         "B-culture", "I-culture",
         "B-currency", "I-currency",
         "B-date", "I-date",
         "B-food", "I-food",
         "B-genre", "I-genre",
         "B-government", "I-government",
         "B-government_person", "I-government_person",
         "B-language", "I-language",
         "B-location", "I-location",
         "B-material", "I-material",
         "B-measure", "I-measure",
         "B-medical", "I-medical",
         "B-military", "I-military",
         "B-military_person", "I-military_person",
         "B-nation", "I-nation",
         "B-newspaper", "I-newspaper",
         "B-organization", "I-organization",
         "B-organization_person", "I-organization_person",
         "B-person", "I-person",
         "B-production_art_music", "I-production_art_music",
         "B-production_art_music_person", "I-production_art_music_person",
         "B-quantity", "I-quantity",
         "B-religion", "I-religion",
         "B-science", "I-science",
         "B-shape", "I-shape",
         "B-ship", "I-ship",
         "B-software", "I-software",
         "B-space", "I-space",
         "B-space_person", "I-space_person",
         "B-sport", "I-sport",
         "B-sport_name", "I-sport_name",
         "B-sport_person", "I-sport_person",
         "B-structure", "I-structure",
         "B-subject", "I-subject",
         "B-tech", "I-tech",
         "B-train", "I-train",
         "B-vehicle", "I-vehicle",
     ]
     schema = datasets.Features({
         "id": datasets.Value("string"),
         "tokens": datasets.Sequence(datasets.Value("string")),
         "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
     })
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )

コード例 #26

0

ファイルを表示

ファイル: clue.py プロジェクト: merveenoyan/datasets

    def _info(self):
        """Build the ``DatasetInfo`` for the selected CLUE task.

        The feature schema is chosen from ``self.config.name``; the homepage
        and citation also come from the config.

        Raises:
            NotImplementedError: if the config name is not one of the tasks
                handled below.
        """
        task = self.config.name
        if task in ("afqmc", "tnews", "iflytek", "cmnli", "diagnostics", "ocnli"):
            # Classification-style tasks: string inputs, a label and an index.
            features = {
                column: datasets.Value("string")
                for column in self.config.text_features.keys()
            }
            if self.config.label_classes:
                label = datasets.features.ClassLabel(names=self.config.label_classes)
            else:
                # No label classes configured: use a float32 label instead.
                label = datasets.Value("float32")
            features["label"] = label
            features["idx"] = datasets.Value("int32")
        elif task == "cluewsc2020":
            span_target = {
                "span1_text": datasets.Value("string"),
                "span2_text": datasets.Value("string"),
                "span1_index": datasets.Value("int32"),
                "span2_index": datasets.Value("int32"),
            }
            features = {
                "idx": datasets.Value("int32"),
                "text": datasets.Value("string"),
                "label": datasets.features.ClassLabel(names=["true", "false"]),
                "target": span_target,
            }
        elif task == "csl":
            features = {
                "idx": datasets.Value("int32"),
                "corpus_id": datasets.Value("int32"),
                "abst": datasets.Value("string"),
                "label": datasets.features.ClassLabel(names=self.config.label_classes),
                "keyword": datasets.Sequence(datasets.Value("string")),
            }
        elif task in ("cmrc2018", "drcd"):
            answer = {
                "text": datasets.Value("string"),
                "answer_start": datasets.Value("int32"),
            }
            features = {
                "id": datasets.Value("string"),
                "context": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answers": datasets.Sequence(answer),
            }
        elif task == "chid":
            answer = {
                "text": datasets.Value("string"),
                "candidate_id": datasets.Value("int32"),
            }
            features = {
                "idx": datasets.Value("int32"),
                "candidates": datasets.Sequence(datasets.Value("string")),
                "content": datasets.Sequence(datasets.Value("string")),
                "answers": datasets.features.Sequence(answer),
            }
        elif task == "c3":
            features = {
                "id": datasets.Value("int32"),
                "context": datasets.Sequence(datasets.Value("string")),
                "question": datasets.Value("string"),
                "choice": datasets.Sequence(datasets.Value("string")),
                "answer": datasets.Value("string"),
            }
        else:
            raise NotImplementedError(
                "This task is not implemented. If you believe"
                " this task was recently added to the CLUE benchmark, "
                "please open a GitHub issue and we will add it.")

        return datasets.DatasetInfo(
            description=_CLUE_DESCRIPTION,
            features=datasets.Features(features),
            homepage=self.config.url,
            citation=self.config.citation + "\n" + _CLUE_CITATION,
        )

コード例 #27

0

ファイルを表示

from processing_image import Preprocess
from utils import Config


"""
USAGE:
``python extracting_data.py -i <img_dir> -o <dataset_file>.datasets <batch_size>``
"""


TEST = False
CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

# Default feature schema. The per-detection columns are sized by
# CONFIG.MAX_DETECTIONS; entries are listed as ordered pairs so the column
# order is explicit.
DEFAULT_SCHEMA = datasets.Features(
    OrderedDict(
        [
            ("attr_ids", datasets.Sequence(feature=datasets.Value("float32"), length=CONFIG.MAX_DETECTIONS)),
            ("attr_probs", datasets.Sequence(feature=datasets.Value("float32"), length=CONFIG.MAX_DETECTIONS)),
            ("boxes", datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32")),
            ("img_id", datasets.Value("int32")),
            ("obj_ids", datasets.Sequence(feature=datasets.Value("float32"), length=CONFIG.MAX_DETECTIONS)),
            ("obj_probs", datasets.Sequence(feature=datasets.Value("float32"), length=CONFIG.MAX_DETECTIONS)),
            ("roi_features", datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32")),
            ("sizes", datasets.Sequence(feature=datasets.Value("float32"), length=2)),
            ("preds_per_image", datasets.Value(dtype="int32")),
        ]
    )
)


class Extract:
    def __init__(self, argv=sys.argv[1:]):

コード例 #28

0

ファイルを表示

ファイル: loading_ud.py プロジェクト: andybi7676/rnn_typology

 def _info(self):
     """Build the ``datasets.DatasetInfo`` describing one example.

     Each example carries a string ``id``, a sequence of token strings, and
     three class-label sequences: ``pos_tags``, ``dependency_tags`` and
     ``lang``. The ``lang`` label names are the keys of the module-level
     ``testing_path`` mapping.

     Returns:
         datasets.DatasetInfo: description, feature schema, homepage and
         citation; ``supervised_keys`` is ``None``.
     """
     # NOTE: the position of a name in a ClassLabel list is its integer id,
     # so the order of every list below must be kept exactly as it is
     # (including 'det:predet' sitting after 'dislocated').
     pos_names = [
         "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
         "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
     ]

     # First group of dependency relation labels.
     core_relations = [
         "acl", "acl:relcl",
         "advcl", "advcl:cleft",
         "advmod", "advmod:emph", "advmod:lmod",
         "amod",
         "appos",
         "aux", "aux:aspect", "aux:pass", "aux:q", "aux:tense",
         "case", "case:dec", "case:pref", "case:suff",
         "cc", "cc:preconj",
         "ccomp", "ccomp:agent", "ccomp:obj",
         "clf",
         "compound", "compound:lvc", "compound:prt", "compound:redup", "compound:svc",
         "conj",
         "cop",
         "csubj", "csubj:cop", "csubj:pass",
         "dep", "dep:comp",
         "det", "det:numgov", "det:nummod", "det:poss",
         "discourse",
         "dislocated",
         "det:predet",
         "expl", "expl:impers", "expl:pass", "expl:pv", "expl:subj",
         "fixed",
         "flat", "flat:foreign", "flat:name",
         "goeswith",
         "iobj",
         "list",
         "mark", "mark:advb", "mark:comp", "mark:relcl",
         "nmod", "nmod:comp", "nmod:part", "nmod:poss", "nmod:tmod", "nmod:npmod",
         "nsubj", "nsubj:cop", "nsubj:pass",
         "nummod", "nummod:gov",
         "obj", "obj:lvc",
         "obl", "obl:agent", "obl:arg", "obl:lmod", "obl:mod", "obl:loc",
         "obl:tmod", "obl:npmod", "obl:patient",
         "orphan",
         "parataxis",
         "punct",
         "reparandum",
         "root",
         "vocative",
         "xcomp", "xcomp:obj", "xcomp:obl",
     ]

     # Second group, appended after the first so the ids of the labels
     # above do not shift.
     extra_relations = [
         "acl:appos", "acl:inf", "acl:part",
         "advcl:arg", "advcl:cond",
         "advmod:cc",
         "amod:advmod",
         "aux:caus", "aux:neg",
         "case:voc",
         "ccomp:obl", "ccomp:pred",
         "compound:conjv", "compound:nv", "compound:plur",
         "conj:expl",
         "csubj:cleft",
         "iobj:agent", "iobj:loc",
         "mark:prt",
         "nmod:advmod", "nmod:appos",
         "nsubj:caus", "nsubj:nc",
         "obj:agent",
         "obl:abl", "obl:ben", "obl:cmpr", "obl:inst", "obl:pmod", "obl:prep", "obl:soc",
         "xcomp:adj", "xcomp:pred",
     ]

     language_names = list(testing_path.keys())

     schema = datasets.Features(
         {
             "id": datasets.Value("string"),
             "tokens": datasets.Sequence(datasets.Value("string")),
             "pos_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=pos_names)
             ),
             "dependency_tags": datasets.Sequence(
                 datasets.features.ClassLabel(names=core_relations + extra_relations)
             ),
             "lang": datasets.Sequence(
                 datasets.features.ClassLabel(names=language_names)
             ),
         }
     )

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=schema,
         supervised_keys=None,
         # NOTE(review): this anthology URL may be a leftover from another
         # dataset script -- confirm it is the intended homepage.
         homepage="https://www.aclweb.org/anthology/W03-0419/",
         citation=_CITATION,
     )

コード例 #29

0

ファイルを表示

ファイル: ami.py プロジェクト: edugp/datasets

    def _info(self):
        """Build the ``datasets.DatasetInfo`` for the selected configuration.

        Every configuration shares the same annotation columns (word- and
        segment-level ids, times and speakers, plus ``words`` and
        ``channels``). The configuration name then decides which file/audio
        columns are added and which sentence is appended to the description.

        Returns:
            datasets.DatasetInfo: description, feature schema, homepage and
            citation.

        Raises:
            ValueError: if ``self.config.name`` is not one of
                "headset-single", "microphone-single", "headset-multi" or
                "microphone-multi".
        """
        # (column name, element dtype) for the annotation columns shared by
        # all configurations; each becomes a Sequence of that dtype.
        annotation_columns = (
            ("word_ids", "string"),
            ("word_start_times", "float"),
            ("word_end_times", "float"),
            ("word_speakers", "string"),
            ("segment_ids", "string"),
            ("segment_start_times", "float"),
            ("segment_end_times", "float"),
            ("segment_speakers", "string"),
            ("words", "string"),
            ("channels", "string"),
        )
        schema = {
            column: datasets.Sequence(datasets.Value(dtype))
            for column, dtype in annotation_columns
        }

        config_name = self.config.name

        if config_name == "headset-single":
            schema["file"] = datasets.Value("string")
            schema["audio"] = datasets.features.Audio(sampling_rate=16_000)
            config_description = (
                "Close talking audio of single headset. "
                "This configuration only includes audio belonging to the "
                "headset of the person currently speaking.")
        elif config_name == "microphone-single":
            schema["file"] = datasets.Value("string")
            schema["audio"] = datasets.features.Audio(sampling_rate=16_000)
            config_description = (
                "Far field audio of single microphone. "
                "This configuration only includes audio belonging the first microphone, "
                "*i.e.* 1-1, of the microphone array.")
        elif config_name == "headset-multi":
            # NOTE(review): both passes write the same "file-{i}" keys, so the
            # string columns are replaced by Audio columns. The single configs
            # use distinct "file"/"audio" keys -- confirm what is intended
            # against the example generator before changing anything.
            for i in range(4):
                schema[f"file-{i}"] = datasets.Value("string")
            for i in range(4):
                schema[f"file-{i}"] = datasets.features.Audio(
                    sampling_rate=16_000)
            config_description = (
                "Close talking audio of four individual headset. "
                "This configuration includes audio belonging to four individual headsets."
                " For each annotation there are 4 audio files 0, 1, 2, 3.")
        elif config_name == "microphone-multi":
            # NOTE(review): same key overwrite as in "headset-multi". Also,
            # range(1, 8) yields 1..7 while the description below mentions
            # 1-8 -- verify against the example generator.
            for i in range(1, 8):
                schema[f"file-1-{i}"] = datasets.Value("string")
            for i in range(1, 8):
                schema[f"file-1-{i}"] = datasets.features.Audio(
                    sampling_rate=16_000)
            config_description = (
                "Far field audio of microphone array. "
                "This configuration includes audio of "
                "the first microphone array 1-1, 1-2, ..., 1-8.")
        else:
            raise ValueError(
                f"Configuration {config_name} does not exist.")

        return datasets.DatasetInfo(
            description=_DESCRIPTION + config_description,
            features=datasets.Features(schema),
            homepage=_URL,
            citation=_CITATION,
        )

コード例 #30

0

ファイルを表示

ファイル: multi_woz_v22.py プロジェクト: Priyansh2/nlp

 def _info(self):
     """Build the ``datasets.DatasetInfo`` describing one dialogue.

     The schema is assembled bottom-up from named sub-schemas: a dialogue
     has an id, a list of services and a sequence of turns; each turn has
     frames (service, state, slots) and dialogue acts (dialog_act,
     span_info).

     Returns:
         datasets.DatasetInfo: description, feature schema, homepage,
         license and citation; ``supervised_keys`` is ``None``.
     """
     # --- frame-level pieces -------------------------------------------
     slot_values_entry = {
         "slots_values_name": datasets.Value("string"),
         "slots_values_list": datasets.Sequence(datasets.Value("string")),
     }
     frame_state = {
         "active_intent": datasets.Value("string"),
         "requested_slots": datasets.Sequence(datasets.Value("string")),
         "slots_values": datasets.Sequence(slot_values_entry),
     }
     frame_slot = {
         "slot": datasets.Value("string"),
         "value": datasets.Value("string"),
         "start": datasets.Value("int32"),
         "exclusive_end": datasets.Value("int32"),
         "copy_from": datasets.Value("string"),
         "copy_from_value": datasets.Sequence(datasets.Value("string")),
     }
     frame = {
         "service": datasets.Value("string"),
         "state": frame_state,
         "slots": datasets.Sequence(frame_slot),
     }

     # --- dialogue-act pieces ------------------------------------------
     act_slot = datasets.Features({
         "slot_name": datasets.Value("string"),
         "slot_value": datasets.Value("string"),
     })
     dialog_act = {
         "act_type": datasets.Value("string"),
         "act_slots": datasets.Sequence(act_slot),
     }
     span_info = {
         "act_type": datasets.Value("string"),
         "act_slot_name": datasets.Value("string"),
         "act_slot_value": datasets.Value("string"),
         "span_start": datasets.Value("int32"),
         "span_end": datasets.Value("int32"),
     }
     dialogue_acts = datasets.Features({
         "dialog_act": datasets.Sequence(dialog_act),
         "span_info": datasets.Sequence(span_info),
     })

     # --- turn and dialogue --------------------------------------------
     turn = {
         "turn_id": datasets.Value("string"),
         "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
         "utterance": datasets.Value("string"),
         "frames": datasets.Sequence(frame),
         "dialogue_acts": dialogue_acts,
     }
     features = datasets.Features({
         "dialogue_id": datasets.Value("string"),
         "services": datasets.Sequence(datasets.Value("string")),
         "turns": datasets.Sequence(turn),
     })

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage="https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2",
         license=_LICENSE,
         citation=_CITATION,
     )