def _info(self): features = datasets.Features({ "text": datasets.Value("string"), "sentence_offsets": datasets.features.Sequence({ "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64") }), "sentences": datasets.features.Sequence(datasets.Value("string")), "sentence_labels": datasets.features.Sequence(datasets.Value("int64")), "token_offsets": datasets.features.Sequence({ "offsets": datasets.features.Sequence({ "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64") }) }), "tokens": datasets.features.Sequence( datasets.features.Sequence(datasets.Value("string"))), "entity_labels": datasets.features.Sequence( datasets.features.Sequence( datasets.features.ClassLabel(names=[ "B-DEVICE", "B-EXPERIMENT", "B-MATERIAL", "B-VALUE", "I-DEVICE", "I-EXPERIMENT", "I-MATERIAL", "I-VALUE", "O", ]))), "slot_labels": datasets.features.Sequence( datasets.features.Sequence( datasets.features.ClassLabel(names=[ "B-anode_material", "B-cathode_material", "B-conductivity", "B-current_density", "B-degradation_rate", "B-device", "B-electrolyte_material", "B-experiment_evoking_word", "B-fuel_used", "B-interlayer_material", "B-interconnect_material", "B-open_circuit_voltage", "B-power_density", "B-resistance", "B-support_material", "B-thickness", "B-time_of_operation", "B-voltage", "B-working_temperature", "I-anode_material", "I-cathode_material", "I-conductivity", "I-current_density", "I-degradation_rate", "I-device", "I-electrolyte_material", "I-experiment_evoking_word", "I-fuel_used", "I-interlayer_material", "I-interconnect_material", "I-open_circuit_voltage", "I-power_density", "I-resistance", "I-support_material", "I-thickness", "I-time_of_operation", "I-voltage", "I-working_temperature", "O", ]))), "links": datasets.Sequence({ "relation_label": datasets.features.ClassLabel(names=[ "coreference", "experiment_variation", "same_experiment", "thickness" ]), "start_span_id": datasets.Value("int64"), "end_span_id": datasets.Value("int64"), }), "slots": datasets.features.Sequence({ "frame_participant_label": datasets.features.ClassLabel(names=[ "anode_material", "cathode_material", "current_density", "degradation_rate", "device", "electrolyte_material", "fuel_used", "interlayer_material", "open_circuit_voltage", "power_density", "resistance", "support_material", "time_of_operation", "voltage", "working_temperature", ]), "slot_id": datasets.Value("int64"), }), "spans": datasets.features.Sequence({ "span_id": datasets.Value("int64"), "entity_label": datasets.features.ClassLabel( names=["", "DEVICE", "MATERIAL", "VALUE"]), "sentence_id": datasets.Value("int64"), "experiment_mention_type": datasets.features.ClassLabel(names=[ "", "current_exp", "future_work", "general_info", "previous_work" ]), "begin_char_offset": datasets.Value("int64"), "end_char_offset": datasets.Value("int64"), }), "experiments": datasets.features.Sequence({ "experiment_id": datasets.Value("int64"), "span_id": datasets.Value("int64"), "slots": datasets.features.Sequence({ "frame_participant_label": datasets.features.ClassLabel(names=[ "anode_material", "cathode_material", "current_density", "degradation_rate", "conductivity", "device", "electrolyte_material", "fuel_used", "interlayer_material", "open_circuit_voltage", "power_density", "resistance", "support_material", "time_of_operation", "voltage", "working_temperature", ]), "slot_id": datasets.Value("int64"), }), }), }) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. 
description=_DESCRIPTION, # This defines the different columns of the dataset and their types features= features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "constituent_dataset": datasets.Value("string"), "id": datasets.Value("string"), "context": datasets.Value("string"), "question": datasets.Value("string"), "reference": datasets.Value("string"), "candidate": datasets.Value("string"), "score": datasets.Value("float"), "metadata": { "scores": datasets.features.Sequence(datasets.Value("int32")), "source": datasets.Value("string"), }, # features for minimal pairs "candidate2": datasets.Value("string"), "score2": datasets.Value("float"), }), supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({
            "list": datasets.Sequence(datasets.Value("float32")),
            "numbers": datasets.Value("float32"),
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("second set of iterations (after shuffling)")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
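# A small sketch (not part of the original benchmark) showing how the timings written above
# could be read back; it assumes RESULTS_FILE_PATH still points at the JSON file produced by
# benchmark_iterating().
def report_benchmark_results():
    with open(RESULTS_FILE_PATH, "rb") as f:
        times = json.loads(f.read().decode("utf-8"))
    # "num examples" is metadata rather than a timing, so it is skipped in the report.
    for name, seconds in sorted(times.items(), key=lambda kv: kv[1], reverse=True):
        if name != "num examples":
            print(f"{name}: {seconds:.3f}s")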
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "id": datasets.Value("string"), "context_id": datasets.Value("string"), "question_id": datasets.Value("string"), "domain": datasets.Value("string"), "metadata": { "author": datasets.Value("string"), "title": datasets.Value("string"), "url": datasets.Value("string"), }, "context": datasets.Value("string"), "question": datasets.Value("string"), "question_type": datasets.Value("string"), "answers": datasets.features.Sequence(datasets.Value("string"), ), "correct_answer_id": datasets.Value("int32"), }), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage="https://text-machine-lab.github.io/blog/2020/quail/", citation=_CITATION, )
def _info(self): features = datasets.Features({ "id": datasets.Value("string"), "text": datasets.Value("string"), "tokens": datasets.Sequence(datasets.Value("string")), "nps": [{ "text": datasets.Value("string"), "first_char": datasets.Value("int32"), "last_char": datasets.Value("int32"), "first_token": datasets.Value("int32"), "last_token": datasets.Value("int32"), "id": datasets.Value("string"), }], "np_relations": [{ "anchor": datasets.Value("string"), "complement": datasets.Value("string"), "preposition": datasets.features.ClassLabel(names=[ "about", "for", "with", "from", "among", "by", "on", "at", "during", "of", "member(s) of", "in", "after", "under", "to", "into", "before", "near", "outside", "around", "between", "against", "over", "inside", ]), "complement_coref_cluster_id": datasets.Value("string"), }], "coref": [{ "id": datasets.Value("string"), "members": datasets.Sequence(datasets.Value("string")), "np_type": datasets.features.ClassLabel(names=[ "standard", "time/date/measurement", "idiomatic", ]), }], "metadata": { "annotators": { "coref_worker": datasets.Value("int32"), "consolidator_worker": datasets.Value("int32"), "np-relations_worker": datasets.Sequence(datasets.Value("int32")), }, "url": datasets.Value("string"), "source": datasets.Value("string"), }, }) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types features= features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and # specify them. They'll be used if as_supervised=True in builder.as_dataset. # supervised_keys=("sentence", "label"), # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _get_feature_types(self): if self.config_name == "record": return { "predictions": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "prediction_text": datasets.Value("string"), }, "references": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "answers": datasets.Sequence(datasets.Value("string")), }, } elif self.config_name == "multirc": return { "predictions": { "idx": { "answer": datasets.Value("int64"), "paragraph": datasets.Value("int64"), "question": datasets.Value("int64"), }, "prediction": datasets.Value("int64"), }, "references": datasets.Value("int64"), } else: return { "predictions": datasets.Value("int64"), "references": datasets.Value("int64"), }
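# Hedged illustration (not from the original script): the shape of one prediction/reference
# pair that would satisfy the "record" feature types above; the ids and answer strings are
# invented purely for the example.
_example_record_prediction = {
    "idx": {"passage": 0, "query": 0},
    "prediction_text": "Paris",
}
_example_record_reference = {
    "idx": {"passage": 0, "query": 0},
    "answers": ["Paris", "the French capital"],
}
# For "multirc" the prediction carries an idx dict plus an int64 "prediction", and the
# reference is a plain int64 label; every other config uses int64 values on both sides.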
def _info(self): # TODO(xtreme): Specifies the datasets.DatasetInfo object features = {text_feature: datasets.Value("string") for text_feature in six.iterkeys(self.config.text_features)} if "answers" in features.keys(): features["answers"] = datasets.features.Sequence( {"answer_start": datasets.Value("int32"), "text": datasets.Value("string")} ) if self.config.name.startswith("PAWS-X"): features["label"] = datasets.Value("string") if self.config.name == "XNLI": features["gold_label"] = datasets.Value("string") if self.config.name.startswith("udpos"): features = datasets.Features( { "token": datasets.Value("string"), "pos_tag": datasets.features.ClassLabel( names=[ "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X", ] ), } ) if self.config.name.startswith("PAN-X"): features = datasets.Features( { "tokens": datasets.Sequence(datasets.Value("string")), "ner_tags": datasets.Sequence( datasets.features.ClassLabel( names=[ "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", ] ) ), "langs": datasets.Sequence(datasets.Value("string")), } ) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=self.config.description + "\n" + _DESCRIPTION, # datasets.features.FeatureConnectors features=datasets.Features( features # These are the features of your dataset like images, labels ... ), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url, citation=self.config.citation + "\n" + _CITATION, )
def _info(self): features = datasets.Features({ "event": datasets.Value("string"), "oEffect": datasets.Sequence(datasets.Value("string")), "oReact": datasets.Sequence(datasets.Value("string")), "oWant": datasets.Sequence(datasets.Value("string")), "xAttr": datasets.Sequence(datasets.Value("string")), "xEffect": datasets.Sequence(datasets.Value("string")), "xIntent": datasets.Sequence(datasets.Value("string")), "xNeed": datasets.Sequence(datasets.Value("string")), "xReact": datasets.Sequence(datasets.Value("string")), "xWant": datasets.Sequence(datasets.Value("string")), "prefix": datasets.Sequence(datasets.Value("string")), "split": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "qid": datasets.Value("string"), "question": datasets.Value("string"), "answer": datasets.features.Sequence( { "answer_type": datasets.Value("string"), "answer_argument": datasets.Value("string"), "entity_name": datasets.Value("string"), } ), "function": datasets.Value("string"), "num_node": datasets.Value("int32"), "num_edge": datasets.Value("int32"), "graph_query": { "nodes": datasets.features.Sequence( { "nid": datasets.Value("int32"), "node_type": datasets.Value("string"), "id": datasets.Value("string"), "class": datasets.Value("string"), "friendly_name": datasets.Value("string"), "question_node": datasets.Value("int32"), "function": datasets.Value("string"), } ), "edges": datasets.features.Sequence( { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "relation": datasets.Value("string"), "friendly_name": datasets.Value("string"), } ), }, "sparql_query": datasets.Value("string"), "domains": datasets.features.Sequence(datasets.Value("string")), "level": datasets.Value("string"), "s_expression": datasets.Value("string"), } ), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage="https://dki-lab.github.io/GrailQA/", citation=_CITATION, )
class CodeXGlueTcNLCodeSearchAdvImpl(CodeXGlueCtCodeToTextBaseImpl): LANGUAGE = "python" SINGLE_LANGUAGE = True _FEATURES = { "id": datasets.Value("int32"), # Index of the sample "repo": datasets.Value("string"), # repo: the owner/repo "path": datasets.Value("string"), # path: the full path to the original file "func_name": datasets.Value("string"), # func_name: the function or method name "original_string": datasets.Value("string"), # original_string: the raw string before tokenization or parsing "language": datasets.Value("string"), # language: the programming language "code": datasets.Value("string"), # code/function: the part of the original_string that is code "code_tokens": datasets.features.Sequence( datasets.Value("string") ), # code_tokens/function_tokens: tokenized version of code "docstring": datasets.Value( "string" ), # docstring: the top-level comment or docstring, if it exists in the original string "docstring_tokens": datasets.features.Sequence( datasets.Value("string") ), # docstring_tokens: tokenized version of docstring "sha": datasets.Value("string"), # sha of the file "url": datasets.Value("string"), # url of the file "docstring_summary": datasets.Value("string"), # Summary of the docstring "parameters": datasets.Value("string"), # parameters of the function "return_statement": datasets.Value("string"), # return statement "argument_list": datasets.Value("string"), # list of arguments of the function "identifier": datasets.Value("string"), # identifier "nwo": datasets.Value("string"), # nwo "score": datasets.Value("float"), # score for this search } def post_process(self, split_name, language, js): for suffix in "_tokens", "": key = "function" + suffix if key in js: js["code" + suffix] = js[key] del js[key] for key in self._FEATURES: if key not in js: if key == "score": js[key] = -1 else: js[key] = "" return js def generate_urls(self, split_name): for e in super().generate_urls(split_name, self.LANGUAGE): yield e def get_data_files(self, split_name, file_paths, language): if split_name == "train": return super().get_data_files(split_name, file_paths, language) else: data_set_path = file_paths["dataset"] data_file = os.path.join(data_set_path, "dataset", "test_code.jsonl") return [data_file] def _generate_examples(self, split_name, file_paths): for e in super()._generate_examples(split_name, file_paths, self.LANGUAGE): yield e
class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild):
    _DESCRIPTION = _DESCRIPTION
    _CITATION = _CITATION

    # For each file, each line in the uncompressed file represents one function.
    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "repo": datasets.Value("string"),  # repo: the owner/repo
        "path": datasets.Value("string"),  # path: the full path to the original file
        "func_name": datasets.Value("string"),  # func_name: the function or method name
        "original_string": datasets.Value("string"),  # original_string: the raw string before tokenization or parsing
        "language": datasets.Value("string"),  # language: the programming language name
        "code": datasets.Value("string"),  # code/function: the part of the original_string that is code
        "code_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # code_tokens/function_tokens: tokenized version of code
        "docstring": datasets.Value(
            "string"
        ),  # docstring: the top-level comment or docstring, if it exists in the original string
        "docstring_tokens": datasets.features.Sequence(
            datasets.Value("string")
        ),  # docstring_tokens: tokenized version of docstring
        "sha": datasets.Value("string"),  # sha of the file
        "url": datasets.Value("string"),  # url of the file
    }

    _SUPERVISED_KEYS = ["docstring", "docstring_tokens"]

    def generate_urls(self, split_name, language):
        yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip"
        yield "dataset", "dataset.zip"

    def get_data_files(self, split_name, file_paths, language):
        language_specific_path = file_paths["language"]
        final_path = os.path.join(language_specific_path, language, "final")
        # Make some cleanup to save space: drop the pickled files shipped with the archive.
        for path in os.listdir(final_path):
            if path.endswith(".pkl"):
                os.unlink(os.path.join(final_path, path))

        data_files = []
        for root, dirs, files in os.walk(final_path):
            for file in files:
                temp = os.path.join(root, file)
                if ".jsonl" in temp:
                    if split_name in temp:
                        data_files.append(temp)
        return data_files

    def post_process(self, split_name, language, js):
        return js

    def _generate_examples(self, split_name, file_paths, language):
        import gzip

        data_set_path = file_paths["dataset"]
        data_files = self.get_data_files(split_name, file_paths, language)

        urls = {}
        f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"]
        if self.SINGLE_LANGUAGE:
            del f1_path_parts[2]

        f1_path = os.path.join(*f1_path_parts)
        with open(f1_path, encoding="utf-8") as f1:
            for line in f1:
                line = line.strip()
                urls[line] = True

        idx = 0
        for file in data_files:
            if ".gz" in file:
                f = gzip.open(file)
            else:
                f = open(file, encoding="utf-8")

            for line in f:
                line = line.strip()
                js = json.loads(line)
                if js["url"] in urls:
                    js["id"] = idx
                    js = self.post_process(split_name, language, js)
                    if "partition" in js:
                        del js["partition"]
                    yield idx, js
                    idx += 1
            f.close()
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "ID": datasets.Value("string"), "Text": datasets.Value("string"), "Pronoun": datasets.Value("string"), "Pronoun-offset": datasets.Value("int32"), "A": datasets.Value("string"), "A-offset": datasets.Value("int32"), "A-coref": datasets.Value("bool"), "B": datasets.Value("string"), "B-offset": datasets.Value("int32"), "B-coref": datasets.Value("bool"), "URL": datasets.Value("string"), } ), supervised_keys=None, homepage="https://github.com/google-research-datasets/gap-coreference", citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "id": datasets.Value("string"), "source": datasets.Value("string"), "tokens": datasets.Sequence(datasets.Value("string")), "ner_tags": datasets.Sequence( datasets.features.ClassLabel(names=[ "O", "B-LOC", "I-LOC", "B-LOCderiv", "I-LOCderiv", "B-LOCpart", "I-LOCpart", "B-ORG", "I-ORG", "B-ORGderiv", "I-ORGderiv", "B-ORGpart", "I-ORGpart", "B-OTH", "I-OTH", "B-OTHderiv", "I-OTHderiv", "B-OTHpart", "I-OTHpart", "B-PER", "I-PER", "B-PERderiv", "I-PERderiv", "B-PERpart", "I-PERpart", ])), "nested_ner_tags": datasets.Sequence( datasets.features.ClassLabel(names=[ "O", "B-LOC", "I-LOC", "B-LOCderiv", "I-LOCderiv", "B-LOCpart", "I-LOCpart", "B-ORG", "I-ORG", "B-ORGderiv", "I-ORGderiv", "B-ORGpart", "I-ORGpart", "B-OTH", "I-OTH", "B-OTHderiv", "I-OTHderiv", "B-OTHpart", "I-OTHpart", "B-PER", "I-PER", "B-PERderiv", "I-PERderiv", "B-PERpart", "I-PERpart", ])), }), supervised_keys=None, homepage="https://sites.google.com/site/germeval2014ner/", citation=_CITATION, )
def _info(self): # TODO(quartz): Specifies the datasets.DatasetInfo object return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # datasets.features.FeatureConnectors features=datasets.Features({ # These are the features of your dataset like images, labels ... "id": datasets.Value("string"), "question": datasets.Value("string"), "choices": datasets.features.Sequence({ "text": datasets.Value("string"), "label": datasets.Value("string") }), "answerKey": datasets.Value("string"), "para": datasets.Value("string"), "para_id": datasets.Value("string"), "para_anno": { "effect_prop": datasets.Value("string"), "cause_dir_str": datasets.Value("string"), "effect_dir_str": datasets.Value("string"), "cause_dir_sign": datasets.Value("string"), "effect_dir_sign": datasets.Value("string"), "cause_prop": datasets.Value("string"), }, "question_anno": { "more_effect_dir": datasets.Value("string"), "less_effect_dir": datasets.Value("string"), "less_cause_prop": datasets.Value("string"), "more_effect_prop": datasets.Value("string"), "less_effect_prop": datasets.Value("string"), "less_cause_dir": datasets.Value("string"), }, }), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage="https://allenai.org/data/quartz", citation=_CITATION, )
def _info(self): """ Specify the datasets.DatasetInfo object which contains information and typings for the dataset. """ return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types. features=datasets.Features({ "swda_filename": datasets.Value("string"), "ptb_basename": datasets.Value("string"), "conversation_no": datasets.Value("int64"), "transcript_index": datasets.Value("int64"), "act_tag": datasets.ClassLabel(num_classes=217, names=_ACT_TAGS), "damsl_act_tag": datasets.ClassLabel(num_classes=43, names=_DAMSL_ACT_TAGS), "caller": datasets.Value("string"), "utterance_index": datasets.Value("int64"), "subutterance_index": datasets.Value("int64"), "text": datasets.Value("string"), "pos": datasets.Value("string"), "trees": datasets.Value("string"), "ptb_treenumbers": datasets.Value("string"), "talk_day": datasets.Value("string"), "length": datasets.Value("int64"), "topic_description": datasets.Value("string"), "prompt": datasets.Value("string"), "from_caller": datasets.Value("int64"), "from_caller_sex": datasets.Value("string"), "from_caller_education": datasets.Value("int64"), "from_caller_birth_year": datasets.Value("int64"), "from_caller_dialect_area": datasets.Value("string"), "to_caller": datasets.Value("int64"), "to_caller_sex": datasets.Value("string"), "to_caller_education": datasets.Value("int64"), "to_caller_birth_year": datasets.Value("int64"), "to_caller_dialect_area": datasets.Value("string"), }), supervised_keys=None, # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "id": datasets.Value("int32"), "dialogId": datasets.Value("int32"), "context": datasets.Value("string"), "users": [{"userType": datasets.Value("string"), "id": datasets.Value("string")}], "evaluation": [ { "breadth": datasets.Value("int32"), "userId": datasets.Value("string"), "quality": datasets.Value("int32"), "engagement": datasets.Value("int32"), } ], "thread": [ { "evaluation": datasets.Value("int32"), "text": datasets.Value("string"), "userId": datasets.Value("string"), "time": datasets.Value("int32"), } ], } ), supervised_keys=None, homepage="https://github.com/DeepPavlov/convai/tree/master/2017", )
def _info(self): features = datasets.Features( { "text": datasets.Value("string"), "label": datasets.features.ClassLabel( names=[ "activate_my_card", "age_limit", "apple_pay_or_google_pay", "atm_support", "automatic_top_up", "balance_not_updated_after_bank_transfer", "balance_not_updated_after_cheque_or_cash_deposit", "beneficiary_not_allowed", "cancel_transfer", "card_about_to_expire", "card_acceptance", "card_arrival", "card_delivery_estimate", "card_linking", "card_not_working", "card_payment_fee_charged", "card_payment_not_recognised", "card_payment_wrong_exchange_rate", "card_swallowed", "cash_withdrawal_charge", "cash_withdrawal_not_recognised", "change_pin", "compromised_card", "contactless_not_working", "country_support", "declined_card_payment", "declined_cash_withdrawal", "declined_transfer", "direct_debit_payment_not_recognised", "disposable_card_limits", "edit_personal_details", "exchange_charge", "exchange_rate", "exchange_via_app", "extra_charge_on_statement", "failed_transfer", "fiat_currency_support", "get_disposable_virtual_card", "get_physical_card", "getting_spare_card", "getting_virtual_card", "lost_or_stolen_card", "lost_or_stolen_phone", "order_physical_card", "passcode_forgotten", "pending_card_payment", "pending_cash_withdrawal", "pending_top_up", "pending_transfer", "pin_blocked", "receiving_money", "Refund_not_showing_up", "request_refund", "reverted_card_payment?", "supported_cards_and_currencies", "terminate_account", "top_up_by_bank_transfer_charge", "top_up_by_card_charge", "top_up_by_cash_or_cheque", "top_up_failed", "top_up_limits", "top_up_reverted", "topping_up_by_card", "transaction_charged_twice", "transfer_fee_charged", "transfer_into_account", "transfer_not_received_by_recipient", "transfer_timing", "unable_to_verify_identity", "verify_my_identity", "verify_source_of_funds", "verify_top_up", "virtual_card_not_working", "visa_or_mastercard", "why_verify_identity", "wrong_amount_of_cash_received", "wrong_exchange_rate_for_cash_withdrawal", ] ), } ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): features = { feature: datasets.Value("string") for feature in self.config.features } if self.config.name.startswith("wsc"): features["span1_index"] = datasets.Value("int32") features["span2_index"] = datasets.Value("int32") if self.config.name == "wic": features["start1"] = datasets.Value("int32") features["start2"] = datasets.Value("int32") features["end1"] = datasets.Value("int32") features["end2"] = datasets.Value("int32") if self.config.name == "multirc": features["idx"] = dict({ "paragraph": datasets.Value("int32"), "question": datasets.Value("int32"), "answer": datasets.Value("int32"), }) elif self.config.name == "record": features["idx"] = dict({ "passage": datasets.Value("int32"), "query": datasets.Value("int32"), }) else: features["idx"] = datasets.Value("int32") if self.config.name == "record": # Entities are the set of possible choices for the placeholder. features["entities"] = datasets.features.Sequence( datasets.Value("string")) # Answers are the subset of entities that are correct. features["answers"] = datasets.features.Sequence( datasets.Value("string")) else: features["label"] = datasets.features.ClassLabel( names=self.config.label_classes) return datasets.DatasetInfo( description=_GLUE_DESCRIPTION + self.config.description, features=datasets.Features(features), homepage=self.config.url, citation=self.config.citation + "\n" + _SUPER_GLUE_CITATION, )
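# Illustrative sketch (not part of the original loader): ClassLabel stores labels as integer
# ids, and str2int / int2str convert between the ids and the names declared above. The label
# names below are placeholders, not a real config's label_classes.
_example_label_feature = datasets.features.ClassLabel(names=["entailment", "not_entailment"])
assert _example_label_feature.str2int("entailment") == 0
assert _example_label_feature.int2str(1) == "not_entailment"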
def _info(self): if self.config.name == "evaluation_dataset": features = datasets.Features({ "stackoverflow_id": datasets.Value("int32"), "question": datasets.Value("string"), "question_url": datasets.Value("string"), "question_author": datasets.Value("string"), "question_author_url": datasets.Value("string"), "answer": datasets.Value("string"), "answer_url": datasets.Value("string"), "answer_author": datasets.Value("string"), "answer_author_url": datasets.Value("string"), "examples": datasets.features.Sequence(datasets.Value("int32")), "examples_url": datasets.features.Sequence(datasets.Value("string")), }) else: features = datasets.Features({ "id": datasets.Value("int32"), "filepath": datasets.Value("string"), "method_name": datasets.Value("string"), "start_line": datasets.Value("int32"), "end_line": datasets.Value("int32"), "url": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): span_features = { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "string": datasets.Value("string"), } reference_features = { "start": datasets.Value("int32"), "end": datasets.Value("int32"), "bridge": datasets.Value("bool_"), "string": datasets.Value("string"), } return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "example_id": datasets.Value("int64"), "title_text": datasets.Value("string"), "url": datasets.Value("string"), "question": datasets.Value("string"), "paragraph_text": datasets.Value("string"), "sentence_starts": datasets.Sequence(datasets.Value("int32")), "original_nq_answers": [span_features], "annotation": { "referential_equalities": [ { "question_reference": span_features, "sentence_reference": reference_features, } ], "answer": [ { "sentence_reference": reference_features, "paragraph_reference": span_features, } ], "explanation_type": datasets.Value("string"), "selected_sentence": span_features, }, } ), supervised_keys=None, homepage=_HOMEPAGE, citation=_CITATION, )
} """ _DESCRIPTION = """\ A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification. It is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace("[n]", "\n")`) and then use them for your purposes. """ _HOMEPAGE = "https://github.com/hooshvare/pn-summary" _LICENSE = "MIT License" _URLs = { "1.0.0": { "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download", "features": [ {"name": "id", "type": datasets.Value("string")}, {"name": "title", "type": datasets.Value("string")}, {"name": "article", "type": datasets.Value("string")}, {"name": "summary", "type": datasets.Value("string")}, { "name": "category", "type": datasets.ClassLabel( names=[ "Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation",
def _info(self): if self.config.name == "dialogue_domain": features = datasets.Features({ "dial_id": datasets.Value("string"), "doc_id": datasets.Value("string"), "domain": datasets.Value("string"), "turns": [{ "turn_id": datasets.Value("int32"), "role": datasets.Value("string"), "da": datasets.Value("string"), "references": [{ "sp_id": datasets.Value("string"), "label": datasets.Value("string"), }], "utterance": datasets.Value("string"), }], }) elif self.config.name == "document_domain": features = datasets.Features({ "domain": datasets.Value("string"), "doc_id": datasets.Value("string"), "title": datasets.Value("string"), "doc_text": datasets.Value("string"), "spans": [{ "id_sp": datasets.Value("string"), "tag": datasets.Value("string"), "start_sp": datasets.Value("int32"), "end_sp": datasets.Value("int32"), "text_sp": datasets.Value("string"), "title": datasets.Value("string"), "parent_titles": datasets.Value("string"), "id_sec": datasets.Value("string"), "start_sec": datasets.Value("int32"), "text_sec": datasets.Value("string"), "end_sec": datasets.Value("int32"), }], "doc_html_ts": datasets.Value("string"), "doc_html_raw": datasets.Value("string"), }) elif self.config.name == "doc2dial_rc": features = datasets.Features({ "id": datasets.Value("string"), "title": datasets.Value("string"), "context": datasets.Value("string"), "question": datasets.Value("string"), "answers": datasets.features.Sequence({ "text": datasets.Value("string"), "answer_start": datasets.Value("int32"), }), "domain": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, citation=_CITATION, )
def _info(self): if self.config.name == "trex": features = datasets.Features({ "uuid": datasets.Value("string"), "obj_uri": datasets.Value("string"), "obj_label": datasets.Value("string"), "sub_uri": datasets.Value("string"), "sub_label": datasets.Value("string"), "predicate_id": datasets.Value("string"), "sub_surface": datasets.Value("string"), "obj_surface": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "template": datasets.Value("string"), "template_negated": datasets.Value("string"), "label": datasets.Value("string"), "description": datasets.Value("string"), "type": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "conceptnet": features = datasets.Features({ "uuid": datasets.Value("string"), "sub": datasets.Value("string"), "obj": datasets.Value("string"), "pred": datasets.Value("string"), "obj_label": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "negated": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "squad": features = datasets.Features({ "id": datasets.Value("string"), "sub_label": datasets.Value("string"), "obj_label": datasets.Value("string"), "negated": datasets.Value("string"), "masked_sentence": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) elif self.config.name == "google_re": features = datasets.Features({ "pred": datasets.Value("string"), "sub": datasets.Value("string"), "obj": datasets.Value("string"), "evidences": datasets.Value("string"), "judgments": datasets.Value("string"), "sub_w": datasets.Value("string"), "sub_label": datasets.Value("string"), "sub_aliases": datasets.Value("string"), "obj_w": datasets.Value("string"), "obj_label": datasets.Value("string"), "obj_aliases": datasets.Value("string"), "uuid": datasets.Value("string"), "masked_sentence": datasets.Value("string"), "template": datasets.Value("string"), "template_negated": datasets.Value("string"), }) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
f"https://the-eye.eu/public/AI/pile/train/{i:0>2}.jsonl.zst" for i in range(30) ], "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"], "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"], }, "free_law": "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst", "pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz", } _FEATURES = { "all": datasets.Features({ "text": datasets.Value("string"), "meta": { "pile_set_name": datasets.Value("string") }, }), "free_law": datasets.Features({ "text": datasets.Value("string"), "meta": { "case_ID": datasets.Value("string"), "case_jurisdiction": datasets.Value("string"), "date_created": datasets.Value("string"), }, }), "pubmed_central": datasets.Features({
def features(self): if self.name == "simplified": return { "text": datasets.Value("string"), "labels": datasets.Sequence(datasets.ClassLabel(names=_CLASS_NAMES)), "id": datasets.Value("string"), } elif self.name == "raw": d = { "text": datasets.Value("string"), "id": datasets.Value("string"), "author": datasets.Value("string"), "subreddit": datasets.Value("string"), "link_id": datasets.Value("string"), "parent_id": datasets.Value("string"), "created_utc": datasets.Value("float"), "rater_id": datasets.Value("int32"), "example_very_unclear": datasets.Value("bool"), } d.update( {label: datasets.Value("int32") for label in _CLASS_NAMES}) return d
def _info(self):
    # TODO: This method specifies the datasets.DatasetInfo object which contains the information and typings for the dataset
    features = datasets.Features(
        {
            "id": datasets.Value("int32"),
            "category": datasets.Value("string"),
            "text": datasets.Value("string"),
            "ner": datasets.features.Sequence(
                {
                    "source": {
                        "from": datasets.Value("int32"),
                        "text": datasets.Value("string"),
                        "to": datasets.Value("int32"),
                        "type": datasets.features.ClassLabel(
                            names=[
                                "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
                                "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
                                "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
                            ]
                        ),
                    },
                    "target": {
                        "from": datasets.Value("int32"),
                        "text": datasets.Value("string"),
                        "to": datasets.Value("int32"),
                        "type": datasets.features.ClassLabel(
                            names=[
                                "PRODUCT_NAME", "PRODUCT_NAME_IMP", "PRODUCT_NO_BRAND",
                                "BRAND_NAME", "BRAND_NAME_IMP", "VERSION",
                                "PRODUCT_ADJ", "BRAND_ADJ", "LOCATION", "LOCATION_IMP",
                            ]
                        ),
                    },
                }
            ),
        }
    )
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types.
        features=features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features({ "document": { "id": datasets.Value("string"), "kind": datasets.Value("string"), "url": datasets.Value("string"), "file_size": datasets.Value("int32"), "word_count": datasets.Value("int32"), "start": datasets.Value("string"), "end": datasets.Value("string"), "summary": { "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), "url": datasets.Value("string"), "title": datasets.Value("string"), }, "text": datasets.Value("string"), }, "question": { "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), }, "answers": [{ "text": datasets.Value("string"), "tokens": datasets.features.Sequence(datasets.Value("string")), }], }), supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self):
    # TODO: This method specifies the datasets.DatasetInfo object which contains the information and typings for the dataset
    features = datasets.Features({
        "full_text": datasets.Value("string"),
        "text_translation": datasets.Value("string"),
        "screen_name": datasets.Value("string"),
        "description": datasets.Value("string"),
        "desc_translation": datasets.Value("string"),
        "location": datasets.Value("string"),
        "weekofyear": datasets.Value("int64"),
        "weekday": datasets.Value("int64"),
        "month": datasets.Value("int64"),
        "year": datasets.Value("int64"),
        "day": datasets.Value("int64"),
        "point_info": datasets.Value("string"),
        "point": datasets.Value("string"),
        "latitude": datasets.Value("float64"),
        "longitude": datasets.Value("float64"),
        "altitude": datasets.Value("float64"),
        "province": datasets.Value("string"),
        "hisco_standard": datasets.Value("string"),
        "hisco_code": datasets.Value("string"),
        "industry": datasets.Value("bool_"),
        "sentiment_pattern": datasets.Value("float64"),
        "subjective_pattern": datasets.Value("float64"),
        "label": datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"], names_file=None, id=None),
    })
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types.
        features=features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
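# Hedged aside (illustration only, not part of the loader): when both num_classes and names
# are passed to ClassLabel, as for the "label" feature above, they must describe the same set
# of classes; the integer ids simply follow the order of the names.
_label_demo = datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
assert _label_demo.num_classes == len(_label_demo.names) == 3
assert _label_demo.int2str(2) == "pos"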
def _info(self): if self.config.name == "schema": features = datasets.Features({ "service_name": datasets.Value("string"), "description": datasets.Value("string"), "slots": datasets.Sequence({ "name": datasets.Value("string"), "description": datasets.Value("string"), "is_categorical": datasets.Value("bool"), "possible_values": datasets.Sequence(datasets.Value("string")), }), "intents": datasets.Sequence( { "name": datasets.Value("string"), "description": datasets.Value("string"), "is_transactional": datasets.Value("bool"), "required_slots": datasets.Sequence(datasets.Value("string")), # optional_slots was originally a dictionary "optional_slots": datasets.Sequence( { "slot_name": datasets.Value("string"), "slot_value": datasets.Value("string"), }), "result_slots": datasets.Sequence(datasets.Value("string")), }, ), }) else: features = datasets.Features({ "dialogue_id": datasets.Value("string"), "services": datasets.Sequence(datasets.Value("string")), "turns": datasets.Sequence({ "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]), "utterance": datasets.Value("string"), "frames": datasets.Sequence({ "service": datasets.Value("string"), "slots": datasets.Sequence({ "slot": datasets.Value("string"), "start": datasets.Value("int32"), "exclusive_end": datasets.Value("int32"), }), # optional "state": { "active_intent": datasets.Value("string"), "requested_slots": datasets.Sequence(datasets.Value("string")), # slot_values was originally a dictionary "slot_values": datasets.Sequence({ "slot_name": datasets.Value("string"), "slot_value_list": datasets.Sequence(datasets.Value("string")), }), }, "actions": datasets.Sequence({ "act": datasets.ClassLabel(names=_ALL_ACTS), # optional "slot": datasets.Value("string"), # optional "canonical_values": datasets.Sequence(datasets.Value("string")), # optional "values": datasets.Sequence(datasets.Value("string")), }), # optional "service_results": datasets.Sequence( # Arrow doesn't like Sequences of Sequences for default values so we need a Sequence of Features of Sequences { "service_results_list": datasets.Sequence( # originally each list item was a dictionary (optional) { "service_slot_name": datasets.Value("string"), "service_canonical_value": datasets.Value("string"), }) }), # optional "service_call": { "method": datasets.Value("string"), # parameters was originally a dictionary "parameters": datasets.Sequence({ "parameter_slot_name": datasets.Value("string"), "parameter_canonical_value": datasets.Value("string"), }), }, }), }), }) return datasets.DatasetInfo( description=_DESCRIPTION, features= features, # Here we define them above because they are different between the two configurations supervised_keys=None, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, )
def _info(self): features = datasets.Features({ "dialogue_id": datasets.Value("string"), "services": datasets.Sequence(datasets.Value("string")), "turns": datasets.Sequence({ "turn_id": datasets.Value("string"), "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]), "utterance": datasets.Value("string"), "frames": datasets.Sequence({ "service": datasets.Value("string"), "state": { "active_intent": datasets.Value("string"), "requested_slots": datasets.Sequence(datasets.Value("string")), "slots_values": datasets.Sequence({ "slots_values_name": datasets.Value("string"), "slots_values_list": datasets.Sequence(datasets.Value("string")), }), }, "slots": datasets.Sequence({ "slot": datasets.Value("string"), "value": datasets.Value("string"), "start": datasets.Value("int32"), "exclusive_end": datasets.Value("int32"), "copy_from": datasets.Value("string"), "copy_from_value": datasets.Sequence(datasets.Value("string")), }), }), "dialogue_acts": datasets.Features({ "dialog_act": datasets.Sequence({ "act_type": datasets.Value("string"), "act_slots": datasets.Sequence( datasets.Features({ "slot_name": datasets.Value("string"), "slot_value": datasets.Value("string"), }), ), }), "span_info": datasets.Sequence({ "act_type": datasets.Value("string"), "act_slot_name": datasets.Value("string"), "act_slot_value": datasets.Value("string"), "span_start": datasets.Value("int32"), "span_end": datasets.Value("int32"), }), }), }), }) return datasets.DatasetInfo( description=_DESCRIPTION, features= features, # Here we define them above because they are different between the two configurations supervised_keys=None, homepage= "https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2", license=_LICENSE, citation=_CITATION, )