def _info(self):
    if self.config_name not in [
        "sst2", "mnli", "mnli_mismatched", "mnli_matched", "cola", "stsb",
        "mrpc", "qqp", "qnli", "rte", "wnli", "hans",
    ]:
        raise KeyError(
            'You should supply a configuration name selected in '
            '["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
            '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
        )
    return nlp.MetricInfo(
        description=_DESCRIPTION,
        citation=_CITATION,
        inputs_description=_KWARGS_DESCRIPTION,
        features=nlp.Features({
            'predictions': nlp.Value('int64' if self.config_name != 'stsb' else 'float32'),
            'references': nlp.Value('int64' if self.config_name != 'stsb' else 'float32'),
        }),
        codebase_urls=[],
        reference_urls=[],
        format='numpy',
    )
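# Minimal usage sketch for the metric info defined above (assumption: this
# _info belongs to the GLUE metric script and is loaded via nlp.load_metric;
# the "mrpc" config and the toy predictions below are illustrative only).
import nlp

glue_metric = nlp.load_metric("glue", "mrpc")
# Predictions/references must match the declared features: int64 for the
# classification configs, float32 for "stsb".
results = glue_metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(results)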
def _info(self):
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features(
            {
                "string": nlp.Value("string"),
                "sectionName": nlp.Value("string"),
                "label": nlp.features.ClassLabel(names=["method", "background", "result"]),
                "citingPaperId": nlp.Value("string"),
                "citedPaperId": nlp.Value("string"),
                "excerpt_index": nlp.Value("int32"),
                "isKeyCitation": nlp.Value("bool"),
                "label2": nlp.features.ClassLabel(
                    names=["supportive", "not_supportive", "cant_determine", "none"]
                ),
                "citeEnd": nlp.Value("int64"),
                "citeStart": nlp.Value("int64"),
                "source": nlp.features.ClassLabel(names=_SOURCE_NAMES),
                "label_confidence": nlp.Value("float32"),
                "label2_confidence": nlp.Value("float32"),
                "id": nlp.Value("string"),
            }
        ),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="https://github.com/allenai/scicite",
        citation=_CITATION,
    )
def _info(self):
    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features({
            'whoTarget': nlp.Value("string"),
            'intentYN': nlp.Value("string"),
            'sexYN': nlp.Value("string"),
            'sexReason': nlp.Value("string"),
            'offensiveYN': nlp.Value("string"),
            'annotatorGender': nlp.Value("string"),
            'annotatorMinority': nlp.Value("string"),
            'sexPhrase': nlp.Value("string"),
            'speakerMinorityYN': nlp.Value("string"),
            'WorkerId': nlp.Value("string"),
            'HITId': nlp.Value("string"),
            'annotatorPolitics': nlp.Value("string"),
            'annotatorRace': nlp.Value("string"),
            'annotatorAge': nlp.Value("string"),
            'post': nlp.Value("string"),
            'targetMinority': nlp.Value("string"),
            'targetCategory': nlp.Value("string"),
            'targetStereotype': nlp.Value("string"),
        }),
        # No default supervised_keys.
        supervised_keys=None,
        homepage="https://homes.cs.washington.edu/~msap/social-bias-frames/",
        citation=_CITATION,
    )
def _info(self):
    # TODO(qangaroo): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            "query": nlp.Value("string"),
            "supports": nlp.features.Sequence({"support": nlp.Value("string")}),
            "candidates": nlp.features.Sequence({"candidate": nlp.Value("string")}),
            "answer": nlp.Value("string"),
            "id": nlp.Value("string"),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="http://qangaroo.cs.ucl.ac.uk/index.html",
        citation=_CITATION,
    )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({ "id": nlp.Value("string"), "title": nlp.Value("string"), "context": nlp.Value("string"), "question": nlp.Value("string"), "answers": nlp.features.Sequence({ "text": nlp.Value("string"), "answer_start": nlp.Value("int32") }), }), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage= "https://github.com/husseinmozannar/SOQAL/tree/master/data", citation=_CITATION, )
def _info(self):
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features(
            {
                # These are the features of your dataset like images, labels ...
                "text": nlp.Value("string"),
                "summary": nlp.Value("string"),
                "topic": nlp.Value("string"),
                "url": nlp.Value("string"),
                "title": nlp.Value("string"),
                "date": nlp.Value("string"),
            }
        ),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="",
        citation=_CITATION,
    )
def _info(self):
    # TODO(mlqa): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            'context': nlp.Value('string'),
            'questions': nlp.features.Sequence({
                'question': nlp.Value('string'),
            }),
            'answers': nlp.features.Sequence({
                "text": nlp.Value('string'),
                "answer_start": nlp.Value('int32'),
            }),
            'ids': nlp.features.Sequence({
                'idx': nlp.Value('string'),
            }),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage='https://github.com/facebookresearch/MLQA',
        citation=_CITATION,
    )
def _info(self):
    # TODO(openBookQA): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            "id": nlp.Value("string"),
            "question_stem": nlp.Value("string"),
            "choices": nlp.features.Sequence({
                "text": nlp.Value("string"),
                "label": nlp.Value("string"),
            }),
            "answerKey": nlp.Value("string"),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="https://allenai.org/data/open-book-qa",
        citation=_CITATION,
    )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({ "q_id": nlp.Value("string"), "title": nlp.Value("string"), "selftext": nlp.Value("string"), "document": nlp.Value("string"), "subreddit": nlp.Value("string"), "answers": nlp.features.Sequence({ "a_id": nlp.Value("string"), "text": nlp.Value("string"), "score": nlp.Value("int32") }), "title_urls": nlp.features.Sequence(nlp.Value("string")), "selftext_urls": nlp.features.Sequence(nlp.Value("string")), "answers_urls": nlp.features.Sequence(nlp.Value("string")), }), supervised_keys=None, homepage="https://facebookresearch.github.io/ELI5/explore.html", citation=_CITATION, )
def _info(self):
    # TODO(discofuse): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features(
            {
                # These are the features of your dataset like images, labels ...
                "connective_string": nlp.Value("string"),
                "discourse_type": nlp.Value("string"),
                "coherent_second_sentence": nlp.Value("string"),
                "has_coref_type_pronoun": nlp.Value("float32"),
                "incoherent_first_sentence": nlp.Value("string"),
                "incoherent_second_sentence": nlp.Value("string"),
                "has_coref_type_nominal": nlp.Value("float32"),
                "coherent_first_sentence": nlp.Value("string"),
            }
        ),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="https://github.com/google-research-datasets/discofuse",
        citation=_CITATION,
    )
def _info(self):
    # TODO(lc_quad): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            "NNQT_question": nlp.Value("string"),
            "uid": nlp.Value("int32"),
            "subgraph": nlp.Value("string"),
            "template_index": nlp.Value("int32"),
            "question": nlp.Value("string"),
            "sparql_wikidata": nlp.Value("string"),
            "sparql_dbpedia18": nlp.Value("string"),
            "template": nlp.Value("string"),
            # "template_id": nlp.Value('string'),
            "paraphrased_question": nlp.Value("string"),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="http://lc-quad.sda.tech/",
        citation=_CITATION,
    )
def _info(self):
    return nlp.DatasetInfo(
        description=_DESCRIPTION + "\n" + self.config.description,
        features=nlp.Features({
            "category": nlp.Value("string"),
            "air_date": nlp.Value("string"),
            "question": nlp.Value("string"),
            "value": nlp.Value("string"),
            "answer": nlp.Value("string"),
            "round": nlp.Value("string"),
            "show_number": nlp.Value("int32"),
            "search_results": nlp.features.Sequence({
                "urls": nlp.Value("string"),
                "snippets": nlp.Value("string"),
                "titles": nlp.Value("string"),
                "related_links": nlp.Value("string"),
            }),
        }),
        homepage="https://github.com/nyu-dl/dl4ir-searchQA",
        citation=_CITATION,
    )
def _info(self):
    # TODO(jeopardy): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features(
            {
                "category": nlp.Value("string"),
                "air_date": nlp.Value("string"),
                "question": nlp.Value("string"),
                "value": nlp.Value("int32"),
                "answer": nlp.Value("string"),
                "round": nlp.Value("string"),
                "show_number": nlp.Value("int32"),
            }
        ),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_URL,
        citation=_CITATION,
    )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({ "id": nlp.Value("string"), "title": nlp.Value("string"), "context": nlp.Value("string"), "question": nlp.Value("string"), "answers": nlp.features.Sequence({ "text": nlp.Value("string"), "answer_start": nlp.Value("int32"), }), }), # No default supervised_keys (as we have to pass both question # and context as input). supervised_keys=None, homepage= "https://modestyachts.github.io/squadshifts-website/index.html", citation=_CITATION, )
def _info(self):
    # TODO(squad_it): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features(
            {
                # These are the features of your dataset like images, labels ...
                "id": nlp.Value("string"),
                "context": nlp.Value("string"),
                "question": nlp.Value("string"),
                "answers": nlp.features.Sequence(
                    {"text": nlp.Value("string"), "answer_start": nlp.Value("int32")}
                ),
            }
        ),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="https://github.com/crux82/squad-it",
        citation=_CITATION,
    )
def _info(self):
    # TODO(cosmos_qa): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            'id': nlp.Value('string'),
            'context': nlp.Value('string'),
            'question': nlp.Value('string'),
            'answer0': nlp.Value('string'),
            'answer1': nlp.Value('string'),
            'answer2': nlp.Value('string'),
            'answer3': nlp.Value('string'),
            'label': nlp.Value('int32'),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage='https://wilburone.github.io/cosmos/',
        citation=_CITATION,
    )
def _info(self):
    # TODO(qasc): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            'id': nlp.Value('string'),
            'question': nlp.Value('string'),
            'choices': nlp.features.Sequence({
                'text': nlp.Value('string'),
                'label': nlp.Value('string'),
            }),
            'answerKey': nlp.Value('string'),
            'fact1': nlp.Value('string'),
            'fact2': nlp.Value('string'),
            'combinedfact': nlp.Value('string'),
            'formatted_question': nlp.Value('string'),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage='https://allenai.org/data/qasc',
        citation=_CITATION,
    )
def _info(self):
    # TODO(xtreme): Specifies the nlp.DatasetInfo object
    features = {text_feature: nlp.Value("string") for text_feature in six.iterkeys(self.config.text_features)}
    if "answers" in features.keys():
        features["answers"] = nlp.features.Sequence(
            {"answer_start": nlp.Value("int32"), "text": nlp.Value("string")}
        )
    if self.config.name.startswith("PAWS-X"):
        features["label"] = nlp.Value("string")
    if self.config.name == "XNLI":
        features["gold_label"] = nlp.Value("string")
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=self.config.description + "\n" + _DESCRIPTION,
        # nlp.features.FeatureConnectors
        # These are the features of your dataset like images, labels ...
        features=nlp.Features(features),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
        citation=self.config.citation + "\n" + _CITATION,
    )
def _info(self):
    # TODO(empathetic_dialogues): Specifies the nlp.DatasetInfo object
    return nlp.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            # These are the features of your dataset like images, labels ...
            'conv_id': nlp.Value('string'),
            'utterance_idx': nlp.Value('int32'),
            'context': nlp.Value('string'),
            'prompt': nlp.Value('string'),
            'speaker_idx': nlp.Value('int32'),
            'utterance': nlp.Value('string'),
            'selfeval': nlp.Value('string'),
            'tags': nlp.Value('string'),
        }),
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage='https://github.com/facebookresearch/EmpatheticDialogues',
        citation=_CITATION,
    )
def _info(self):
    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features({_DOCUMENT: nlp.Value("string"), _SUMMARY: nlp.Value("string")}),
        supervised_keys=(_DOCUMENT, _SUMMARY),
        homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
        citation=_CITATION,
    )
def _info(self):
    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features({_DOCUMENT: nlp.Value("string"), _SUMMARY: nlp.Value("string")}),
        supervised_keys=(_DOCUMENT, _SUMMARY),
        homepage="https://github.com/ryanzhumich/AESLC",
        citation=_CITATION,
    )
def write_flattened_sequence(feats, dummy_data, tmp_dir):
    my_features = nlp.Features(feats)
    writer = ArrowWriter(features=my_features, path=os.path.join(tmp_dir, "beta.arrow"))
    for key, record in dummy_data:
        example = my_features.encode_example(record)
        writer.write(example)
    num_examples, num_bytes = writer.finalize()
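# Hypothetical call to write_flattened_sequence above (the feature spec and
# records are made up for illustration; dummy_data is assumed to yield
# (key, record) pairs, as the loop in the helper implies, and nlp, os and
# ArrowWriter are assumed to be imported at module level).
import tempfile

feats = {"tokens": nlp.features.Sequence(nlp.Value("string"))}
dummy_data = [
    ("item_0", {"tokens": ["hello", "world"]}),
    ("item_1", {"tokens": ["foo", "bar", "baz"]}),
]
with tempfile.TemporaryDirectory() as tmp_dir:
    write_flattened_sequence(feats, dummy_data, tmp_dir)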
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features( { "id": nlp.Value("string"), "text": nlp.Value("string"), "title": nlp.Value("string"), "embeddings": nlp.Sequence(nlp.Value("float32")), } ) if self.config.with_embeddings else nlp.Features({"id": nlp.Value("string"), "text": nlp.Value("string"), "title": nlp.Value("string")}), supervised_keys=None, homepage="https://github.com/facebookresearch/DPR", citation=_CITATION, )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({"text": nlp.Value("string"), "label": nlp.features.ClassLabel(names=["1", "2"]),}), supervised_keys=None, homepage="https://course.fast.ai/datasets", citation=_CITATION, )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({"text": nlp.Value("string"),}), supervised_keys=None, homepage="https://yknzhu.wixsite.com/mbweb", citation=_CITATION, )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({"text": nlp.Value("string")}), supervised_keys=("text", "text"), homepage="http://www.statmt.org/lm-benchmark/", citation=_CITATION, )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({"title": nlp.Value("string"), "text": nlp.Value("string"),}), # No default supervised_keys. supervised_keys=None, homepage="https://dumps.wikimedia.org", citation=_CITATION, )
def _info(self): return nlp.DatasetInfo( description=_DESCRIPTION, features=nlp.Features({"text": nlp.Value("string")}), supervised_keys=None, homepage= "https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt", citation=_CITATION, )
def benchmark_map_filter():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = nlp.Features({"text": nlp.Value("string"), "numbers": nlp.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

        def tokenize(examples):
            return tokenizer(examples["text"])

        # `map` and `filter` here are the timed benchmark helpers defined in this
        # script (not the Python built-ins), which is why they take dataset,
        # function and batched arguments.
        times["map identity"] = map(dataset)
        times["map identity batched"] = map(dataset, batched=True)
        times["map no-op batched"] = map(dataset, function=lambda x: None, batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset, function=lambda x: None, batched=True)
        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset, function=lambda x: None, batched=True)
        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset, function=lambda x: None, batched=True)
        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(dataset, function=lambda x: None, batched=True)

        times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True)
        times["filter"] = filter(dataset)

        # Activate later when the tokenizer supports batched inputs
        # with dataset.formatted_as(type='numpy'):
        #     times[func.__name__ + " fast-tokenizer batched numpy"] = func(dataset, function=tokenize, batched=True)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
def _info(self):
    return nlp.DatasetInfo(
        # nlp.features.FeatureConnectors
        features=nlp.Features({
            "buggy": nlp.Value("string"),
            "fixed": nlp.Value("string"),
        }),
        supervised_keys=None,
    )