def _info(self):
    """Build the ``nlp.DatasetInfo`` describing this dataset's schema.

    Every field is a string; ``topics``, ``places``, ``people``, ``orgs``
    and ``exchanges`` are sequences of strings, the rest are scalars.
    """
    # Declared in the order the columns should appear in the dataset.
    field_order = (
        "text",
        "topics",
        "lewis_split",
        "cgis_split",
        "old_id",
        "new_id",
        "places",
        "people",
        "orgs",
        "exchanges",
        "date",
        "title",
    )
    multi_valued = ("topics", "places", "people", "orgs", "exchanges")

    schema = {}
    for field in field_order:
        if field in multi_valued:
            schema[field] = nlp.Sequence(nlp.Value("string"))
        else:
            schema[field] = nlp.Value("string")

    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features(schema),
        # No default (input, target) pair is declared for this dataset.
        supervised_keys=None,
        homepage="https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html",
        citation=_CITATION,
    )
def _info(self):
    """Build the ``nlp.DatasetInfo`` for the configured task.

    Starts from one string feature per entry in ``self.config.features`` and
    then adds (or overrides) the task-specific columns chosen by
    ``self.config.name``.

    Raises:
        SystemExit: if the config name is not one of ``task1``, ``task2``,
            ``task3_document`` or ``task3_sentence``.
    """
    features = {name: nlp.Value("string") for name in self.config.features}
    task = self.config.name

    if task == "task1":
        features.update(
            {
                "id": nlp.Value("int64"),
                "text": nlp.Value("string"),
                "url": nlp.Value("string"),
                "label": nlp.ClassLabel(names=["0", "1"]),
            }
        )
    elif task == "task2":
        features.update(
            {
                "id": nlp.Value("int64"),
                "label": nlp.ClassLabel(names=["0", "1"]),
                "last": nlp.Value("bool"),
                "sent_num": nlp.Value("int64"),
                "sentence": nlp.Value("string"),
            }
        )
    elif task in ("task3_document", "task3_sentence"):
        # Tag set: all "B-" tags, then all "I-" tags, then "O".
        entity_types = (
            "etime",
            "fname",
            "loc",
            "organizer",
            "participant",
            "place",
            "target",
            "trigger",
        )
        tag_names = [
            prefix + "-" + entity
            for prefix in ("B", "I")
            for entity in entity_types
        ]
        tag_names.append("O")
        features["token"] = nlp.Sequence(nlp.Value("string"))
        features["label"] = nlp.Sequence(nlp.ClassLabel(names=tag_names))
    else:
        raise SystemExit('Invalid task name')

    return nlp.DatasetInfo(features=nlp.Features(features))
def _info(self):
    """Build the ``nlp.MetricInfo`` describing this metric's inputs.

    ``predictions`` is a sequence of string tokens; ``references`` is a
    sequence of such token sequences (several references per prediction).
    """

    def token_sequence():
        # A fresh feature object per use, as each slot had in the original.
        return nlp.Sequence(nlp.Value("string", id="token"), id="sequence")

    input_features = nlp.Features(
        {
            "predictions": token_sequence(),
            "references": nlp.Sequence(token_sequence(), id="references"),
        }
    )
    code_urls = [
        "https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"
    ]
    background_urls = [
        "https://en.wikipedia.org/wiki/BLEU",
        "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
    ]
    return nlp.MetricInfo(
        description=_DESCRIPTION,
        citation=_CITATION,
        inputs_description=_KWARGS_DESCRIPTION,
        features=input_features,
        codebase_urls=code_urls,
        reference_urls=background_urls,
    )
def _info(self):
    """Build the ``nlp.MetricInfo`` describing this metric's inputs.

    Both ``predictions`` and ``references`` are sequences of string labels.
    """
    project_url = "https://github.com/chakki-works/seqeval"

    def label_sequence():
        # A fresh feature object for each of the two inputs.
        return nlp.Sequence(nlp.Value("string", id="label"), id="sequence")

    input_features = nlp.Features(
        {
            "predictions": label_sequence(),
            "references": label_sequence(),
        }
    )
    return nlp.MetricInfo(
        description=_DESCRIPTION,
        citation=_CITATION,
        homepage=project_url,
        inputs_description=_KWARGS_DESCRIPTION,
        features=input_features,
        codebase_urls=[project_url],
        reference_urls=[project_url],
    )
def _info(self):
    """Build the ``nlp.DatasetInfo`` describing this dataset's schema.

    Each example has a string ``id`` plus two string sequences, ``tokens``
    and ``labels``.
    """
    schema = {"id": nlp.Value("string")}
    for column in ("tokens", "labels"):
        schema[column] = nlp.Sequence(nlp.Value("string"))

    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features(schema),
        # No default (input, target) pair is declared.
        supervised_keys=None,
        homepage="http://noisy-text.github.io/2017/emerging-rare-entities.html",
        citation=_CITATION,
    )
def _info(self):
    """Build the ``nlp.MetricInfo`` describing this metric's inputs.

    ``predictions`` is a sequence of string tokens; ``references`` is a
    sequence of such token sequences.
    """
    project_url = "https://github.com/cnap/gec-ranking"

    def token_sequence():
        # A fresh feature object per use, as each slot had in the original.
        return nlp.Sequence(nlp.Value("string", id="token"), id="sequence")

    input_features = nlp.Features(
        {
            "predictions": token_sequence(),
            "references": nlp.Sequence(token_sequence(), id="references"),
        }
    )
    return nlp.MetricInfo(
        description=_DESCRIPTION,
        citation=_CITATION,
        inputs_description=_KWARGS_DESCRIPTION,
        features=input_features,
        codebase_urls=[project_url],
        reference_urls=[project_url],
    )
def test_nested_features(self):
    """Prepare ``NestedBeamDataset`` with the DirectRunner and check its output.

    Verifies that the arrow file and ``dataset_info.json`` are written to the
    cache directory, that the declared features are the nested schema, and
    that the first and last rows match the generated test examples.
    """
    expected_num_examples = len(get_test_nested_examples())
    last_index = expected_num_examples - 1

    with tempfile.TemporaryDirectory() as tmp_cache_dir:
        # All artefacts for this builder/config/version land in one directory.
        output_dir = os.path.join(
            tmp_cache_dir, "nested_beam_dataset", "default", "0.0.0"
        )
        builder = NestedBeamDataset(
            cache_dir=tmp_cache_dir, beam_runner="DirectRunner"
        )
        builder.download_and_prepare()

        arrow_path = os.path.join(output_dir, "nested_beam_dataset-train.arrow")
        self.assertTrue(os.path.exists(arrow_path))

        expected_features = nlp.Features(
            {"a": nlp.Sequence({"b": nlp.Value("string")})}
        )
        self.assertDictEqual(builder.info.features, expected_features)

        dset = builder.as_dataset()
        self.assertEqual(dset["train"].num_rows, expected_num_examples)
        self.assertEqual(
            dset["train"].info.splits["train"].num_examples,
            expected_num_examples,
        )

        # Each generated example is a pair; index 1 holds the row dict.
        self.assertDictEqual(dset["train"][0], get_test_nested_examples()[0][1])
        self.assertDictEqual(
            dset["train"][last_index],
            get_test_nested_examples()[last_index][1],
        )

        info_path = os.path.join(output_dir, "dataset_info.json")
        self.assertTrue(os.path.exists(info_path))
def _info(self):
    """Build the ``nlp.DatasetInfo`` describing this dataset's schema.

    ``id`` and ``source`` are scalar strings; ``tokens``, ``labels`` and
    ``nested-labels`` are sequences of strings.
    """
    schema = {column: nlp.Value("string") for column in ("id", "source")}
    for column in ("tokens", "labels", "nested-labels"):
        schema[column] = nlp.Sequence(nlp.Value("string"))

    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features(schema),
        # No default (input, target) pair is declared.
        supervised_keys=None,
        homepage="https://sites.google.com/site/germeval2014ner/",
        citation=_CITATION,
    )
def _info(self):
    """Build the ``nlp.DatasetInfo`` for the nested test dataset.

    The single column ``a`` is a sequence of records, each holding one string
    field ``b``.
    """
    inner_record = {"b": nlp.Value("string")}
    nested_schema = nlp.Features({"a": nlp.Sequence(inner_record)})
    return nlp.DatasetInfo(
        features=nested_schema,
        # No default supervised_keys.
        supervised_keys=None,
    )
def _info(self):
    """Build the ``nlp.DatasetInfo`` for the selected configuration.

    The schema starts as one string column per key of
    ``self.config.text_features`` and is then adjusted by config name:
    an ``answers`` column becomes a sequence of
    ``{answer_start, text}`` records, ``PAWS-X*`` configs gain ``label``,
    ``XNLI`` gains ``gold_label``, and ``PAN-X*`` configs replace the whole
    schema with ``words`` / ``ner_tags`` / ``langs`` string sequences.
    """
    config = self.config
    config_name = config.name

    features = {
        column: nlp.Value("string")
        for column in six.iterkeys(config.text_features)
    }

    if "answers" in features:
        features["answers"] = nlp.features.Sequence(
            {
                "answer_start": nlp.Value("int32"),
                "text": nlp.Value("string"),
            }
        )
    if config_name.startswith("PAWS-X"):
        features["label"] = nlp.Value("string")
    if config_name == "XNLI":
        features["gold_label"] = nlp.Value("string")
    if config_name.startswith("PAN-X"):
        # PAN-X discards the text-feature columns entirely.
        features = nlp.Features(
            {
                column: nlp.Sequence(nlp.Value("string"))
                for column in ("words", "ner_tags", "langs")
            }
        )

    return nlp.DatasetInfo(
        # Config-specific description followed by the shared one.
        description=config.description + "\n" + _DESCRIPTION,
        features=nlp.Features(features),
        # No default (input, target) pair is declared.
        supervised_keys=None,
        # Project page and the config's own URL, tab-separated.
        homepage="https://github.com/google-research/xtreme" + "\t" + config.url,
        citation=config.citation + "\n" + _CITATION,
    )
def _info(self):
    """Build the ``nlp.MetricInfo`` describing this metric's inputs.

    ``predictions`` is a single string; ``references`` is a sequence of
    strings.
    """
    project_url = "https://github.com/Tiiiger/bert_score"

    input_features = nlp.Features(
        {
            "predictions": nlp.Value("string", id="sequence"),
            "references": nlp.Sequence(
                nlp.Value("string", id="sequence"), id="references"
            ),
        }
    )
    return nlp.MetricInfo(
        description=_DESCRIPTION,
        citation=_CITATION,
        homepage=project_url,
        inputs_description=_KWARGS_DESCRIPTION,
        features=input_features,
        codebase_urls=[project_url],
        reference_urls=[project_url, "https://arxiv.org/abs/1904.09675"],
    )
def _info(self):
    """Build the ``nlp.DatasetInfo`` for the selected configuration.

    Every config exposes string columns ``id``, ``text`` and ``title``; when
    ``self.config.with_embeddings`` is truthy a ``float32`` sequence column
    ``embeddings`` is appended.
    """
    schema = {
        column: nlp.Value("string") for column in ("id", "text", "title")
    }
    if self.config.with_embeddings:
        schema["embeddings"] = nlp.Sequence(nlp.Value("float32"))

    return nlp.DatasetInfo(
        description=_DESCRIPTION,
        features=nlp.Features(schema),
        # No default (input, target) pair is declared.
        supervised_keys=None,
        homepage="https://github.com/facebookresearch/DPR",
        citation=_CITATION,
    )
def test_benchmark_speed(self):
    """Time writing and reading the same data under three feature layouts.

    The layouts are ``Array2D``, a nested ``Sequence`` of ``Sequence`` and a
    single flattened ``Sequence``. Each layout is written once and then read
    back with every reader; timings are collected in one dict, reported with
    ``warn`` and finally compared against each other.
    """
    readers = (
        read_unformated,
        read_formatted_as_numpy,
        read_batch_unformated,
        read_batch_formatted_as_numpy,
        read_col_unformated,
        read_col_formatted_as_numpy,
    )
    times = {}

    def time_layout(feats, write_func):
        # One write followed by every read, all in a fresh temp directory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            data = generate_examples(
                features=feats, num_examples=SPEED_TEST_N_EXAMPLES
            )
            times[write_func.__name__] = write_func(feats, data, tmp_dir)
            for read_func in readers:
                label = read_func.__name__ + " after " + write_func.__name__
                times[label] = read_func(feats, tmp_dir)

    time_layout(
        nlp.Features({"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")}),
        write_array2d,
    )
    time_layout(
        nlp.Features(
            {
                "image": nlp.Sequence(
                    nlp.Sequence(nlp.Value("float32"), SPEED_TEST_SHAPE[1]),
                    SPEED_TEST_SHAPE[0],
                )
            }
        ),
        write_nested_sequence,
    )
    time_layout(
        nlp.Features(
            {
                "image": nlp.Sequence(
                    nlp.Value("float32"),
                    SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1],
                )
            }
        ),
        write_flattened_sequence,
    )

    benchmark_df = pd.DataFrame.from_dict(
        times, orient="index", columns=["time"]
    ).sort_index()
    warn("Speed benchmark:\n" + str(benchmark_df))

    # Writing Array2D must be at least 10x faster than the nested layout
    # (it is supposed to be ~25 times faster).
    self.assertGreater(
        times["write_nested_sequence"], times["write_array2d"] * 10
    )
    # Batched numpy reads of Array2D must at least be faster
    # (it is supposed to be ~2 times faster).
    self.assertGreater(
        times["read_batch_formatted_as_numpy after write_nested_sequence"],
        times["read_batch_formatted_as_numpy after write_array2d"],
    )
    # At least 5 times faster (it is supposed to be ~10 times faster).
    self.assertGreater(
        times["read_batch_unformated after write_nested_sequence"],
        times["read_batch_formatted_as_numpy after write_array2d"] * 5,
    )
def benchmark_iterating():
    """Time several ways of iterating a generated dataset and save the results.

    A dataset with a float ``list`` column and a float ``numbers`` column is
    generated in a temporary directory. A first set of read functions is
    timed, the dataset is shuffled, and a second (smaller) set is timed
    again. All timings are written as UTF-8 JSON to ``RESULTS_FILE_PATH``.
    """
    # Cases that run both before and after shuffling.
    raw_cases = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
    ]
    numpy_case = [(read_formatted, {"type": "numpy", "length": SMALL_TEST})]
    numpy_batch_cases = [
        (
            read_formatted_batch,
            {"type": "numpy", "length": SMALL_TEST, "batch_size": 10},
        ),
        (
            read_formatted_batch,
            {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000},
        ),
    ]
    # Formats that are only timed on the unshuffled dataset.
    other_format_cases = [
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
    ]

    functions = raw_cases + numpy_case + other_format_cases + numpy_batch_cases
    functions_shuffled = raw_cases + numpy_case + numpy_batch_cases

    def describe(func, kwargs):
        # Result key: function name followed by its argument values.
        return func.__name__ + " " + " ".join(str(v) for v in kwargs.values())

    times = {"num examples": SPEED_TEST_N_EXAMPLES}

    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = nlp.Features(
            {
                "list": nlp.Sequence(nlp.Value("float32")),
                "numbers": nlp.Value("float32"),
            }
        )
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )

        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[describe(func, kwargs)] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()

        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + describe(func, kwargs)] = func(dataset, **kwargs)

    serialized = json.dumps(times).encode("utf-8")
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(serialized)
def _info(self):
    """Return the ``nlp.DatasetInfo`` for the configured gameplay scenario.

    ``self.config.gameplay_scenario`` selects one of two schemas:

    * ``"original"`` -- integer ids, image metadata with Flickr and ``vg_*``
      fields, a ``qas`` sequence of question/answer records, and objects
      whose ``segment`` is a sequence of float sequences.
    * ``"zero_shot"`` -- string target/object ids, image metadata with
      license / ``open_images_id`` / capture date, no ``timestamp`` or
      ``qas``, and objects whose ``segment`` is a sequence of mask records.

    Raises:
        ValueError: if the configured scenario is neither of the above.
            Without this check the method fell through and returned ``None``.
    """
    scenario = self.config.gameplay_scenario

    # Validate first so an unsupported scenario fails with a clear message
    # instead of silently producing no DatasetInfo.
    if scenario not in ("original", "zero_shot"):
        raise ValueError(
            "Unsupported gameplay scenario: {!r} "
            "(expected 'original' or 'zero_shot')".format(scenario)
        )

    if scenario == "original":
        features = {
            "id": nlp.Value("int32"),
            "target_id": nlp.Value("int32"),
            "timestamp": nlp.Value("string"),
            "status": nlp.Value("string"),
            "image": {
                "id": nlp.Value("int32"),
                "file_name": nlp.Value("string"),
                "flickr_url": nlp.Value("string"),
                "coco_url": nlp.Value("string"),
                "height": nlp.Value("int32"),
                "width": nlp.Value("int32"),
                "vg_id": nlp.Value("int32"),
                "vg_url": nlp.Value("string"),
            },
            "qas": nlp.features.Sequence(
                {
                    "question": nlp.Value("string"),
                    "answer": nlp.Value("string"),
                    "id": nlp.Value("int32"),
                }
            ),
            "objects": nlp.features.Sequence(
                {
                    "id": nlp.Value("int32"),
                    "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                    "category": nlp.Value("string"),
                    "area": nlp.Value("float32"),
                    "category_id": nlp.Value("int32"),
                    "segment": nlp.features.Sequence(
                        nlp.features.Sequence(nlp.Value("float32"))
                    ),
                }
            ),
        }
    else:  # scenario == "zero_shot"
        features = {
            "id": nlp.Value("int32"),
            "target_id": nlp.Value("string"),
            "status": nlp.Value("string"),
            "image": {
                "id": nlp.Value("int32"),
                "file_name": nlp.Value("string"),
                "coco_url": nlp.Value("string"),
                "height": nlp.Value("int32"),
                "width": nlp.Value("int32"),
                "license": nlp.Value("int32"),
                "open_images_id": nlp.Value("string"),
                "date_captured": nlp.Value("string"),
            },
            "objects": nlp.features.Sequence(
                {
                    "id": nlp.Value("string"),
                    "bbox": nlp.Sequence(nlp.Value("float32"), length=4),
                    "category": nlp.Value("string"),
                    "area": nlp.Value("float32"),
                    "category_id": nlp.Value("int32"),
                    "IsOccluded": nlp.Value("int32"),
                    "IsTruncated": nlp.Value("int32"),
                    "segment": nlp.features.Sequence(
                        {
                            "MaskPath": nlp.Value("string"),
                            "LabelName": nlp.Value("string"),
                            "BoxID": nlp.Value("string"),
                            "BoxXMin": nlp.Value("string"),
                            "BoxXMax": nlp.Value("string"),
                            "BoxYMin": nlp.Value("string"),
                            "BoxYMax": nlp.Value("string"),
                            "PredictedIoU": nlp.Value("string"),
                            "Clicks": nlp.Value("string"),
                        }
                    ),
                }
            ),
        }

    return nlp.DatasetInfo(
        description=self._DESCRIPTION,
        features=nlp.Features(features),
        # No default (input, target) pair is declared.
        supervised_keys=None,
        homepage="https://compguesswhat.github.io/",
        citation=self._CITATION,
    )
class Cord19Docrel(nlp.GeneratorBasedBuilder):
    """CORD-19 document relation dataset.

    Two configurations are defined:

    * ``docs`` -- one example per line of ``docs.jsonl`` (document text and
      metadata), exposed as a single ``docs`` split.
    * ``relations`` -- document pairs with a list of labels, read from
      ``folds/<k>/train.csv`` and ``folds/<k>/test.csv`` for ``k`` in 1..4.
    """

    BUILDER_CONFIGS = [
        Cord19DocrelConfig(
            name="docs",
            description="document text and meta data",
            features={
                "doi": nlp.Value("string"),
                "cord19_id": nlp.Value("string"),
                "s2_id": nlp.Value("string"),
                "title": nlp.Value("string"),
                "abstract": nlp.Value("string"),
                "arxivId": nlp.Value("string"),
                "venue": nlp.Value("string"),
                "year": nlp.Value("int16"),
                "citations_count": nlp.Value("int32"),
                "references_count": nlp.Value("int32"),
                "authors": nlp.Sequence(nlp.Value('string', id='author_name')),
            },
            data_url=DATA_URL,
        ),
        Cord19DocrelConfig(
            name="relations",
            description=" relation data",
            features={
                DOC_A_COL: nlp.Value("string"),
                DOC_B_COL: nlp.Value("string"),
                LABEL_COL: nlp.Sequence(nlp.Value('string', id='label')),
            },
            data_url=DATA_URL,
        ),
    ]

    def _info(self):
        """Return the ``nlp.DatasetInfo`` built from the active config."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION + self.config.description,
            features=nlp.Features(self.config.features),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare the splits for the active config.

        Args:
            dl_manager: download manager used to fetch and extract
                ``self.config.data_url``.

        Returns:
            A list of ``nlp.SplitGenerator``: a train/test pair per fold for
            relation configs, or a single ``docs`` split for document configs.

        Raises:
            ValueError: if the config name contains neither ``"relations"``
                nor ``"docs"``.
        """
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if "relations" in self.config.name:
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []
            for k in [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, 'folds', str(k))
                generators += [
                    nlp.SplitGenerator(
                        name=get_train_split(k),
                        gen_kwargs={
                            'filepath': os.path.join(folds_path, train_file)
                        },
                    ),
                    nlp.SplitGenerator(
                        name=get_test_split(k),
                        gen_kwargs={
                            'filepath': os.path.join(folds_path, test_file)
                        },
                    ),
                ]
            return generators

        if "docs" in self.config.name:
            docs_file = os.path.join(arch_path, "docs.jsonl")
            return [
                nlp.SplitGenerator(
                    name=nlp.Split('docs'),
                    gen_kwargs={"filepath": docs_file},
                ),
            ]

        raise ValueError(
            "Unknown config name: {!r}".format(self.config.name)
        )

    @staticmethod
    def get_dict_value(d, key, default=None):
        """Return ``d[key]`` when ``key`` is present in ``d``, else ``default``."""
        return d[key] if key in d else default

    def _generate_examples(self, filepath):
        """Generate docs + rel examples.

        Args:
            filepath: path of the CSV file (relation configs) or JSON-lines
                file (document configs) to read.

        Yields:
            ``(key, example)`` pairs whose example dict matches the features
            of the active config.

        Raises:
            ValueError: if the config name contains neither ``"relations"``
                nor ``"docs"``.
        """
        # The same substring tests as in _split_generators, so both methods
        # always agree on which kind of config they are handling.
        if "relations" in self.config.name:
            df = csv.read_csv(filepath).to_pandas()

            for idx, row in df.iterrows():
                yield idx, {
                    DOC_A_COL: row[DOC_A_COL],
                    DOC_B_COL: row[DOC_B_COL],
                    # Labels are stored as one comma-separated string.
                    LABEL_COL: row[LABEL_COL].split(','),
                }

        elif "docs" in self.config.name:
            get = self.get_dict_value
            # Read explicitly as UTF-8 rather than the platform default
            # encoding, so non-ASCII text decodes the same on every system.
            with open(filepath, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    doc = json.loads(line)
                    yield i, {
                        # cast to str otherwise float
                        'doi': str(get(doc, 'doi')),
                        'cord19_id': get(doc, 'cord19_id'),
                        's2_id': get(doc, 's2_id'),
                        'title': get(doc, 'title'),
                        'abstract': get(doc, 'abstract'),
                        'arxivId': get(doc, 'arxivId'),
                        # "or" also maps an explicit null to the fallback.
                        'venue': str(get(doc, 'venue') or ''),
                        'year': int(get(doc, 'year', 0) or 0),
                        'citations_count': int(
                            get(doc, 'citations_count', 0) or 0),
                        'references_count': int(
                            get(doc, 'references_count', 0) or 0),
                        'authors': get(doc, 'authors', []),
                    }

        else:
            raise ValueError(
                "Unknown config name: {!r}".format(self.config.name)
            )
class AclDocrel(nlp.GeneratorBasedBuilder):
    """ACL anthology document relation dataset.

    Two configurations are defined:

    * ``docs`` -- one example per line of ``docs.jsonl`` (document text and
      metadata), exposed as a single ``docs`` split.
    * ``relations`` -- document pairs with a list of labels, read from
      ``folds/<k>/train.csv`` and ``folds/<k>/test.csv`` for ``k`` in 1..4.
    """

    BUILDER_CONFIGS = [
        AclDocrelConfig(
            name="docs",
            description="document text and meta data",
            features={
                "s2_id": nlp.Value("string"),
                "title": nlp.Value("string"),
                "abstract": nlp.Value("string"),
                "arxivId": nlp.Value("string"),
                "doi": nlp.Value("string"),
                "venue": nlp.Value("string"),
                "year": nlp.Value("int16"),
                "citations_count": nlp.Value("int32"),
                "references_count": nlp.Value("int32"),
                "authors": nlp.Sequence(nlp.Value('string', id='author_name')),
            },
            data_url=DATA_URL,
        ),
        AclDocrelConfig(
            name="relations",
            description=" relation data",
            features={
                "from_s2_id": nlp.Value("string"),
                "to_s2_id": nlp.Value("string"),
                "label": nlp.Sequence(nlp.Value('string', id='label')),
            },
            data_url=DATA_URL,
        ),
    ]

    def _info(self):
        """Return the ``nlp.DatasetInfo`` built from the active config."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION + self.config.description,
            features=nlp.Features(self.config.features),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare the splits for the active config.

        Args:
            dl_manager: download manager used to fetch and extract
                ``self.config.data_url``.

        Returns:
            A list of ``nlp.SplitGenerator``: a train/test pair per fold for
            the ``relations`` config, or a single ``docs`` split for ``docs``.

        Raises:
            ValueError: if the config name is neither ``"relations"`` nor
                ``"docs"``.
        """
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if self.config.name == "relations":
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []
            for k in [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, 'folds', str(k))
                generators += [
                    nlp.SplitGenerator(
                        name=get_train_split(k),
                        gen_kwargs={
                            'filepath': os.path.join(folds_path, train_file)
                        },
                    ),
                    nlp.SplitGenerator(
                        name=get_test_split(k),
                        gen_kwargs={
                            'filepath': os.path.join(folds_path, test_file)
                        },
                    ),
                ]
            return generators

        if self.config.name == "docs":
            docs_file = os.path.join(arch_path, "docs.jsonl")
            return [
                nlp.SplitGenerator(
                    name=nlp.Split('docs'),
                    gen_kwargs={"filepath": docs_file},
                ),
            ]

        raise ValueError(
            "Unknown config name: {!r}".format(self.config.name)
        )

    @staticmethod
    def get_s2_value(s2, key, default=None):
        """Return ``s2[key]`` when ``key`` is present in ``s2``, else ``default``."""
        return s2[key] if key in s2 else default

    def _generate_examples(self, filepath):
        """Generate docs + rel examples.

        Args:
            filepath: path of the CSV file (``relations``) or JSON-lines file
                (``docs``) to read.

        Yields:
            ``(key, example)`` pairs whose example dict matches the features
            of the active config.

        Raises:
            ValueError: if the config name is neither ``"relations"`` nor
                ``"docs"`` (previously nothing was yielded in that case).
        """
        if self.config.name == "relations":
            df = csv.read_csv(filepath).to_pandas()

            for idx, row in df.iterrows():
                yield idx, dict(
                    from_s2_id=row['from_s2_id'],
                    to_s2_id=row['to_s2_id'],
                    # Labels are stored as one comma-separated string.
                    label=row['label'].split(','),
                )

        elif self.config.name == "docs":
            get = self.get_s2_value
            # Read explicitly as UTF-8 rather than the platform default
            # encoding, so non-ASCII text decodes the same on every system.
            with open(filepath, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    s2 = json.loads(line)
                    yield i, {
                        's2_id': get(s2, 'paperId'),
                        'title': get(s2, 'title'),
                        'abstract': get(s2, 'abstract'),
                        'doi': get(s2, 'doi'),
                        'arxivId': get(s2, 'arxivId'),
                        'venue': get(s2, 'venue'),
                        'year': get(s2, 'year', 0),
                        # Counts are derived from the length of the lists.
                        'citations_count': len(get(s2, 'citations', [])),
                        'references_count': len(get(s2, 'references', [])),
                        # Author entries without a "name" key are skipped.
                        'authors': [
                            a['name']
                            for a in get(s2, 'authors', [])
                            if 'name' in a
                        ],
                    }

        else:
            raise ValueError(
                "Unknown config name: {!r}".format(self.config.name)
            )