def test_remove_and_map_on_task_template(self):
    """Removing the label column and then mapping must leave no task templates on the dataset info."""
    features = Features(
        {
            "text": Value("string"),
            "label": ClassLabel(names=("pos", "neg")),
        }
    )
    info = DatasetInfo(
        features=features,
        task_templates=TextClassification(text_column="text", label_column="label"),
    )
    dataset = Dataset.from_dict({"text": ["A sentence."], "label": ["pos"]}, info=info)

    def process(example):
        # Identity transform: only the bookkeeping done by `map` is under test.
        return example

    # Drop the column the template refers to, then run a no-op map over the result.
    mapped_dataset = dataset.remove_columns("label").map(process)

    assert mapped_dataset.info.task_templates == []
def test_from_dict(self):
    """Build an `ImageClassification` template from a dict and check its task name and schemas."""
    expected_input_schema = Features({"image_file_path": Value("string")})
    expected_label_schema = Features({"labels": ClassLabel(names=tuple(self.labels))})

    task = ImageClassification.from_dict(
        {
            "image_file_path_column": "input_image_file_path",
            "label_column": "input_label",
            "labels": self.labels,
        }
    )

    self.assertEqual("image-classification", task.task)
    self.assertEqual(expected_input_schema, task.input_schema)
    self.assertEqual(expected_label_schema, task.label_schema)
def test_from_dict(self):
    """Build a `TextClassification` template from a dict and check its task name and schemas."""
    expected_input_schema = Features({"text": Value("string")})
    # Labels are cast to tuple during `TextClassification.__post_init__`, so we do the same here
    expected_label_schema = Features({"labels": ClassLabel(names=tuple(self.labels))})

    task = TextClassification.from_dict(
        {
            "text_column": "input_text",
            "label_column": "input_labels",
            "labels": self.labels,
        }
    )

    self.assertEqual("text-classification", task.task)
    self.assertEqual(expected_input_schema, task.input_schema)
    self.assertEqual(expected_label_schema, task.label_schema)
def test_interleave_datasets_with_features(dataset: IterableDataset, generate_examples_fn):
    """Interleave an untyped dataset with a typed one and check the typed source keeps its features."""
    features = Features({"id": Value("int64"), "label": ClassLabel(names=["negative", "positive"])})
    dataset_with_features = IterableDataset(
        ExamplesIterable(generate_examples_fn, {"label": 0}),
        info=DatasetInfo(features=features),
    )

    # Probabilities [0, 1] are passed for [dataset, dataset_with_features].
    merged_dataset = interleave_datasets([dataset, dataset_with_features], probabilities=[0, 1])

    merged_iterable = merged_dataset._ex_iterable
    assert isinstance(merged_iterable, CyclingMultiSourcesExamplesIterable)
    typed_source = merged_iterable.ex_iterables[1]
    assert isinstance(typed_source, TypedExamplesIterable)
    assert typed_source.features == features
    assert next(iter(merged_dataset)) == next(iter(dataset_with_features))
def test_encode_batch_with_example_with_empty_first_elem():
    """Encode a doubly nested `ClassLabel` batch where one example starts with an empty inner list."""
    nested_label = Sequence(Sequence(ClassLabel(names=["a", "b"])))
    features = Features({"x": nested_label})

    batch = {"x": [[["a"], ["b"]], [[], ["b"]]]}
    expected = {"x": [[[0], [1]], [[], [1]]]}

    encoded_batch = features.encode_batch(batch)
    assert encoded_batch == expected
def test_write_with_features():
    """Write two labelled examples with explicit features and check the schema round-trips through Arrow."""
    sink = pa.BufferOutputStream()
    features = Features({"labels": ClassLabel(names=["neg", "pos"])})

    with ArrowWriter(stream=sink, features=features) as writer:
        for label in (0, 1):
            writer.write({"labels": label})
        num_examples, num_bytes = writer.finalize()

    # Writer-side checks: counts and the schema the writer settled on.
    assert num_examples == 2
    assert num_bytes > 0
    assert writer._schema == features.arrow_schema
    assert writer._schema.metadata == features.arrow_schema.metadata

    # Reader-side checks: re-open the written buffer as an IPC stream.
    reader = pa.ipc.open_stream(pa.BufferReader(sink.getvalue()))
    pa_table: pa.Table = reader.read_all()
    schema = pa_table.schema
    assert pa_table.num_rows == 2
    assert schema == features.arrow_schema
    assert schema.metadata == features.arrow_schema.metadata
    assert features == Features.from_arrow_schema(schema)
def __init__(self):
    """Set up a two-class task with a string `text` input and a negative/positive `label` output."""
    input_schema = Schema(
        features=OrderedDict([("text", Value(dtype="string"))]),
        grounding_candidates={"text": {"text", "sentence"}},
    )
    output_schema = Schema(
        features=OrderedDict([("label", ClassLabel(names=["negative", "positive"]))]),
        grounding_candidates={"label": {"label"}},
    )
    super(BinarySentiment, self).__init__(
        num_classes=2,
        input_schema=input_schema,
        output_schema=output_schema,
        # The concrete class name doubles as the task identifier.
        identifier=self.__class__.__name__,
    )
def test_classlabel_init(tmp_path_factory):
    """Exercise the accepted and rejected argument combinations of `ClassLabel`."""
    names = ["negative", "positive"]
    names_path = tmp_path_factory.mktemp("features") / "labels.txt"
    names_file = str(names_path)
    with open(names_file, "w", encoding="utf-8") as f:
        f.write("\n".join(names))

    # Each of these combinations must yield the explicit names.
    named_kwargs = (
        {"names": names},
        {"names_file": names_file},
        {"num_classes": len(names), "names": names},
    )
    for kwargs in named_kwargs:
        classlabel = ClassLabel(**kwargs)
        assert classlabel.names == names and classlabel.num_classes == len(names)

    # With only a class count, the names are the stringified indices.
    classlabel = ClassLabel(num_classes=len(names))
    assert classlabel.names == [str(i) for i in range(len(names))] and classlabel.num_classes == len(names)

    # Inconsistent, conflicting or missing arguments are rejected.
    rejected_kwargs = (
        {"num_classes": len(names) + 1, "names": names},
        {"names": names, "names_file": names_file},
        {},
    )
    for kwargs in rejected_kwargs:
        with pytest.raises(ValueError):
            classlabel = ClassLabel(**kwargs)
def test_features(self):
    """Load a CSV with several explicit feature types and check the dataset reports them.

    The same feature type is applied to every column; the loaded dataset must
    have `n_rows` rows and expose exactly the requested features.
    """
    n_rows = 10
    n_cols = 3

    def get_features(feature_type):
        # One column per index, all sharing the same feature type.
        return Features({str(i): feature_type for i in range(n_cols)})

    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "table.csv")
        row = ",".join(str(i) for i in range(n_cols))
        # n_rows + 1 identical lines are written while n_rows are expected back;
        # presumably the first line is consumed as the header.
        # The context manager guarantees the file is flushed and closed before it is read.
        with open(csv_path, "w", encoding="utf-8") as csv_file:
            csv_file.write("\n".join([row] * (n_rows + 1)))

        for feature_type in [Value("float64"), Value("int8"), ClassLabel(num_classes=n_cols)]:
            features = get_features(feature_type)
            ds = load_dataset(
                "csv",
                data_files=csv_path,
                cache_dir=tmp_dir,
                split="train",
                features=features,
            )
            self.assertEqual(len(ds), n_rows)
            self.assertDictEqual(ds.features, features)
            # Drop the reference before the next iteration / temp-dir cleanup.
            del ds
def dataset():
    """Build a small in-memory dataset with token, label and answer columns.

    Returns:
        A `Dataset` of `n` identical rows: five "foo" tokens, five labels set
        to 1, and one answer with `answer_start` 97 and `text` "1976".
    """
    n = 10
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
            "answers": Sequence(
                {
                    "text": Value("string"),
                    "answer_start": Value("int32"),
                }
            ),
        }
    )
    dataset = Dataset.from_dict(
        {
            "tokens": [["foo"] * 5] * n,
            "labels": [[1] * 5] * n,
            # Sized with `n` like the other columns so all columns stay the same length.
            "answers": [{"answer_start": [97], "text": ["1976"]}] * n,
        },
        features=features,
    )
    return dataset
def __init__(self):
    """Set up a two-class task with string `premise`/`hypothesis` inputs and a `label` output."""
    input_schema = Schema(
        features=OrderedDict(
            [
                ("premise", Value(dtype="string")),
                ("hypothesis", Value(dtype="string")),
            ]
        ),
        grounding_candidates={
            "premise": {"premise", "sentence1"},
            "hypothesis": {"hypothesis", "sentence2"},
        },
    )
    output_schema = Schema(
        features=OrderedDict([("label", ClassLabel(names=["entailment", "non entailment"]))]),
        grounding_candidates={"label": {"label"}},
    )
    super(BinaryNaturalLanguageInference, self).__init__(
        num_classes=2,
        input_schema=input_schema,
        output_schema=output_schema,
        # The concrete class name doubles as the task identifier.
        identifier=self.__class__.__name__,
    )
@pytest.mark.parametrize( "features", [ None, Features( { "id": Value("int64"), "label": Value("int64"), } ), Features( { "id": Value("int64"), "label": ClassLabel(names=["negative", "positive"]), } ), ], ) def test_iterable_dataset_features(generate_examples_fn, features): ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0}) dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features)) if features: expected = [features.encode_example(x) for _, x in ex_iterable] else: expected = [x for _, x in ex_iterable] assert list(dataset) == expected @require_torch
features = Features({ "x": Sequence(Sequence(ClassLabel(names=["a", "b"]))), }) encoded_batch = features.encode_batch( {"x": [ [["a"], ["b"]], [[], ["b"]], ]}) assert encoded_batch == {"x": [[[0], [1]], [[], [1]]]} @pytest.mark.parametrize( "feature", [ Value("int32"), ClassLabel(num_classes=2), Translation(languages=["en", "fr"]), TranslationVariableLanguages(languages=["en", "fr"]), ], ) def test_dataset_feature_with_none(feature): data = {"col": [None]} features = Features({"col": feature}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"col"} assert item["col"] is None batch = dset[:1] assert len(batch) == 1 assert batch.keys() == {"col"} assert isinstance(batch["col"], list) and all(item is None
def test_align_with_features(self):
    """Aligning the template with dataset features fills in the concrete label names."""
    template = TextClassification(text_column="input_text", label_column="input_label")
    # Before alignment the schema entry is compared against the bare `ClassLabel` class.
    self.assertEqual(template.label_schema["labels"], ClassLabel)

    dataset_features = Features({"input_label": ClassLabel(names=self.labels)})
    aligned = template.align_with_features(dataset_features)
    self.assertEqual(aligned.label_schema["labels"], ClassLabel(names=self.labels))
def dataset():
    """Return a 10-row dataset with five "foo" tokens and five labels set to 1 per row."""
    features = Features(
        {
            "tokens": Sequence(Value("string")),
            "labels": Sequence(ClassLabel(names=["negative", "positive"])),
        }
    )
    columns = {
        "tokens": [["foo"] * 5] * 10,
        "labels": [[1] * 5] * 10,
    }
    return Dataset.from_dict(columns, features=features)
def test_reorder_fields_as(self):
    """Check that `reorder_fields_as` rearranges nested fields to follow another `Features` layout.

    `other` describes the same data as `features`, but uses `[]` instead of
    sequences in places and a shuffled field order. The reordered result must
    equal `expected`, share its Arrow type with both `other` and `expected`,
    and differ in type from the original `features`.
    """
    # --- original layout -------------------------------------------------
    source_document = {
        "title": Value("string"),
        "url": Value("string"),
        "html": Value("string"),
        "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
    }
    source_annotations = Sequence(
        {
            "id": Value("string"),
            "long_answer": {
                "start_token": Value("int64"),
                "end_token": Value("int64"),
                "start_byte": Value("int64"),
                "end_byte": Value("int64"),
            },
            "short_answers": Sequence(
                {
                    "start_token": Value("int64"),
                    "end_token": Value("int64"),
                    "start_byte": Value("int64"),
                    "end_byte": Value("int64"),
                    "text": Value("string"),
                }
            ),
            "yes_no_answer": ClassLabel(names=["NO", "YES"]),
        }
    )
    features = Features(
        {
            "id": Value("string"),
            "document": source_document,
            "question": {
                "text": Value("string"),
                "tokens": Sequence(Value("string")),
            },
            "annotations": source_annotations,
        }
    )

    # --- target layout: same but with [] instead of sequences, and with a shuffled fields order
    other_document = {
        "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
        "title": Value("string"),
        "url": Value("string"),
        "html": Value("string"),
    }
    other_annotations = {
        "yes_no_answer": [ClassLabel(names=["NO", "YES"])],
        "id": [Value("string")],
        "long_answer": [
            {
                "end_byte": Value("int64"),
                "start_token": Value("int64"),
                "end_token": Value("int64"),
                "start_byte": Value("int64"),
            }
        ],
        "short_answers": [
            Sequence(
                {
                    "text": Value("string"),
                    "start_token": Value("int64"),
                    "end_token": Value("int64"),
                    "start_byte": Value("int64"),
                    "end_byte": Value("int64"),
                }
            )
        ],
    }
    other = Features(
        {
            "id": Value("string"),
            "document": other_document,
            "question": {
                "text": Value("string"),
                "tokens": [Value("string")],
            },
            "annotations": other_annotations,
        }
    )

    # --- expected result: original feature types in the target field order
    expected_document = {
        "tokens": Sequence({"token": Value("string"), "is_html": Value("bool")}),
        "title": Value("string"),
        "url": Value("string"),
        "html": Value("string"),
    }
    expected_annotations = Sequence(
        {
            "yes_no_answer": ClassLabel(names=["NO", "YES"]),
            "id": Value("string"),
            "long_answer": {
                "end_byte": Value("int64"),
                "start_token": Value("int64"),
                "end_token": Value("int64"),
                "start_byte": Value("int64"),
            },
            "short_answers": Sequence(
                {
                    "text": Value("string"),
                    "start_token": Value("int64"),
                    "end_token": Value("int64"),
                    "start_byte": Value("int64"),
                    "end_byte": Value("int64"),
                }
            ),
        }
    )
    expected = Features(
        {
            "id": Value("string"),
            "document": expected_document,
            "question": {
                "text": Value("string"),
                "tokens": Sequence(Value("string")),
            },
            "annotations": expected_annotations,
        }
    )

    reordered_features = features.reorder_fields_as(other)
    reordered_type = reordered_features.type

    self.assertDictEqual(reordered_features, expected)
    self.assertEqual(reordered_type, other.type)
    self.assertEqual(reordered_type, expected.type)
    self.assertNotEqual(reordered_type, features.type)
# pre_trained_model_name = 'roberta-base'
build_message = "Build pre-trained model {}".format(pre_trained_model_name)
logger.critical(build_message)
base_pre_trained_model_path = '/home/ubuntu/likun/nlp_pretrained/{}'.format(
    pre_trained_model_name)
# trained_model_path = '/home/ubuntu/likun/nlp_save_kernels/zero-shot-metric-learning-benchmark-topic-small'
# tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
tokenizer = BertTokenizer.from_pretrained(base_pre_trained_model_path)

from datasets.features import ClassLabel, Features

# Yahoo topic data: label names come from classes.txt, splits from the CSV files below.
yahoo_zsl_path = '/home/ubuntu/likun/nlp_data/zsl/BenchmarkingZeroShot/topic_yahoo'
label_names_file = os.path.join(yahoo_zsl_path, 'classes.txt')
fea = Features({
    "text": datasets.Value("string"),
    "label": ClassLabel(names_file=label_names_file),
})

download_config = datasets.DownloadConfig()
download_config.max_retries = 20

split_files = {
    'train': os.path.join(yahoo_zsl_path, 'train_half_v0.csv'),
    'test': os.path.join(yahoo_zsl_path, 'test.csv'),
}
dataset = datasets.load_dataset(
    'csv',
    data_files=split_files,
    features=fea,
    download_config=download_config,
    ignore_verifications=True,
)