def benchmark_array_xd():
    """Benchmark three feature layouts for the same "image" column.

    For each layout (``Array2D``, nested ``Sequence``, flattened ``Sequence``)
    examples are generated, written into a fresh temporary directory with
    ``write`` and then read back with every function in ``read_functions``.
    Whatever those helpers return (presumably timings -- confirm against
    their definitions) is collected in a dict that is finally dumped as
    UTF-8 encoded JSON to ``RESULTS_FILE_PATH``.
    """
    read_functions = (
        read_unformated,
        read_formatted_as_numpy,
        read_batch_unformated,
        read_batch_formatted_as_numpy,
        read_col_unformated,
        read_col_formatted_as_numpy,
    )
    times = {}

    def _measure(write_key, feats, **example_kwargs):
        # Each layout gets its own temporary directory, removed when the
        # ``with`` block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            data = generate_examples(
                features=feats,
                num_examples=SPEED_TEST_N_EXAMPLES,
                **example_kwargs,
            )
            times[write_key] = write(feats, data, tmp_dir)
            for read_func in read_functions:
                times[f"{read_func.__name__} after {write_key}"] = read_func(feats, tmp_dir)

    # Layout 1: dedicated 2D array feature.
    _measure(
        "write_array2d",
        datasets.Features({"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")}),
    )

    # Layout 2: sequence of sequences. The sequences are intentionally declared
    # without a fixed length (for a fair comparison); the actual shape is only
    # passed to the example generator.
    _measure(
        "write_nested_sequence",
        datasets.Features({"image": datasets.Sequence(datasets.Sequence(datasets.Value("float32")))}),
        seq_shapes={"image": SPEED_TEST_SHAPE},
    )

    # Layout 3: one flat sequence holding rows * cols values, again without a
    # fixed length on the feature (for a fair comparison).
    _measure(
        "write_flattened_sequence",
        datasets.Features({"image": datasets.Sequence(datasets.Value("float32"))}),
        seq_shapes={"image": [SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1]]},
    )

    with open(RESULTS_FILE_PATH, "wb") as results_file:
        encoded = json.dumps(times).encode("utf-8")
        results_file.write(encoded)
def test_extension_indexing(self):
    """Indexing an Array2D column under the "numpy" format yields an ndarray.

    A single generated example (default features plus an extra 3x3 float32
    ``Array2D`` column) is written to an Arrow file, loaded back with
    ``Dataset.from_file`` and the first row's extension column is checked.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        arrow_path = os.path.join(tmp_dir, "beta.arrow")

        my_features = DEFAULT_FEATURES.copy()
        my_features["explicit_ext"] = Array2D((3, 3), dtype="float32")

        with ArrowWriter(features=my_features, path=arrow_path) as writer:
            examples = generate_examples(features=my_features, num_examples=1)
            for _key, record in examples:
                writer.write(my_features.encode_example(record))
            _num_examples, _num_bytes = writer.finalize()

        dataset = datasets.Dataset.from_file(arrow_path)
        dataset.set_format("numpy")
        first_row = dataset[0]
        data = first_row["explicit_ext"]
        self.assertIsInstance(data, np.ndarray, "indexed extension must return numpy.ndarray")
        # Drop the dataset reference before the temporary directory is removed
        # (presumably so the backing file is released first -- confirm).
        del dataset
import pandas as pd import pytest from absl.testing import parameterized import datasets from datasets.arrow_writer import ArrowWriter from datasets.features import Array2D, Array3D, Array4D, Array5D, Value, _ArrayXD SHAPE_TEST_1 = (30, 487) SHAPE_TEST_2 = (36, 1024) SPEED_TEST_SHAPE = (100, 100) SPEED_TEST_N_EXAMPLES = 100 DEFAULT_FEATURES = datasets.Features({ "text": Array2D(SHAPE_TEST_1, dtype="float32"), "image": Array2D(SHAPE_TEST_2, dtype="float32") }) def generate_examples(features: dict, num_examples=100, seq_shapes=None): dummy_data = [] seq_shapes = seq_shapes or {} for i in range(num_examples): example = {} for col_id, (k, v) in enumerate(features.items()): if isinstance(v, _ArrayXD): data = np.random.rand(*v.shape).astype(v.dtype) elif isinstance(v, datasets.Value): data = "foo"
def test_try_incompatible_extension_type(self):
    """String data with an Array2D ``try_type`` must end up typed as ``pa.string()``."""
    attempted_type = Array2D((1, 3), "int64")
    typed_seq = TypedSequence(["foo", "bar"], try_type=attempted_type)
    arr = pa.array(typed_seq)
    self.assertEqual(arr.type, pa.string())
def test_try_compatible_extension_type(self):
    """Data matching the Array2D ``try_type`` must produce the extension type."""
    attempted_type = Array2D((1, 3), "int64")
    typed_seq = TypedSequence([[[1, 2, 3]]], try_type=attempted_type)
    arr = pa.array(typed_seq)
    expected_type = Array2DExtensionType((1, 3), "int64")
    self.assertEqual(arr.type, expected_type)
def test_incompatible_extension_type(self):
    """String data with a mandatory Array2D ``type`` must raise.

    Either ``TypeError`` or ``pa.lib.ArrowInvalid`` is accepted.
    """
    accepted_errors = (TypeError, pa.lib.ArrowInvalid)
    with self.assertRaises(accepted_errors):
        # Both the sequence construction and the conversion stay inside the
        # context manager so an error from either step is caught.
        forced_type = Array2D((1, 3), "int64")
        typed_seq = TypedSequence(["foo", "bar"], type=forced_type)
        pa.array(typed_seq)
import datasets from datasets.arrow_writer import ArrowWriter from datasets.features import Array2D, Array3D, Array4D, Array5D, Value from datasets.features.features import Array3DExtensionType, PandasArrayExtensionDtype, _ArrayXD from datasets.formatting.formatting import NumpyArrowExtractor, SimpleArrowExtractor SHAPE_TEST_1 = (30, 487) SHAPE_TEST_2 = (36, 1024) SHAPE_TEST_3 = (None, 100) SPEED_TEST_SHAPE = (100, 100) SPEED_TEST_N_EXAMPLES = 100 DEFAULT_FEATURES = datasets.Features( { "text": Array2D(SHAPE_TEST_1, dtype="float32"), "image": Array2D(SHAPE_TEST_2, dtype="float32"), "dynamic": Array2D(SHAPE_TEST_3, dtype="float32"), } ) def generate_examples(features: dict, num_examples=100, seq_shapes=None): dummy_data = [] seq_shapes = seq_shapes or {} for i in range(num_examples): example = {} for col_id, (k, v) in enumerate(features.items()): if isinstance(v, _ArrayXD): if k == "dynamic": first_dim = random.randint(1, 3)