def benchmark_array_xd():
    """Benchmark three encodings of the same 2-D float32 payload.

    Three feature layouts are exercised in turn: an ``Array2D`` column, a
    nested ``Sequence`` of ``Sequence`` and a single flattened ``Sequence``.
    For each layout the generated examples are written into a fresh
    temporary directory with ``write`` and then passed through every reader
    function. The value returned by ``write`` and by each reader is stored
    under a descriptive key, and the collected mapping is finally dumped as
    UTF-8 encoded JSON to ``RESULTS_FILE_PATH``.
    """
    readers = (
        read_unformated,
        read_formatted_as_numpy,
        read_batch_unformated,
        read_batch_formatted_as_numpy,
        read_col_unformated,
        read_col_formatted_as_numpy,
    )
    results = {}

    def run_scenario(write_key, feats, **generation_kwargs):
        # Every scenario works in its own scratch directory, which is
        # removed again as soon as the scenario is finished.
        with tempfile.TemporaryDirectory() as scratch_dir:
            examples = generate_examples(
                features=feats,
                num_examples=SPEED_TEST_N_EXAMPLES,
                **generation_kwargs
            )
            results[write_key] = write(feats, examples, scratch_dir)
            for reader in readers:
                result_key = f"{reader.__name__} after {write_key}"
                results[result_key] = reader(feats, scratch_dir)

    run_scenario(
        "write_array2d",
        datasets.Features(
            {"image": Array2D(SPEED_TEST_SHAPE, dtype="float32")}
        ),
    )

    # The two Sequence layouts deliberately do NOT declare fixed lengths
    # (which could be derived from SPEED_TEST_SHAPE), for a fair comparison;
    # the shapes are only passed to the example generator via seq_shapes.
    run_scenario(
        "write_nested_sequence",
        datasets.Features(
            {
                "image": datasets.Sequence(
                    datasets.Sequence(datasets.Value("float32"))
                )
            }
        ),
        seq_shapes={"image": SPEED_TEST_SHAPE},
    )
    run_scenario(
        "write_flattened_sequence",
        datasets.Features(
            {"image": datasets.Sequence(datasets.Value("float32"))}
        ),
        seq_shapes={"image": [SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1]]},
    )

    with open(RESULTS_FILE_PATH, "wb") as results_file:
        results_file.write(json.dumps(results).encode("utf-8"))
Example #2
0
 def test_extension_indexing(self):
     """Indexing an ``Array2D`` column of a numpy-formatted dataset gives an ndarray.

     A single generated example, carrying an extra ``(3, 3)`` float32
     ``Array2D`` column named ``explicit_ext``, is written to an Arrow file.
     The file is loaded back as a dataset, switched to the ``"numpy"``
     format, and the first row's ``explicit_ext`` value is type-checked.
     """
     with tempfile.TemporaryDirectory() as workdir:
         arrow_path = os.path.join(workdir, "beta.arrow")

         features = DEFAULT_FEATURES.copy()
         features["explicit_ext"] = Array2D((3, 3), dtype="float32")

         with ArrowWriter(features=features, path=arrow_path) as writer:
             for _key, raw_example in generate_examples(features=features, num_examples=1):
                 writer.write(features.encode_example(raw_example))
             # The returned counts are not inspected by this test.
             _num_examples, _num_bytes = writer.finalize()

         loaded = datasets.Dataset.from_file(arrow_path)
         loaded.set_format("numpy")
         cell = loaded[0]["explicit_ext"]
         self.assertIsInstance(cell, np.ndarray, "indexed extension must return numpy.ndarray")
         # Drop the reference before the temporary directory is cleaned up
         # (presumably so the Arrow file is no longer held open -- confirm).
         del loaded
Example #3
0
import pandas as pd
import pytest
from absl.testing import parameterized

import datasets
from datasets.arrow_writer import ArrowWriter
from datasets.features import Array2D, Array3D, Array4D, Array5D, Value, _ArrayXD

# Shapes of the two 2-D array columns declared in DEFAULT_FEATURES.
SHAPE_TEST_1 = (30, 487)
SHAPE_TEST_2 = (36, 1024)

# Array shape and example count; judging by the names these are meant for
# speed tests -- confirm against the code that consumes them.
SPEED_TEST_SHAPE = (100, 100)
SPEED_TEST_N_EXAMPLES = 100

# Default schema: two fixed-shape float32 Array2D columns.
DEFAULT_FEATURES = datasets.Features(
    {
        "text": Array2D(SHAPE_TEST_1, dtype="float32"),
        "image": Array2D(SHAPE_TEST_2, dtype="float32"),
    }
)


def generate_examples(features: dict, num_examples=100, seq_shapes=None):
    dummy_data = []
    seq_shapes = seq_shapes or {}
    for i in range(num_examples):
        example = {}
        for col_id, (k, v) in enumerate(features.items()):
            if isinstance(v, _ArrayXD):
                data = np.random.rand(*v.shape).astype(v.dtype)
            elif isinstance(v, datasets.Value):
                data = "foo"
Example #4
0
 def test_try_incompatible_extension_type(self):
     """String data with an ``Array2D`` ``try_type`` must come out as a plain string array.

     The ``(1, 3)`` int64 ``Array2D`` is only passed as ``try_type``; the
     test expects the resulting Arrow array to have type ``pa.string()``.
     """
     attempted_type = Array2D((1, 3), "int64")
     sequence = TypedSequence(["foo", "bar"], try_type=attempted_type)
     converted = pa.array(sequence)
     self.assertEqual(converted.type, pa.string())
Example #5
0
 def test_try_compatible_extension_type(self):
     """Matching data with an ``Array2D`` ``try_type`` must adopt the extension type.

     One ``1 x 3`` integer matrix is converted with a ``(1, 3)`` int64
     ``Array2D`` as ``try_type``; the test expects the resulting Arrow array
     to have the equivalent ``Array2DExtensionType``.
     """
     attempted_type = Array2D((1, 3), "int64")
     sequence = TypedSequence([[[1, 2, 3]]], try_type=attempted_type)
     converted = pa.array(sequence)
     self.assertEqual(converted.type, Array2DExtensionType((1, 3), "int64"))
Example #6
0
 def test_incompatible_extension_type(self):
     """String data with a mandatory ``Array2D`` ``type`` must fail to convert.

     Unlike ``try_type``, the ``(1, 3)`` int64 ``Array2D`` is passed as
     ``type`` here; the test expects a ``TypeError`` or
     ``pa.lib.ArrowInvalid`` to be raised.
     """
     expected_errors = (TypeError, pa.lib.ArrowInvalid)
     with self.assertRaises(expected_errors):
         # Built inside the context so that an error raised while
         # constructing the sequence is accepted as well.
         sequence = TypedSequence(["foo", "bar"], type=Array2D((1, 3), "int64"))
         pa.array(sequence)
Example #7
0
import datasets
from datasets.arrow_writer import ArrowWriter
from datasets.features import Array2D, Array3D, Array4D, Array5D, Value
from datasets.features.features import Array3DExtensionType, PandasArrayExtensionDtype, _ArrayXD
from datasets.formatting.formatting import NumpyArrowExtractor, SimpleArrowExtractor


# Shapes of the 2-D array columns declared in DEFAULT_FEATURES.
SHAPE_TEST_1 = (30, 487)
SHAPE_TEST_2 = (36, 1024)
# First dimension is None: generate_examples draws it at random for the
# "dynamic" column (presumably None marks a variable-length axis -- confirm).
SHAPE_TEST_3 = (None, 100)
# Array shape and example count; judging by the names these are meant for
# speed tests -- confirm against the code that consumes them.
SPEED_TEST_SHAPE = (100, 100)
SPEED_TEST_N_EXAMPLES = 100

# Default schema: three float32 Array2D columns, one per shape above.
DEFAULT_FEATURES = datasets.Features(
    {
        "text": Array2D(SHAPE_TEST_1, dtype="float32"),
        "image": Array2D(SHAPE_TEST_2, dtype="float32"),
        "dynamic": Array2D(SHAPE_TEST_3, dtype="float32"),
    }
)


def generate_examples(features: dict, num_examples=100, seq_shapes=None):
    dummy_data = []
    seq_shapes = seq_shapes or {}
    for i in range(num_examples):
        example = {}
        for col_id, (k, v) in enumerate(features.items()):
            if isinstance(v, _ArrayXD):
                if k == "dynamic":
                    first_dim = random.randint(1, 3)