def test_array_xd_with_none():
    """Check NumPy extraction of an ``Array2D`` column that contains a ``None`` row.

    Two layouts are exercised:

    * Fixed shape ``(2, 2)``: the extracted column is expected to be a single
      ``float64`` array of shape ``(3, 2, 2)`` whose missing row is all NaN.
    * Dynamic shape ``(None, 2)``: the extracted column is expected to be a
      1-D object array of length 3 whose missing row is a single NaN.
    """
    # Fixed shape
    features = datasets.Features({"foo": datasets.Array2D(dtype="int32", shape=(2, 2))})
    dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
    dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
    arr = NumpyArrowExtractor().extract_column(dataset._data)
    assert isinstance(arr, np.ndarray)
    assert arr.dtype == np.float64
    assert arr.shape == (3, 2, 2)
    assert np.allclose(arr[0], dummy_array)
    assert np.allclose(arr[2], dummy_array)
    assert np.all(np.isnan(arr[1]))  # broadcasted np.nan - use np.all

    # Dynamic shape
    features = datasets.Features({"foo": datasets.Array2D(dtype="int32", shape=(None, 2))})
    dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
    dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
    arr = NumpyArrowExtractor().extract_column(dataset._data)
    assert isinstance(arr, np.ndarray)
    # The ``np.object`` alias was removed in NumPy 1.24; the builtin ``object``
    # compares equal to the object dtype on every NumPy version.
    assert arr.dtype == object
    assert arr.shape == (3,)
    np.testing.assert_equal(arr[0], dummy_array)
    np.testing.assert_equal(arr[2], dummy_array)
    assert np.isnan(arr[1])  # a single np.nan value - np.all not needed
def test_table_to_pandas(dtype, dummy_value):
    """Convert a one-row ``Array2D`` table to pandas and compare the values.

    The ``foo`` column must come back with a ``PandasArrayExtensionDtype`` and
    its NumPy view must equal a ``(1, 2, 2)`` array filled with ``dummy_value``.
    """
    # A single 2x2 sample in which every cell holds ``dummy_value``.
    sample = [[dummy_value] * 2] * 2
    schema = datasets.Features({"foo": datasets.Array2D(dtype=dtype, shape=(2, 2))})
    ds = datasets.Dataset.from_dict({"foo": [sample]}, features=schema)

    frame = ds._data.to_pandas()
    assert type(frame.foo.dtype) == PandasArrayExtensionDtype

    expected = np.array([sample], dtype=np.dtype(dtype))
    actual = frame.foo.to_numpy()
    np.testing.assert_equal(actual, expected)
def _info(self):
    """Return the ``DatasetInfo`` holding this dataset's feature schema.

    The schema has string identifiers, a sequence of RGB images, a sequence of
    depth entries stored as strings, one calibration record per camera, a
    sequence of ground-truth head poses and a head-template string.
    """

    def _matrix_3x3():
        # Fresh 3x3 float64 matrix feature for every use.
        return datasets.Array2D(shape=(3, 3), dtype="float64")

    def _vector_3():
        # Fresh fixed-length (3) float64 sequence feature for every use.
        return datasets.Sequence(datasets.Value("float64"), length=3)

    def _calibration():
        # Key spelling ("intrisic_mat") is part of the stored schema; keep it.
        return {
            "intrisic_mat": _matrix_3x3(),
            "extrinsic_mat": {
                "rotation": _matrix_3x3(),
                "translation": _vector_3(),
            },
        }

    schema = datasets.Features(
        {
            "sequence_number": datasets.Value("string"),
            "subject_id": datasets.Value("string"),
            "rgb": datasets.Sequence(datasets.Image()),
            "rgb_cal": _calibration(),
            "depth": datasets.Sequence(datasets.Value("string")),
            "depth_cal": _calibration(),
            "head_pose_gt": datasets.Sequence(
                {
                    "center": _vector_3(),
                    "rotation": _matrix_3x3(),
                }
            ),
            "head_template": datasets.Value("string"),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=schema,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def test_array_xd_numpy_arrow_extractor(dtype, dummy_value):
    """Extract a one-row ``Array2D`` column with ``NumpyArrowExtractor``.

    The result must be a NumPy array equal to a ``(1, 2, 2)`` array of
    ``dummy_value`` built with the requested ``dtype``.
    """
    # A single 2x2 sample in which every cell holds ``dummy_value``.
    sample = [[dummy_value] * 2] * 2
    schema = datasets.Features({"foo": datasets.Array2D(dtype=dtype, shape=(2, 2))})
    ds = datasets.Dataset.from_dict({"foo": [sample]}, features=schema)

    extracted = NumpyArrowExtractor().extract_column(ds._data)
    assert isinstance(extracted, np.ndarray)

    expected = np.array([sample], dtype=np.dtype(dtype))
    np.testing.assert_equal(extracted, expected)
def _info(self):
    """Return the ``DatasetInfo``: 28x28 ``uint8`` images with ten digit labels."""
    # Class names are the decimal digits "0" through "9", in order.
    digit_names = [str(digit) for digit in range(10)]
    schema = datasets.Features(
        {
            "image": datasets.Array2D(shape=(28, 28), dtype="uint8"),
            "label": datasets.features.ClassLabel(names=digit_names),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=schema,
        supervised_keys=("image", "label"),
        homepage="http://yann.lecun.com/exdb/mnist/",
        citation=_CITATION,
    )
def _info(self):
    """Return the ``DatasetInfo``: 28x28 ``uint8`` images with ten clothing labels."""
    # Label order defines the integer class ids, so it must not change.
    class_names = [
        "T - shirt / top",
        "Trouser",
        "Pullover",
        "Dress",
        "Coat",
        "Sandal",
        "Shirt",
        "Sneaker",
        "Bag",
        "Ankle boot",
    ]
    schema = datasets.Features(
        {
            "image": datasets.Array2D(shape=(28, 28), dtype="uint8"),
            "label": datasets.features.ClassLabel(names=class_names),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=schema,
        supervised_keys=("image", "label"),
        homepage="https://github.com/zalandoresearch/fashion-mnist",
        citation=_CITATION,
    )
USAGE: ``python extracting_data.py -i <img_dir> -o <dataset_file>.datasets <batch_size>`` """ TEST = False CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned") DEFAULT_SCHEMA = datasets.Features( OrderedDict({ "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), "normalized_boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), "img_id": datasets.Value("string"), "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"), "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")), "preds_per_image":
def _info(self):
    """Return the ``DatasetInfo`` for the configuration named by ``self.config.name``.

    * ``"raw"``: metadata columns plus a drawing of float32 ``x``/``y`` and
      int32 ``t`` sequences.
    * ``"preprocessed_simplified_drawings"``: the same metadata columns plus a
      drawing of uint8 ``x``/``y`` sequences.
    * ``"preprocessed_bitmaps"``: an image and a class label; this is the only
      configuration that gets an ``ImageClassification`` task template.
    * any other name (sketch_rnn, sketch_rnn_full): a class label and a
      variable-length ``(None, 3)`` int16 drawing array.
    """
    config_name = self.config.name

    def _metadata_columns():
        # Columns shared by the two stroke-based configs, in schema order.
        return {
            "key_id": datasets.Value("string"),
            "word": datasets.ClassLabel(names=_NAMES),
            "recognized": datasets.Value("bool"),
            "timestamp": datasets.Value("timestamp[us, tz=UTC]"),
            "countrycode": datasets.Value("string"),
        }

    if config_name == "raw":
        columns = _metadata_columns()
        columns["drawing"] = datasets.Sequence(
            {
                "x": datasets.Sequence(datasets.Value("float32")),
                "y": datasets.Sequence(datasets.Value("float32")),
                "t": datasets.Sequence(datasets.Value("int32")),
            }
        )
    elif config_name == "preprocessed_simplified_drawings":
        columns = _metadata_columns()
        columns["drawing"] = datasets.Sequence(
            {
                "x": datasets.Sequence(datasets.Value("uint8")),
                "y": datasets.Sequence(datasets.Value("uint8")),
            }
        )
    elif config_name == "preprocessed_bitmaps":
        columns = {
            "image": datasets.Image(),
            "label": datasets.ClassLabel(names=_NAMES),
        }
    else:  # sketch_rnn, sketch_rnn_full
        columns = {
            "word": datasets.ClassLabel(names=_NAMES),
            "drawing": datasets.Array2D(shape=(None, 3), dtype="int16"),
        }

    features = datasets.Features(columns)

    # Only the bitmap configuration exposes image/label columns for the template.
    if config_name == "preprocessed_bitmaps":
        task_templates = [ImageClassification(image_column="image", label_column="label")]
    else:
        task_templates = None

    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
        task_templates=task_templates,
    )
""" USAGE: ``python extracting_data.py -i <img_dir> -o <dataset_file>.datasets <batch_size>`` """ TEST = False CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned") DEFAULT_SCHEMA = datasets.Features( OrderedDict( { "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), "img_id": datasets.Value("int32"), "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"), "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")), "preds_per_image": datasets.Value(dtype="int32"), } ) ) class Extract: def __init__(self, argv=sys.argv[1:]): inputdir = None outputfile = None
# Tail of a dynamic-shape Array2D check; ``features`` is defined before this excerpt.
dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
arr = NumpyArrowExtractor().extract_column(dataset._data)
# The ``np.object`` alias was removed in NumPy 1.24; the builtin ``object``
# compares equal to the object dtype on every NumPy version.
assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,)
np.testing.assert_equal(arr[0], dummy_array)
np.testing.assert_equal(arr[2], dummy_array)
assert np.isnan(arr[1])  # a single np.nan value - np.all not needed


@pytest.mark.parametrize(
    "data, feature, expected",
    [
        (np.zeros((2, 2)), None, [[0.0, 0.0], [0.0, 0.0]]),
        (
            np.zeros((2, 3)),
            datasets.Array2D(shape=(2, 3), dtype="float32"),
            [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
        ),
        ([np.zeros(2)], datasets.Array2D(shape=(1, 2), dtype="float32"), [[0.0, 0.0]]),
        (
            [np.zeros((2, 3))],
            datasets.Array3D(shape=(1, 2, 3), dtype="float32"),
            [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]],
        ),
    ],
)
def test_array_xd_with_np(data, feature, expected):
    """Build a one-row dataset from NumPy input and compare the row read back.

    Args:
        data: NumPy array, or list of NumPy arrays, stored in column ``col``.
        feature: Feature type for ``col``; ``None`` means no explicit
            ``Features`` are passed to ``Dataset.from_dict``.
        expected: Nested Python lists that ``ds[0]["col"]`` must equal.
    """
    # Explicit None check: only the "no feature" case should skip the schema.
    features = datasets.Features({"col": feature}) if feature is not None else None
    ds = datasets.Dataset.from_dict({"col": [data]}, features=features)
    assert ds[0]["col"] == expected