def test_write_mnist(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        train_image_file = os.path.join(temp_dir, "train-images")
        train_label_file = os.path.join(temp_dir, "train-labels")
        output_path = os.path.join(temp_dir, "output_dataset")
        # Build a tiny 20-sample MNIST-style dataset of 4x4 uint8 images.
        images = np.array([[i] * 16 for i in range(20)]).reshape(
            (20, 4, 4)).astype(np.uint8)
        labels = np.array(list(range(20))).reshape((20,)).astype(np.uint8)
        _images_to_mnist_file(images, train_image_file)
        _labels_to_mnist_file(labels, train_label_file)

        write_mnist(image_file=train_image_file,
                    label_file=train_label_file,
                    output_path=output_path)
        data, schema = ParquetDataset._read_as_dict_rdd(output_path)
        data = data.sortBy(lambda x: x['label']).collect()
        images_load = np.reshape(np.stack([d['image'] for d in data]),
                                 (-1, 4, 4))
        labels_load = np.stack([d['label'] for d in data])
        assert np.all(images_load == images)
        assert np.all(labels_load == labels)
    finally:
        shutil.rmtree(temp_dir)
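# test_write_mnist relies on two helpers defined elsewhere in this suite,
# _images_to_mnist_file and _labels_to_mnist_file. Below is a minimal sketch
# of what they are assumed to produce: files in the standard MNIST IDX format
# (big-endian magic number and dimension sizes, followed by raw uint8 data).
# The _sketch_* names are hypothetical and for illustration only.
def _sketch_images_to_mnist_file(images, path):
    # images: uint8 ndarray of shape (n, rows, cols)
    import struct
    n, rows, cols = images.shape
    with open(path, "wb") as f:
        f.write(struct.pack(">IIII", 2051, n, rows, cols))  # 2051 == 0x00000803
        f.write(images.tobytes())


def _sketch_labels_to_mnist_file(labels, path):
    # labels: uint8 ndarray of shape (n,)
    import struct
    with open(path, "wb") as f:
        f.write(struct.pack(">II", 2049, labels.shape[0]))  # 2049 == 0x00000801
        f.write(labels.tobytes())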
def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i,
                   "feature": np.zeros((10,)),
                   "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.INT32,
                          shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY,
                               dtype=DType.FLOAT32,
                               shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32,
                             shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))
    finally:
        shutil.rmtree(temp_dir)
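# As exercised above (and in the image test below), each SchemaField entry
# describes one column of the dataset: its feature_type (FeatureType.SCALAR,
# NDARRAY, or IMAGE), its element dtype, and its per-record shape. This
# summary is inferred from the tests in this file rather than from library
# documentation.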
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}

        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32,
                             shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.STRING,
                          shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)
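# Note: the assertion in test_write_parquet_images shows that a
# FeatureType.IMAGE column round-trips the raw encoded file bytes as written;
# the dtype and shape declared for the "image" field are not exercised by this
# byte-level comparison.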
def test_write_voc(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        from zoo.orca.data import SparkXShards
        dataset_path = os.path.join(resource_path, "VOCdevkit")
        output_path = os.path.join(temp_dir, "output_dataset")
        write_voc(dataset_path,
                  splits_names=[(2007, "trainval")],
                  output_path="file://" + output_path)

        data, schema = ParquetDataset._read_as_dict_rdd("file://" + output_path)
        data = data.collect()[0]
        image_path = data["image_id"]
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)
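# Directory layout assumed for the VOC test above (the standard Pascal VOC
# layout; the exact expectations of write_voc are not spelled out in this
# file):
#
#   VOCdevkit/
#       VOC2007/
#           JPEGImages/       # *.jpg images
#           Annotations/      # per-image *.xml object annotations
#           ImageSets/Main/   # split lists such as trainval.txt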