def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        dataloader = read_parquet("dataloader", input_path=path)
        cur_dl = iter(dataloader)
        while True:
            try:
                print(next(cur_dl)['label'])
            except StopIteration:
                break
    finally:
        shutil.rmtree(temp_dir)

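# The read tests above rely on `images_generator` and `images_schema`, which are
# defined elsewhere in this module. The sketch below is purely illustrative and is
# renamed so it does not shadow the real definitions; it only shows roughly what a
# generator/schema pair for an image dataset could look like, assuming a local
# `resource_path` folder of image files.
def _example_images_generator():
    dataset_path = os.path.join(resource_path, "cat_dog")
    for root, _, files in os.walk(os.path.join(dataset_path, "cats")):
        for name in files:
            image_path = os.path.join(root, name)
            # FeatureType.IMAGE fields are written as the raw encoded file bytes.
            yield {"id": image_path, "image": image_path, "label": 1.0}


_example_images_schema = {
    "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.STRING, shape=()),
    "image": SchemaField(feature_type=FeatureType.IMAGE, dtype=DType.FLOAT32, shape=()),
    "label": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.FLOAT32, shape=())
}
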
def test_parquet_images_training(self):
    from zoo.orca.learn.tf2 import Estimator
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        output_shapes = {"id": (), "image": (), "label": ()}

        def data_creator(config, batch_size):
            dataset = read_parquet("tf_dataset", input_path=path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
            dataset = dataset.shuffle(10)
            dataset = dataset.map(lambda data_dict:
                                  (data_dict["image"], data_dict["label"]))
            dataset = dataset.map(parse_data_train)
            dataset = dataset.batch(batch_size)
            return dataset

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=model_creator)
        trainer.fit(data=data_creator, epochs=1, batch_size=2)
    finally:
        shutil.rmtree(temp_dir)

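# `model_creator` and `parse_data_train` used above are defined elsewhere in this
# module. The hypothetical sketch below (renamed to avoid shadowing the real
# definitions) shows the shapes they are expected to have: the parse function maps an
# (image_bytes, label) pair to a training example, and the model creator builds a
# compiled tf.keras model from an Orca config dict.
def _example_parse_data_train(image, label):
    # The parquet "image" column holds the raw encoded file bytes
    # (see the write/read round-trip asserted in test_write_parquet_images below).
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [64, 64]) / 255.0
    return image, label


def _example_model_creator(config):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(8, 3, activation="relu", input_shape=(64, 64, 3)),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model
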
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}
        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE, dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.STRING, shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)

def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.INT32, shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))
    finally:
        shutil.rmtree(temp_dir)

def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", path=path, output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        num_shards, rank = 3, 1
        dataset_shard = read_parquet("tf_dataset", path=path,
                                     config={"num_shards": num_shards, "rank": rank},
                                     output_types=output_types)
        assert len(list(dataset_shard)) <= len(list(dataset)) // num_shards, \
            "len of dataset_shard should be 1/`num_shards` of the whole dataset."

        dataloader = read_parquet("dataloader", path=path)
        dataloader_shard = read_parquet("dataloader", path=path,
                                        config={"num_shards": num_shards, "rank": rank})
        cur_dl = iter(dataloader_shard)
        cur_count = 0
        while True:
            try:
                print(next(cur_dl)['label'])
                cur_count += 1
            except StopIteration:
                break
        assert cur_count == len(list(dataset_shard))
    finally:
        shutil.rmtree(temp_dir)

def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())
    finally:
        shutil.rmtree(temp_dir)