import os
import random
import shutil
import tempfile

import numpy as np
import tensorflow as tf

# ParquetDataset, SchemaField, FeatureType, DType, VOCDatasets,
# ndarray_dtype_to_dtype and resource_path are assumed to come from the
# surrounding project's data-pipeline modules.


def write_voc(voc_root_path, splits_names, output_path, **kwargs):
    """Write a Pascal VOC detection dataset as a ParquetDataset."""
    custom_classes = kwargs.get("classes", None)
    voc_datasets = VOCDatasets(voc_root_path, splits_names,
                               classes=custom_classes)

    def make_generator():
        for img_path, label in voc_datasets:
            yield {"image": img_path, "label": label, "image_id": img_path}

    # Probe one sample to derive the label dtype and per-box width; -1 marks
    # the variable number of boxes per image.
    image, label = voc_datasets[0]
    label_shape = (-1, label.shape[-1])
    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=()),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(label.dtype),
                             shape=label_shape),
        "image_id": SchemaField(feature_type=FeatureType.SCALAR,
                                dtype=DType.STRING,
                                shape=())
    }

    # "classes" is consumed here; drop it before forwarding to the writer.
    kwargs = {key: value for key, value in kwargs.items()
              if key not in ["classes"]}
    ParquetDataset.write(output_path, make_generator(), schema, **kwargs)
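

# A minimal usage sketch for write_voc, assuming the standard VOCdevkit
# layout; the path and the (year, split) pairs below are illustrative
# assumptions, not values taken from this repo.
def _example_write_voc():
    write_voc("/path/to/VOCdevkit",
              splits_names=[(2007, "trainval")],
              output_path="file:///tmp/voc_parquet")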


def write_from_directory(directory, label_map, output_path, shuffle=True,
                         **kwargs):
    """Write a folder-per-class image directory as a ParquetDataset."""
    labels = os.listdir(directory)
    # Only keep subfolders that have an entry in label_map.
    valid_labels = [label for label in labels if label in label_map]
    generator = []
    for label in valid_labels:
        label_path = os.path.join(directory, label)
        images = os.listdir(label_path)
        for image in images:
            image_path = os.path.join(label_path, image)
            generator.append({"image": image_path,
                              "label": label_map[label],
                              "image_id": image_path,
                              "label_str": label})
    if shuffle:
        random.shuffle(generator)

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=()),
        "label": SchemaField(feature_type=FeatureType.SCALAR,
                             dtype=DType.INT32,
                             shape=()),
        "image_id": SchemaField(feature_type=FeatureType.SCALAR,
                                dtype=DType.STRING,
                                shape=()),
        "label_str": SchemaField(feature_type=FeatureType.SCALAR,
                                 dtype=DType.STRING,
                                 shape=())
    }

    ParquetDataset.write(output_path, generator, schema, **kwargs)
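

# Hedged usage sketch for write_from_directory, assuming images organized as
# <directory>/<class_name>/<files>; the path and label ids are illustrative.
def _example_write_from_directory():
    write_from_directory("/path/to/cat_dog",
                         label_map={"cats": 0, "dogs": 1},
                         output_path="file:///tmp/cat_dog_parquet",
                         shuffle=True)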


def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}

        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    # The generator yields an image file path and a scalar label, so the
    # schema declares an IMAGE field and scalar fields, matching
    # images_schema below rather than fixed-size ndarray fields.
    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=()),
        "label": SchemaField(feature_type=FeatureType.SCALAR,
                             dtype=DType.FLOAT32,
                             shape=()),
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.STRING,
                          shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)

        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        # IMAGE fields store the raw file bytes, so re-reading the file
        # should reproduce the stored value exactly.
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)


def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.INT32,
                          shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY,
                               dtype=DType.FLOAT32,
                               shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32,
                             shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)

        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))
    finally:
        shutil.rmtree(temp_dir)


def _write_ndarrays(images, labels, output_path, **kwargs):
    """Write paired (image, label) numpy arrays as a ParquetDataset."""
    # Per-record shapes: drop the leading batch dimension.
    images_shape = [int(x) for x in images.shape[1:]]
    labels_shape = [int(x) for x in labels.shape[1:]]
    schema = {
        "image": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(images.dtype),
                             shape=images_shape),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(labels.dtype),
                             shape=labels_shape)
    }

    def make_generator():
        for i in range(images.shape[0]):
            yield {"image": images[i], "label": labels[i]}

    ParquetDataset.write(output_path, make_generator(), schema, **kwargs)
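

# Hedged usage sketch for _write_ndarrays; the MNIST-like shapes below are
# made up for illustration. The two arrays must share a leading dimension,
# since record i pairs images[i] with labels[i].
def _example_write_ndarrays():
    images = np.zeros((100, 28, 28, 1), dtype=np.float32)
    labels = np.ones((100, 1), dtype=np.int32)
    _write_ndarrays(images, labels, "file:///tmp/ndarray_parquet")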


def images_generator():
    dataset_path = os.path.join(resource_path, "cat_dog")
    for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
        for name in files:
            image_path = os.path.join(root, name)
            yield {"image": image_path, "label": 1, "id": image_path}

    for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
        for name in files:
            image_path = os.path.join(root, name)
            yield {"image": image_path, "label": 0, "id": image_path}


images_schema = {
    "image": SchemaField(feature_type=FeatureType.IMAGE,
                         dtype=DType.FLOAT32,
                         shape=()),
    "label": SchemaField(feature_type=FeatureType.SCALAR,
                         dtype=DType.FLOAT32,
                         shape=()),
    "id": SchemaField(feature_type=FeatureType.SCALAR,
                      dtype=DType.STRING,
                      shape=())
}


def parse_data_train(image, label):
    image = tf.io.decode_jpeg(image, channels=NUM_CHANNELS)
    image = tf.image.resize(image, size=(WIDTH, HEIGHT))
    image = tf.reshape(image, [WIDTH, HEIGHT, NUM_CHANNELS])
    return image, label
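

# Minimal sketch showing parse_data_train on raw JPEG bytes, assuming WIDTH,
# HEIGHT and NUM_CHANNELS are module-level constants defined elsewhere. Since
# IMAGE fields store the raw file bytes, a record's "image" value can be fed
# to parse_data_train directly; here the bytes are read from disk instead.
def _example_parse_single_image(jpeg_path):
    image_bytes = tf.io.read_file(jpeg_path)
    image, label = parse_data_train(image_bytes, label=1)
    return image, label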