def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        assert len(examples) == 8
        actual.extend(examples)
    actual = sorted(actual, key=lambda x: x["id"])
    assert len(actual) == 1000
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])
def test_images(spark: SparkSession, tmp_path):
    expected = [
        {
            "id": 1,
            "image": Image(uri="s3://123"),
        },
        {
            "id": 2,
            "image": Image(uri="s3://abc"),
        },
    ]
    df = spark.createDataFrame(expected)
    df.write.mode("overwrite").parquet(str(tmp_path))

    records = sorted(_read_parquets(str(tmp_path)), key=lambda x: x["id"])
    assert_count_equal(expected, records)
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    _check_loader(loader, expected)

    loader2 = DataLoader(df, batch_size=8)
    _check_loader(loader2, expected)
def test_images(self):
    expected = [
        {
            "id": 1,
            "image": Image(uri="s3://123"),
        },
        {
            "id": 2,
            "image": Image(uri="s3://abc"),
        },
    ]
    df = self.spark.createDataFrame(expected)
    df.write.mode("overwrite").parquet(self.test_dir)

    records = sorted(
        self._read_parquets(self.test_dir), key=lambda x: x["id"]
    )
    self.assertCountEqual(expected, records)
def test_image_copy(spark: SparkSession, tmpdir):
    source_image = os.path.join(tmpdir, "source_image")
    with open(source_image, "w") as fobj:
        fobj.write("abc")
    os.makedirs(os.path.join(tmpdir, "out"))
    df = spark.createDataFrame(
        [(Image(source_image),)], ["image"]
    )  # type: pyspark.sql.DataFrame
    df = df.withColumn(
        "image",
        image_copy(col("image"), lit(os.path.join(tmpdir, "out/"))),
    )
    data = df.collect()  # force lazy calculation
    out_file = os.path.join(tmpdir, "out", "source_image")
    assert Image(out_file) == data[0].image
    with open(out_file) as fobj:
        assert fobj.read() == "abc"
def test_image_copy(self):
    source_image = os.path.join(self.test_dir, "source_image")
    with open(source_image, "w") as fobj:
        fobj.write("abc")
    os.makedirs(os.path.join(self.test_dir, "out"))
    df = self.spark.createDataFrame(
        [(Image(source_image),)], ["image"]
    )  # type: pyspark.sql.DataFrame
    df = df.withColumn(
        "image",
        image_copy(col("image"), lit(os.path.join(self.test_dir, "out/"))),
    )
    data = df.collect()  # force lazy calculation
    out_file = os.path.join(self.test_dir, "out", "source_image")
    self.assertEqual(Image(out_file), data[0].image)
    with open(out_file) as fobj:
        self.assertEqual("abc", fobj.read())
def test_torch_dataset(spark, tmp_path, num_workers):
    total = 1000
    dataset_dir = tmp_path / "data"
    asset_dir = tmp_path / "asset"
    asset_dir.mkdir(parents=True)
    data = []
    expected = []
    for i in range(total):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        # Persist the image to disk; the row below references it by URI.
        Image.from_array(image_data, image_uri)
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append(
            {
                "id": i,
                "array": torch.as_tensor(np.array([array])),
                "image": torch.as_tensor(np.array([image_data])),
            }
        )
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    dataset = Dataset(dataset_dir)
    loader = torchDataLoader(
        dataset,
        num_workers=num_workers,
        drop_last=True,
    )
    actual = sorted(list(loader), key=lambda x: x["id"])
    assert len(actual) == total
    for expect, act in zip(expected, actual):
        assert torch.equal(
            expect["array"], act["array"]
        ), f"Expected {expect['array']} got {act['array']}"
        assert torch.equal(expect["image"], act["image"])
def image_copy(image: Image, uri: str) -> Image:
    """Copy the image to a new destination, specified by the URI.

    Parameters
    ----------
    image : Image
        An image object.
    uri : str
        The base directory to copy the image to.

    Returns
    -------
    Image
        Return a new image pointed to the new URI.
    """
    logger.info("Copying image src=%s dest=%s", image.uri, uri)
    return Image(_copy(image.uri, uri))
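A minimal usage sketch for the function above, assuming an `Image` that points at an existing file and a writable destination prefix (both paths below are hypothetical). The Spark tests earlier apply the same operation as a column expression, `image_copy(col("image"), lit(...))`, presumably through a UDF wrapper registered elsewhere.

# Usage sketch -- the paths below are hypothetical, not from the source.
src = Image("/tmp/images/cat.png")
copied = image_copy(src, "s3://bucket/dataset/images/")
# `copied` is a new Image whose URI lives under the destination base directory;
# the original `src` is left untouched.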
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()
def _create_dataframe(df_path: Path, spark: SparkSession):
    asset_dir = df_path / "assets"
    asset_dir.mkdir(parents=True)
    data = []
    for i in range(100):
        image_data = np.random.randint(0, 128, size=(64, 64, 3), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        data.append(
            Row(
                id=i,
                image=Image.from_array(image_data, image_uri),
                label=random.choice(["cat", "dog", "duck", "bird"]),
            )
        )
    return spark.createDataFrame(data)
def test_coco_dataset(
    spark: SparkSession,
    tmp_path: Path,
):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    data = []
    for i in range(10):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image.from_array(image_data, image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )
    spark.createDataFrame(data).write.mode("overwrite").format("rikai").save(
        str(dataset_dir)
    )

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    assert isinstance(example, list)
    assert 1 == len(example)
    assert 2 == len(example[0]["annotations"])
    assert np.array_equal(
        np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
    ), f"Actual annotations: {example[0]['annotations'][0]['bbox']}"
def test_coco_dataset(self):
    dataset_dir = os.path.join(self.test_dir, "features")
    asset_dir = os.path.join(self.test_dir, "assets")
    os.makedirs(asset_dir)
    data = []
    for i in range(10):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = os.path.join(asset_dir, f"{i}.png")
        PILImage.fromarray(image_data).save(image_uri)
        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image(image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )
    self.spark.createDataFrame(data).write.mode("overwrite").format(
        "rikai"
    ).save(dataset_dir)

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    self.assertTrue(isinstance(example, list))
    self.assertEqual(1, len(example))
    self.assertEqual(2, len(example[0]["annotations"]))
    self.assertTrue(
        np.array_equal(
            np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
        )
    )
def test_load_dataset(self):
    dataset_dir = os.path.join(self.test_dir, "features")
    asset_dir = os.path.join(self.test_dir, "assets")
    os.makedirs(asset_dir)
    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = os.path.join(asset_dir, f"{i}.png")
        PILImage.fromarray(image_data).save(image_uri)
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = self.spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(dataset_dir)

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        self.assertEqual(8, len(examples))
        actual.extend(examples)
    actual = sorted(actual, key=lambda x: x["id"])
    self.assertEqual(1000, len(actual))
    for expect, act in zip(expected, actual):
        self.assertTrue(np.array_equal(expect["array"], act["array"]))
        self.assertTrue(np.array_equal(expect["image"], act["image"]))
def test_embedded_images(spark, tmpdir):
    df = spark.createDataFrame([Row(Image(secrets.token_bytes(128)))])
    _check_roundtrip(spark, df, tmpdir)
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a COCO dataset into a Rikai dataset.

    This function expects the COCO dataset to be stored in a directory with
    the following structure:

    - dataset
        - annotations
            - captions_train2017.json
            - instances_train2017.json
            - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session.
    dataset_root : str
        The directory of the dataset.
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame.
    """
    train_json = os.path.join(
        dataset_root, "annotations", "instances_train2017.json"
    )
    val_json = os.path.join(
        dataset_root, "annotations", "instances_val2017.json"
    )

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute it
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append(
                    {
                        "category_id": ann["category_id"],
                        "category_text": categories[ann["category_id"]]["name"],
                        "bbox": bbox,
                        "area": float(ann["area"]),
                    }
                )
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        )
                    )
                ),
                "split": split,
            }
            examples.append(example)

    schema = StructType(
        [
            StructField("image_id", LongType(), False),
            StructField(
                "annotations",
                ArrayType(
                    StructType(
                        [
                            StructField("category_id", IntegerType()),
                            StructField("category_text", StringType()),
                            StructField("area", FloatType()),
                            StructField("bbox", Box2dType()),
                        ]
                    )
                ),
                False,
            ),
            StructField("image", ImageType(), False),
            StructField("split", StringType(), False),
        ]
    )
    df = spark.createDataFrame(examples, schema=schema)

    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
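A brief usage sketch for `convert`, assuming the COCO archives were extracted under `./dataset` as the docstring describes; the session name, output URIs, and limit value below are hypothetical.

# Usage sketch -- paths, session name, and limit are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()

# Convert the first 100 images of each split and copy the image assets
# to an external location alongside the converted dataset.
df = convert(
    spark,
    dataset_root="dataset",
    limit=100,
    asset_dir="s3://bucket/coco/assets",
)
df.write.format("rikai").save("s3://bucket/coco/features")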