Example 1
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        array = wrap(np.random.random_sample((3, 4)))
        data.append({
            "id": i,
            "array": array,
            "image": Image.from_array(image_data, image_uri),
        })
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)

    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        assert len(examples) == 8
        actual.extend(examples)

    actual = sorted(actual, key=lambda x: x["id"])
    assert len(actual) == 1000
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])
Example 2
def test_images(spark: SparkSession, tmp_path):
    expected = [
        {
            "id": 1,
            "image": Image(uri="s3://123"),
        },
        {
            "id": 2,
            "image": Image(uri="s3://abc"),
        },
    ]
    df = spark.createDataFrame(expected)
    df.write.mode("overwrite").parquet(str(tmp_path))

    records = sorted(_read_parquets(str(tmp_path)), key=lambda x: x["id"])
    assert_count_equal(expected, records)
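
Note: `_read_parquets` and `assert_count_equal` are helpers defined elsewhere in the test module. A minimal sketch of what they might look like, assuming the parquet part files are read back with pyarrow and compared order-insensitively (names, signatures, and behaviour here are assumptions, not the project's actual implementation):

import pathlib

import pyarrow.parquet as pq


def _read_parquets(base_dir: str):
    # Assumed helper: read every *.parquet file under base_dir and yield rows as dicts.
    for part in sorted(pathlib.Path(base_dir).glob("**/*.parquet")):
        yield from pq.read_table(part).to_pylist()


def assert_count_equal(expected, actual):
    # Assumed helper: order-insensitive comparison, like unittest's assertCountEqual.
    actual = list(actual)
    assert len(expected) == len(actual)
    for item in expected:
        assert item in actual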
Example 3
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)

    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    _check_loader(loader, expected)
    loader2 = DataLoader(df, batch_size=8)
    _check_loader(loader2, expected)
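
`_check_loader` is a shared assertion helper that is not shown here; a plausible sketch, assuming it performs the same checks as the inline loop in Example 1:

def _check_loader(loader, expected):
    # Assumed helper, mirroring the inline assertions from Example 1.
    actual = sorted((e for batch in loader for e in batch), key=lambda x: x["id"])
    assert len(actual) == len(expected)
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])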
Example 4
    def test_images(self):
        expected = [
            {
                "id": 1,
                "image": Image(uri="s3://123"),
            },
            {
                "id": 2,
                "image": Image(uri="s3://abc"),
            },
        ]
        df = self.spark.createDataFrame(expected)
        df.write.mode("overwrite").parquet(self.test_dir)

        records = sorted(self._read_parquets(self.test_dir),
                         key=lambda x: x["id"])
        self.assertCountEqual(expected, records)
Example 5
def test_image_copy(spark: SparkSession, tmpdir):
    source_image = os.path.join(tmpdir, "source_image")
    with open(source_image, "w") as fobj:
        fobj.write("abc")
    os.makedirs(os.path.join(tmpdir, "out"))

    df = spark.createDataFrame([(Image(source_image), )],
                               ["image"])  # type: pyspark.sql.DataFrame
    df = df.withColumn(
        "image",
        image_copy(col("image"), lit(os.path.join(tmpdir, "out/"))),
    )
    data = df.collect()  # force lazy calculation
    out_file = os.path.join(tmpdir, "out", "source_image")
    assert Image(out_file) == data[0].image

    with open(out_file) as fobj:
        assert fobj.read() == "abc"
Example 6
    def test_image_copy(self):
        source_image = os.path.join(self.test_dir, "source_image")
        with open(source_image, "w") as fobj:
            fobj.write("abc")
        os.makedirs(os.path.join(self.test_dir, "out"))

        df = self.spark.createDataFrame(
            [(Image(source_image), )],
            ["image"])  # type: pyspark.sql.DataFrame
        df = df.withColumn(
            "image",
            image_copy(col("image"), lit(os.path.join(self.test_dir, "out/"))),
        )
        data = df.collect()  # force lazy calculation
        out_file = os.path.join(self.test_dir, "out", "source_image")
        self.assertEqual(Image(out_file), data[0].image)

        with open(out_file) as fobj:
            self.assertEqual("abc", fobj.read())
Example 7
def test_torch_dataset(spark, tmp_path, num_workers):
    total = 1000
    dataset_dir = tmp_path / "data"
    asset_dir = tmp_path / "asset"
    asset_dir.mkdir(parents=True)
    data = []
    expected = []
    for i in range(total):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        # Persist the image to disk; the dataset later loads it back via Image(image_uri).
        Image.from_array(image_data, image_uri)

        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append(
            {
                "id": i,
                "array": torch.as_tensor(np.array([array])),
                "image": torch.as_tensor(np.array([image_data])),
            }
        )

    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))
    dataset = Dataset(dataset_dir)
    loader = torchDataLoader(
        dataset,
        num_workers=num_workers,
        drop_last=True,
    )
    actual = sorted(list(loader), key=lambda x: x["id"])
    assert len(actual) == total
    for expect, act in zip(expected, actual):
        assert torch.equal(
            expect["array"], act["array"]
        ), f"Expected {expect['array']} got {act['array']}"
        assert torch.equal(expect["image"], act["image"])
Example 8
def image_copy(image: Image, uri: str) -> Image:
    """Copy the image to a new destination, specified by the URI.

    Parameters
    ----------
    image : Image
        An image object
    uri : str
        The base directory to copy the image to.

    Returns
    -------
    Image
        A new image pointing to the new URI.
    """
    logger.info("Copying image src=%s dest=%s", image.uri, uri)
    return Image(_copy(image.uri, uri))
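
A minimal usage sketch for `image_copy`, assuming a local source image and an existing, writable destination directory (paths are illustrative only):

# Illustrative only: copy a local image under a new base directory.
src = Image("/tmp/images/cat.png")        # hypothetical source path
dest = image_copy(src, "/tmp/copied/")    # hypothetical destination directory (must exist)
print(dest.uri)                           # e.g. /tmp/copied/cat.png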
Example 9
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()
Example 10
def _create_dataframe(df_path: Path, spark: SparkSession):
    asset_dir = df_path / "assets"
    asset_dir.mkdir(parents=True)

    data = []
    for i in range(100):
        image_data = np.random.randint(
            0, 128, size=(64, 64, 3), dtype=np.uint8
        )
        image_uri = asset_dir / f"{i}.png"
        data.append(
            Row(
                id=i,
                image=Image.from_array(image_data, image_uri),
                label=random.choice(["cat", "dog", "duck", "bird"]),
            ))
    return spark.createDataFrame(data)
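
A hypothetical usage of this helper, following the same write pattern as the other examples (the destination path is illustrative):

# Hypothetical usage of _create_dataframe.
df = _create_dataframe(tmp_path, spark)
df.write.mode("overwrite").format("rikai").save(str(tmp_path / "features"))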
Example 11
def test_coco_dataset(
    spark: SparkSession,
    tmp_path: Path,
):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    data = []
    for i in range(10):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image.from_array(image_data, image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )

    spark.createDataFrame(data).write.mode("overwrite").format("rikai").save(
        str(dataset_dir)
    )

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    assert isinstance(example, list)
    assert 1 == len(example)
    assert 2 == len(example[0]["annotations"])
    assert np.array_equal(
        np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
    ), f"Actual annotations: {example[0]['annotations'][0]['bbox']}"
Example 12
    def test_coco_dataset(self):
        dataset_dir = os.path.join(self.test_dir, "features")
        asset_dir = os.path.join(self.test_dir, "assets")
        os.makedirs(asset_dir)
        data = []
        for i in range(10):
            image_data = np.random.randint(
                0, 128, size=(128, 128), dtype=np.uint8
            )
            image_uri = os.path.join(asset_dir, f"{i}.png")
            PILImage.fromarray(image_data).save(image_uri)

            data.append(
                Row(
                    image_id=i,
                    split="train",
                    image=Image(image_uri),
                    annotations=[
                        Row(category_id=123,
                            category_text="car",
                            bbox=Box2d(1, 2, 3, 4)),
                        Row(category_id=234,
                            category_text="dog",
                            bbox=Box2d(1, 2, 3, 4)),
                    ],
                ))

        self.spark.createDataFrame(data).write.mode("overwrite").format(
            "rikai").save(dataset_dir)

        loader = DataLoader(dataset_dir, batch_size=1)
        example = next(iter(loader))
        self.assertTrue(isinstance(example, list))
        self.assertEqual(1, len(example))
        self.assertEqual(2, len(example[0]["annotations"]))
        self.assertTrue(
            np.array_equal(np.array([1, 2, 3, 4]),
                           example[0]["annotations"][0]["bbox"]))
Example 13
    def test_load_dataset(self):
        dataset_dir = os.path.join(self.test_dir, "features")
        asset_dir = os.path.join(self.test_dir, "assets")
        os.makedirs(asset_dir)

        expected = []
        data = []
        for i in range(1000):
            image_data = np.random.randint(
                0, 128, size=(128, 128), dtype=np.uint8
            )
            image_uri = os.path.join(asset_dir, f"{i}.png")
            PILImage.fromarray(image_data).save(image_uri)

            array = wrap(np.random.random_sample((3, 4)))
            data.append({
                "id": i,
                "array": array,
                "image": Image(image_uri),
            })
            expected.append({"id": i, "array": array, "image": image_data})
        df = self.spark.createDataFrame(data)

        df.write.mode("overwrite").format("rikai").save(dataset_dir)

        loader = DataLoader(dataset_dir, batch_size=8)
        actual = []
        for examples in loader:
            self.assertEqual(8, len(examples))
            actual.extend(examples)

        actual = sorted(actual, key=lambda x: x["id"])
        self.assertEqual(1000, len(actual))
        for expect, act in zip(expected, actual):
            self.assertTrue(np.array_equal(expect["array"], act["array"]))
            self.assertTrue(np.array_equal(expect["image"], act["image"]))
Example 14
def test_embedded_images(spark, tmpdir):
    df = spark.createDataFrame([Row(Image(secrets.token_bytes(128)))])
    _check_roundtrip(spark, df, tmpdir)
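
`_check_roundtrip` is another helper that is not reproduced here; a minimal sketch, assuming it writes the DataFrame out and verifies that the same number of rows come back, which may be weaker than the real helper:

def _check_roundtrip(spark, df, tmpdir):
    # Assumed helper: write the DataFrame and verify it can be read back.
    path = str(tmpdir)
    df.write.mode("overwrite").parquet(path)
    reloaded = spark.read.parquet(path)
    assert reloaded.count() == df.count()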
Example 15
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO dataset to be stored in a directory with the
    following structure:

    - dataset
        - annotations
          - captions_train2017.json
          - instances_train2017.json
          - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The root directory of the dataset.
    limit : int, optional
        The maximum number of images to convert from each split (0 means no limit).
    asset_dir : str, optional
        The directory to store image assets in; it can be an S3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations",
                              "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations",
                            "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute them
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append(
                    {
                        "category_id": ann["category_id"],
                        "category_text": categories[ann["category_id"]]["name"],
                        "bbox": bbox,
                        "area": float(ann["area"]),
                    }
                )
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        )
                    )
                ),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
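
A hedged end-to-end example of how `convert` might be invoked, assuming a SparkSession with the Rikai package available and a COCO dataset laid out as described in the docstring (paths and bucket names are illustrative):

# Illustrative usage only; session configuration and paths are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
df = convert(spark, "dataset", limit=100, asset_dir="s3://my-bucket/assets/")
df.write.mode("overwrite").format("rikai").save("s3://my-bucket/coco_rikai")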