Example #1
def test_box2d_vectorize_iou():
    box1 = Box2d(0, 0, 20, 20)
    # iou() accepts a list of boxes and returns one IoU value per box:
    # vs. Box2d(10, 10, 30, 30): intersection 10x10=100, union 700 -> 1/7;
    # vs. Box2d(15, 15, 35, 35): intersection 5x5=25, union 775 -> 25/775.
    assert np.allclose(
        [1 / 7, 5 * 5 / (2 * 20 * 20 - 5 * 5)],
        box1.iou([Box2d(10, 10, 30, 30),
                  Box2d(15, 15, 35, 35)]),
    )
Example #2
def test_df_to_rikai(spark: SparkSession, tmp_path: Path):
    df = spark.createDataFrame(
        [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
    )
    df_to_rikai(df, str(tmp_path))
    actual_df = spark.read.format("rikai").load(str(tmp_path))
    assert_count_equal(df.collect(), actual_df.collect())
Example #3
def test_box2d_iou():
    box1 = Box2d(0, 0, 20, 20)
    box2 = Box2d(10, 10, 30, 30)
    # Intersection is 10x10 = 100; union is 400 + 400 - 100 = 700.
    assert np.isclose(1 / 7, box1.iou(box2))
    assert isinstance(box1.iou(box2), float)
    box3 = Box2d(15, 15, 35, 35)
    # Intersection is 5x5 = 25; union is 400 + 400 - 25 = 775.
    assert np.isclose(5 * 5 / (2 * 20 * 20 - 5 * 5), box1.iou(box3))
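For reference, the expected values in these IoU tests follow from the standard intersection-over-union formula. Below is a minimal sketch of that arithmetic in plain Python (iou_xyxy is a hypothetical helper written for illustration, not part of rikai):

def iou_xyxy(a, b):
    """IoU of two boxes given as (xmin, ymin, xmax, ymax)."""
    iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))  # intersection width
    ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))  # intersection height
    inter = iw * ih
    union = (
        (a[2] - a[0]) * (a[3] - a[1])
        + (b[2] - b[0]) * (b[3] - b[1])
        - inter
    )
    return inter / union if union else 0.0

assert abs(iou_xyxy((0, 0, 20, 20), (10, 10, 30, 30)) - 1 / 7) < 1e-9
assert abs(iou_xyxy((0, 0, 20, 20), (15, 15, 35, 35)) - 25 / 775) < 1e-9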
Example #4
    def test_bbox(self):
        df = self.spark.createDataFrame([Row(b=Box2d(1, 2, 3, 4))])
        df.write.mode("overwrite").format("rikai").save(self.test_dir)

        records = self._read_parquets(self.test_dir)

        self.assertCountEqual([{"b": Box2d(1, 2, 3, 4)}], records)
Example #5
def test_bbox(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    df = spark.createDataFrame([Row(b=Box2d(1, 2, 3, 4))])
    df.write.mode("overwrite").format("rikai").save(test_dir)

    records = _read_parquets(test_dir)

    assert_count_equal([{"b": Box2d(1, 2, 3, 4)}], records)
Example #6
def test_areas(spark: SparkSession):
    """Test calculating bounding box's area."""
    df = spark.createDataFrame(
        [
            (Box2d(1, 2, 2.0, 3.0),),
            (Box2d(10, 12, 11.0, 17.0),),
        ],
        ["bbox"],
    )
    df = df.withColumn("area", area(col("bbox")))
    assert_area_equals([1.0, 5.0], df)
Example #7
    def test_areas(self):
        """Test calculating bounding box's area."""
        df = self.spark.createDataFrame(
            [
                (Box2d(1, 2, 1.0, 1.0),),
                (Box2d(10, 12, 1.0, 5.0),),
            ],
            ["bbox"],
        )
        df = df.withColumn("area", area(col("bbox")))
        self.assertCountEqual((1.0, 5.0), df.select("area").toPandas()["area"])
Example #8
def test_coco_dataset(
    spark: SparkSession,
    tmp_path: Path,
):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    data = []
    for i in range(10):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image.from_array(image_data, image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )

    spark.createDataFrame(data).write.mode("overwrite").format("rikai").save(
        str(dataset_dir)
    )

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    assert isinstance(example, list)
    assert 1 == len(example)
    assert 2 == len(example[0]["annotations"])
    assert np.array_equal(
        np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
    ), f"Actual annotations: {example[0]['annotations'][0]['bbox']}"
Example #9
def test_box2d_as_list():
    box = Box2d(1.0, 2.0, 3.0, 4.0)

    assert [1.0, 2.0, 3.0, 4.0] == list(box)

    # Use PIL's Image (aliased as PILImage, as in Example #10) rather than
    # rikai's Image type, since ImageDraw operates on PIL images.
    img = PILImage.fromarray(
        np.random.randint(0, 128, size=(32, 32), dtype=np.uint8)
    )
    draw = ImageDraw.Draw(img)
    # Check that the box works with draw: Box2d is a Sequence of
    # [xmin, ymin, xmax, ymax], which draw.rectangle() accepts.
    draw.rectangle(box)

    assert isinstance(box, Sequence)
Example #10
    def test_coco_dataset(self):
        dataset_dir = os.path.join(self.test_dir, "features")
        asset_dir = os.path.join(self.test_dir, "assets")
        os.makedirs(asset_dir)
        data = []
        for i in range(10):
            image_data = np.random.randint(0,
                                           128,
                                           size=(128, 128),
                                           dtype=np.uint8)
            image_uri = os.path.join(asset_dir, f"{i}.png")
            PILImage.fromarray(image_data).save(image_uri)

            data.append(
                Row(
                    image_id=i,
                    split="train",
                    image=Image(image_uri),
                    annotations=[
                        Row(category_id=123,
                            category_text="car",
                            bbox=Box2d(1, 2, 3, 4)),
                        Row(category_id=234,
                            category_text="dog",
                            bbox=Box2d(1, 2, 3, 4)),
                    ],
                ))

        self.spark.createDataFrame(data).write.mode("overwrite").format(
            "rikai").save(dataset_dir)

        loader = DataLoader(dataset_dir, batch_size=1)
        example = next(iter(loader))
        self.assertTrue(isinstance(example, list))
        self.assertEqual(1, len(example))
        self.assertEqual(2, len(example[0]["annotations"]))
        self.assertTrue(
            np.array_equal(np.array([1, 2, 3, 4]),
                           example[0]["annotations"][0]["bbox"]))
Example #11
def test_scale_box2d():
    box = Box2d(1.0, 2.0, 3.0, 4.0)

    # A scalar (or a uniform 2-tuple) scales every coordinate equally.
    for twos in [2, 2.0, np.float32(2), np.float64(2), (2, 2)]:
        assert Box2d(0.5, 1.0, 1.5, 2.0) == box / twos
        assert Box2d(2.0, 4.0, 6.0, 8.0) == box * twos

    # An (x_scale, y_scale) tuple scales the x and y axes independently.
    assert Box2d(0.5, 0.5, 1.5, 1.0) == box / (2, 4)
    assert Box2d(0.5, 0.25, 1.5, 0.5) == box / (2.0, 8.0)
    assert Box2d(10.0, 15.0, 30.0, 30.0) == box * (10, 7.5)
Example #12
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()
Example #13
def test_bbox(spark, tmp_path):
    df = spark.createDataFrame(
        [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
    )
    _check_roundtrip(spark, df, tmp_path)
Example #14
def test_box2d_empty_iou():
    box1 = Box2d(0, 0, 20, 20)
    assert box1.iou([]).size == 0
Example #15
    def test_bbox(self):
        df = self.spark.createDataFrame(
            [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
        )
        self._check_roundtrip(df)
Example #16
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO datasets are stored in directory with the
    following structure:

    - dataset
        - annotations
          - captions_train2017.json
          - instances_train2017.json
          - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Return
    ------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations",
                              "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations",
                            "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute it
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                # Resolve the image path under dataset_root instead of a
                # hard-coded "dataset" directory relative to the cwd.
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            dataset_root,
                            f"{split}2017",
                            image_payload["file_name"],
                        )
                    )
                ),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
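A hedged usage sketch of convert (the session name, local path, and S3 bucket below are illustrative assumptions, not part of the library):

from pyspark.sql import SparkSession

# Hypothetical driver: convert a local COCO layout (structured as in the
# docstring above) and copy the image assets to an assumed S3 bucket.
spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
df = convert(
    spark,
    dataset_root="dataset",
    limit=100,  # convert at most 100 images per split
    asset_dir="s3://my-bucket/assets",  # a trailing "/" is appended if missing
)
df.write.format("rikai").save("s3://my-bucket/coco_rikai")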