Example #1
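Verifies that wrapped numpy arrays of different dtypes survive the round trip through a Spark DataFrame and that df.show() renders them with a readable ndarray(<dtype>, ...) summary.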
def test_spark_show_numpy(spark: SparkSession, capsys):
    data = wrap(np.random.rand(50, 50, 3))
    data2 = wrap(np.array([1, 2, 3], dtype=np.uint8))
    df = spark.createDataFrame([{"np": data}, {"np": data2}])
    df.show()
    assert np.array_equal(data, df.first().np)
    stdout = capsys.readouterr().out
    print(stdout)
    assert "ndarray(float64" in stdout
    assert "ndarray(uint8" in stdout
Example #2
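Round-trips 1000 rows of numpy arrays and PNG-backed image assets through the rikai data source, reads them back with DataLoader in batches of 8, and compares every record against the originals.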
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        array = wrap(np.random.random_sample((3, 4)))
        data.append({
            "id": i,
            "array": array,
            "image": Image.from_array(image_data, image_uri),
        })
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)

    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        # every batch is full because 1000 rows divide evenly into batches of 8
        assert len(examples) == 8
        actual.extend(examples)

    actual = sorted(actual, key=lambda x: x["id"])
    assert len(actual) == 1000
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])
Example #3
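A refactored variant of Example #2: the batch iteration and comparisons live in a _check_loader helper (not shown here), which is exercised against both a dataset directory and a DataFrame as the loader source.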
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"

        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)

    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    _check_loader(loader, expected)
    loader2 = DataLoader(df, batch_size=8)
    _check_loader(loader2, expected)
Example #4
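Applies the numpy_to_image UDF to persist a numpy column as a PNG at a per-row URI and wrap the result as an Image asset.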
def test_numpy_to_image(spark: SparkSession, tmp_path: Path):
    """Test upload a numpy image to the external storage,
    and convert the data into Image asset.

    """
    df = spark.createDataFrame(
        [Row(id=1, data=wrap(np.ones((32, 32), dtype=np.uint8)))]
    )
    df = df.withColumn(
        "image",
        numpy_to_image(
            df.data,
            concat(lit(str(tmp_path)), lit("/"), df.id, lit(".png")),
        ),
    )
    df.count()  # force evaluation so the UDF runs and the PNG is written
    assert Path(df.first().image.uri) == tmp_path / "1.png"
    assert (tmp_path / "1.png").exists()
Example #5
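The example from the project README: a single row mixing a scalar id, a Spark DenseMatrix, an Image asset, and a nested list of annotation Rows carrying masks and bounding boxes.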
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()
Example #6
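Reads a rikai dataset through the PyTorch integration, checking that arrays and images come back as the expected tensors. The test is parametrized over num_workers, and batch_size defaults to 1, which is why each expected value carries a leading batch dimension.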
def test_torch_dataset(spark, tmp_path, num_workers):
    total = 1000
    dataset_dir = tmp_path / "data"
    asset_dir = tmp_path / "asset"
    asset_dir.mkdir(parents=True)
    data = []
    expected = []
    for i in range(total):
        image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
        image_uri = asset_dir / f"{i}.png"
        Image.from_array(image_data, image_uri)  # materialize the PNG so Image(image_uri) below can read it back

        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append(
            {
                "id": i,
                "array": torch.as_tensor(np.array([array])),
                "image": torch.as_tensor(np.array([image_data])),
            }
        )

    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))
    dataset = Dataset(dataset_dir)
    loader = torchDataLoader(
        dataset,
        num_workers=num_workers,
        drop_last=True,
    )
    actual = sorted(list(loader), key=lambda x: x["id"])
    assert len(actual) == total
    for expect, act in zip(expected, actual):
        assert torch.equal(
            expect["array"], act["array"]
        ), f"Expected {expect['array']} got {act['array']}"
        assert torch.equal(expect["image"], act["image"])
Example #7
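The same round trip as Example #2, written in unittest style: images are saved with Pillow, paths are built with os.path, and the checks use self.assertEqual and self.assertTrue.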
    def test_load_dataset(self):
        dataset_dir = os.path.join(self.test_dir, "features")
        asset_dir = os.path.join(self.test_dir, "assets")
        os.makedirs(asset_dir)

        expected = []
        data = []
        for i in range(1000):
            image_data = np.random.randint(0, 128, size=(128, 128), dtype=np.uint8)
            image_uri = os.path.join(asset_dir, f"{i}.png")
            PILImage.fromarray(image_data).save(image_uri)

            array = wrap(np.random.random_sample((3, 4)))
            data.append({
                "id": i,
                "array": array,
                "image": Image(image_uri),
            })
            expected.append({"id": i, "array": array, "image": image_data})
        df = self.spark.createDataFrame(data)

        df.write.mode("overwrite").format("rikai").save(dataset_dir)

        loader = DataLoader(dataset_dir, batch_size=8)
        actual = []
        for examples in loader:
            # every batch is full because 1000 rows divide evenly into batches of 8
            self.assertEqual(8, len(examples))
            actual.extend(examples)

        actual = sorted(actual, key=lambda x: x["id"])
        self.assertEqual(1000, len(actual))
        for expect, act in zip(expected, actual):
            self.assertTrue(np.array_equal(expect["array"], act["array"]))
            self.assertTrue(np.array_equal(expect["image"], act["image"]))