Esempio n. 1
0
def test_write_mnist(orca_context_fixture):
    """Round-trip a tiny synthetic MNIST-style dataset through ``write_mnist``.

    Twenty 4x4 ``uint8`` images (image ``i`` is filled with the value ``i``)
    and their labels ``0..19`` are dumped to MNIST-format files, converted
    with ``write_mnist``, read back via ``ParquetDataset._read_as_dict_rdd``
    and compared against the arrays that were originally written.

    Args:
        orca_context_fixture: pytest fixture requested for its setup; its
            value is not used directly here (presumably it provides the
            context the RDD operations below rely on -- TODO confirm).
    """
    temp_dir = tempfile.mkdtemp()

    try:
        train_image_file = os.path.join(temp_dir, "train-images")
        train_label_file = os.path.join(temp_dir, "train-labels")
        output_path = os.path.join(temp_dir, "output_dataset")

        # labels are 0..19; image i is a 4x4 block filled with the value i.
        labels = np.arange(20, dtype=np.uint8)
        images = np.repeat(labels, 16).reshape((20, 4, 4))

        _images_to_mnist_file(images, train_image_file)
        _labels_to_mnist_file(labels, train_label_file)

        write_mnist(image_file=train_image_file,
                    label_file=train_label_file,
                    output_path=output_path)
        data, schema = ParquetDataset._read_as_dict_rdd(output_path)
        # Sort by label so the loaded rows line up with the generated arrays.
        data = data.sortBy(lambda x: x['label']).collect()
        images_load = np.reshape(np.stack([d['image'] for d in data]),
                                 (-1, 4, 4))
        labels_load = np.stack([d['label'] for d in data])

        assert np.all(images_load == images)
        # Compare against the labels that were written; the previous check
        # compared ``labels_load`` with itself and could never fail.
        assert np.all(labels_load == labels)

    finally:
        shutil.rmtree(temp_dir)
Esempio n. 2
0
def test_write_parquet_simple(orca_context_fixture):
    """Write 100 generated records with ``ParquetDataset.write`` and check one.

    Each record holds an integer ``id``, a length-10 zero ``feature`` vector
    and a length-4 ones ``label`` vector. After reading the dataset back with
    ``ParquetDataset._read_as_dict_rdd``, the first collected record is
    expected to be the one with ``id == 0`` and the float32 arrays above.

    Args:
        orca_context_fixture: pytest fixture requested for its setup; its
            value is not used directly in this test.
    """
    tmp_root = tempfile.mkdtemp()
    target_url = "file://" + tmp_root

    def make_records(count):
        # Lazily produce ``count`` records with ids 0..count-1.
        for idx in range(count):
            yield {"id": idx, "feature": np.zeros((10,)), "label": np.ones((4,))}

    record_schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.INT32,
                          shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY,
                               dtype=DType.FLOAT32,
                               shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32,
                             shape=(4,)),
    }

    try:
        ParquetDataset.write(target_url, make_records(100), record_schema)
        rdd, _ = ParquetDataset._read_as_dict_rdd(target_url)
        first_record = rdd.collect()[0]

        expected_feature = np.zeros((10,), dtype=np.float32)
        expected_label = np.ones((4,), dtype=np.float32)

        assert first_record['id'] == 0
        assert np.all(first_record['feature'] == expected_feature)
        assert np.all(first_record['label'] == expected_label)
    finally:
        shutil.rmtree(tmp_root)
Esempio n. 3
0
def test_write_parquet_images(orca_context_fixture):
    """Write the ``cat_dog`` sample images to parquet and verify the bytes.

    Every file found under ``cats`` (label 1) and then ``dogs`` (label 0) in
    the ``cat_dog`` resource directory is emitted as a record whose ``image``
    and ``id`` are both the file path. After the round trip, the ``image``
    field of the first collected record must equal the raw bytes of the file
    named by its ``id``.

    Args:
        orca_context_fixture: pytest fixture requested for its setup; its
            value is not used directly in this test.
    """
    tmp_root = tempfile.mkdtemp()
    target_url = "file://" + tmp_root

    def labelled_image_paths():
        dataset_root = os.path.join(resource_path, "cat_dog")
        # Cats are emitted first with label 1, then dogs with label 0.
        for folder, label in (("cats", 1), ("dogs", 0)):
            class_dir = os.path.join(dataset_root, folder)
            for dirpath, _dirnames, filenames in os.walk(class_dir):
                for filename in filenames:
                    full_path = os.path.join(dirpath, filename)
                    yield {"image": full_path, "label": label, "id": full_path}

    record_schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32,
                             shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.STRING,
                          shape=()),
    }

    try:
        ParquetDataset.write(target_url, labelled_image_paths(), record_schema)
        rdd, _ = ParquetDataset._read_as_dict_rdd(target_url)
        first_record = rdd.collect()[0]

        # The record's id is the source path, so re-read that file to compare.
        with open(first_record['id'], "rb") as source_file:
            expected_bytes = source_file.read()

        assert expected_bytes == first_record['image']
    finally:
        shutil.rmtree(tmp_root)
Esempio n. 4
0
def test_write_voc(orca_context_fixture):
    """Convert the bundled ``VOCdevkit`` sample with ``write_voc`` and verify it.

    The 2007 ``trainval`` split is written to a temporary ``file://`` location,
    read back with ``ParquetDataset._read_as_dict_rdd``, and the ``image``
    bytes of the first collected record are compared with the contents of the
    file named by that record's ``image_id``.

    Args:
        orca_context_fixture: pytest fixture requested for its setup; its
            value is not used directly in this test.
    """
    tmp_root = tempfile.mkdtemp()
    try:
        # NOTE(review): SparkXShards is imported but never referenced in this
        # test -- kept as in the original; confirm whether it can be dropped.
        from zoo.orca.data import SparkXShards
        voc_root = os.path.join(resource_path, "VOCdevkit")
        target_url = "file://" + os.path.join(tmp_root, "output_dataset")

        write_voc(voc_root,
                  splits_names=[(2007, "trainval")],
                  output_path=target_url)

        rdd, _ = ParquetDataset._read_as_dict_rdd(target_url)
        first_record = rdd.collect()[0]

        # image_id holds the path of the source image on disk.
        with open(first_record["image_id"], "rb") as source_file:
            expected_bytes = source_file.read()
        assert expected_bytes == first_record['image']
    finally:
        shutil.rmtree(tmp_root)