Example 1
def write_voc(voc_root_path, splits_names, output_path, **kwargs):
    """Write a Pascal VOC detection dataset to the parquet format."""
    custom_classes = kwargs.get("classes", None)
    voc_datasets = VOCDatasets(
        voc_root_path, splits_names, classes=custom_classes)

    def make_generator():
        for img_path, label in voc_datasets:
            yield {"image": img_path, "label": label, "image_id": img_path}

    # Peek at one sample to infer the label dtype and shape; the number
    # of boxes varies per image, hence the -1 leading dimension.
    _, label = voc_datasets[0]
    label_shape = (-1, label.shape[-1])
    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32,
                             shape=()),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(label.dtype),
                             shape=label_shape),
        "image_id": SchemaField(feature_type=FeatureType.SCALAR,
                                dtype=DType.STRING,
                                shape=())
    }
    # "classes" is consumed above; drop it before forwarding to write().
    kwargs = {key: value for key, value in kwargs.items()
              if key not in ["classes"]}
    ParquetDataset.write(output_path, make_generator(), schema, **kwargs)
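
For context, a call might look like the sketch below. The VOC root path, the (year, split) tuple format for splits_names, and the class list are all assumptions about the local setup, not something the function above prescribes.

# Hypothetical local VOC copy; splits_names format assumed to be (year, split).
write_voc("/data/VOCdevkit",
          splits_names=[(2007, "trainval")],
          output_path="file:///tmp/voc_parquet",
          classes=["cat", "dog"])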
Example 2
def write_from_directory(directory, label_map, output_path, shuffle=True, **kwargs):
    """Write an image-folder dataset (one sub-directory per class) to parquet."""
    labels = os.listdir(directory)
    valid_labels = [label for label in labels if label in label_map]
    # Collect the samples eagerly (rather than in a generator) so the
    # whole list can be shuffled before writing.
    samples = []
    for label in valid_labels:
        label_path = os.path.join(directory, label)
        images = os.listdir(label_path)
        for image in images:
            image_path = os.path.join(label_path, image)
            samples.append({"image": image_path,
                            "label": label_map[label],
                            "image_id": image_path,
                            "label_str": label})
    if shuffle:
        random.shuffle(samples)

    schema = {"image": SchemaField(feature_type=FeatureType.IMAGE,
                                   dtype=DType.FLOAT32,
                                   shape=()),
              "label": SchemaField(feature_type=FeatureType.SCALAR,
                                   dtype=DType.INT32,
                                   shape=()),
              "image_id": SchemaField(feature_type=FeatureType.SCALAR,
                                      dtype=DType.STRING,
                                      shape=()),
              "label_str": SchemaField(feature_type=FeatureType.SCALAR,
                                       dtype=DType.STRING,
                                       shape=())}

    ParquetDataset.write(output_path, samples, schema, **kwargs)
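
A sketch of a call, assuming a hypothetical layout with one sub-folder per class (e.g. /data/pets/cats/*.jpg and /data/pets/dogs/*.jpg):

# Folder names are mapped to integer labels via label_map.
write_from_directory("/data/pets",
                     label_map={"cats": 0, "dogs": 1},
                     output_path="file:///tmp/pets_parquet",
                     shuffle=True)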
Example 3
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}

        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    # The generator yields a scalar integer label and an image path, so the
    # schema matches that: shape=() throughout (cf. images_schema below).
    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE, dtype=DType.FLOAT32, shape=()),
        "label": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.INT32, shape=()),
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.STRING, shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        # The "id" column stores the source path; re-read the file and
        # check that the image column round-tripped the raw bytes.
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        assert image_bytes == data['image']

    finally:
        shutil.rmtree(temp_dir)
Example 4
def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.INT32, shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))

    finally:
        shutil.rmtree(temp_dir)
Example 5
def _write_ndarrays(images, labels, output_path, **kwargs):
    """Write parallel image/label ndarrays (sample dimension first) to parquet."""
    # Per-sample shapes: drop the leading sample dimension.
    images_shape = [int(x) for x in images.shape[1:]]
    labels_shape = [int(x) for x in labels.shape[1:]]
    schema = {
        "image": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(images.dtype),
                             shape=images_shape),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=ndarray_dtype_to_dtype(labels.dtype),
                             shape=labels_shape)
    }

    def make_generator():
        for i in range(images.shape[0]):
            yield {"image": images[i], "label": labels[i]}

    ParquetDataset.write(output_path, make_generator(), schema, **kwargs)
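
For illustration, a self-contained call with random arrays; the shapes are arbitrary:

import numpy as np

images = np.random.rand(100, 28, 28, 1).astype(np.float32)  # 100 samples
labels = np.arange(100, dtype=np.int32)  # per-sample label shape is ()
_write_ndarrays(images, labels, "file:///tmp/ndarray_parquet")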
Example 6
def images_generator():
    dataset_path = os.path.join(resource_path, "cat_dog")
    for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
        for name in files:
            image_path = os.path.join(root, name)
            yield {"image": image_path, "label": 1, "id": image_path}

    for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
        for name in files:
            image_path = os.path.join(root, name)
            yield {"image": image_path, "label": 0, "id": image_path}


images_schema = {
    "image": SchemaField(feature_type=FeatureType.IMAGE,
                         dtype=DType.FLOAT32, shape=()),
    "label": SchemaField(feature_type=FeatureType.SCALAR,
                         dtype=DType.FLOAT32, shape=()),
    "id": SchemaField(feature_type=FeatureType.SCALAR,
                      dtype=DType.STRING, shape=())
}


def parse_data_train(image, label):
    # `image` arrives as raw JPEG bytes; decode, resize and reshape it
    # into a fixed-size float tensor.
    image = tf.io.decode_jpeg(image, channels=NUM_CHANNELS)
    image = tf.image.resize(image, size=(WIDTH, HEIGHT))
    image = tf.reshape(image, [WIDTH, HEIGHT, NUM_CHANNELS])
    return image, label
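
To show where parse_data_train fits, here is a minimal, self-contained tf.data sketch. The constants and the in-memory JPEG are assumptions made so the snippet runs on its own; in practice the bytes would come from the parquet dataset written above.

import tensorflow as tf

WIDTH, HEIGHT, NUM_CHANNELS = 224, 224, 3  # assumed constants

# Synthesize one JPEG in memory so the sketch is self-contained.
fake = tf.cast(tf.random.uniform((64, 64, NUM_CHANNELS), maxval=255), tf.uint8)
jpeg_bytes = tf.io.encode_jpeg(fake)

raw_ds = tf.data.Dataset.from_tensor_slices(([jpeg_bytes], [1.0]))
train_ds = (raw_ds
            .map(parse_data_train, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(32)
            .prefetch(tf.data.AUTOTUNE))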