Beispiel #1
0
    def test_jpeg(self):
        """Round-trip random images through the lossy jpeg codec.

        A monochrome (H, W) and a color (H, W, 3) uint8 image are each encoded and then decoded. Because jpeg
        is lossy, the decoded image is expected to be close to, but not identical to, the original.
        """
        for size in [(300, 200), (300, 200, 3)]:
            expected_image = np.random.randint(0, 255, size=size, dtype=np.uint8)
            codec = CompressedImageCodec('jpeg', quality=100)
            field = UnischemaField(name='field_image',
                                   numpy_dtype=np.uint8,
                                   shape=size,
                                   codec=codec,
                                   nullable=False)

            actual_image = codec.decode(field, codec.encode(field, expected_image))

            # Cast to float before subtracting so that the uint8 difference does not wrap around.
            # np.float64 is used because the np.float alias was removed from NumPy in 1.24.
            mean_abs_error = np.mean(np.abs(expected_image.astype(np.float64) - actual_image.astype(np.float64)))

            # Check for a non exact match between the images. The mean absolute error threshold (50) is
            # relatively high as compressing random images with jpeg results in a significant quality loss.
            self.assertLess(mean_abs_error, 50)
            self.assertTrue(np.any(expected_image != actual_image, axis=None))
def test_bad_shape():
    """Encoding an array whose dimensions differ from the field's declared shape raises ValueError."""
    png_codec = CompressedImageCodec('png')
    declared_field = UnischemaField(name='field_image', numpy_dtype=np.uint8, shape=(10, 20), codec=png_codec,
                                    nullable=False)
    # The field declares (10, 20) while the array is (100, 200).
    mismatched_image = np.zeros((100, 200), dtype=np.uint8)

    with pytest.raises(ValueError, match='Unexpected dimensions'):
        png_codec.encode(declared_field, mismatched_image)
Beispiel #3
0
 def test_bad_dtype(self):
     """Encoding a uint16 array into a field declared as uint8 raises ValueError mentioning 'Unexpected type'."""
     codec = CompressedImageCodec('png')
     field = UnischemaField(name='field_image',
                            numpy_dtype=np.uint8,
                            shape=(10, 20),
                            codec=codec,
                            nullable=False)
     with self.assertRaises(ValueError) as e:
         codec.encode(field, np.zeros((100, 200), dtype=np.uint16))
     # assertIn reports both the expected substring and the actual message when the check fails.
     self.assertIn('Unexpected type', str(e.exception))
def test_png():
    """Lossless encode/decode round trip using the png codec.

    Covers monochrome and color images for both uint8 and uint16 pixel types. The decoded image must equal
    the original and keep its dtype.
    """
    shapes = [(300, 200), (300, 200, 3)]
    pixel_types = [np.uint8, np.uint16]

    for size in shapes:
        for dtype in pixel_types:
            upper_bound = np.iinfo(dtype).max
            original = np.random.randint(0, upper_bound, size=size, dtype=dtype)

            png_codec = CompressedImageCodec('png')
            field = UnischemaField(name='field_image', numpy_dtype=dtype, shape=size, codec=png_codec,
                                   nullable=False)

            encoded = png_codec.encode(field, original)
            restored = png_codec.decode(field, encoded)

            np.testing.assert_array_equal(original, restored)
            assert original.dtype == restored.dtype
def test_jpeg_quality():
    """Compare mean abs error between different encoding quality settings. Higher quality value should result
    in a smaller error"""
    size = (300, 200, 3)
    expected_image = np.random.randint(0, 255, size=size, dtype=np.uint8)

    # Maps jpeg quality setting -> mean absolute error of the round-tripped image.
    errors = {}
    for quality in [10, 99]:
        codec = CompressedImageCodec('jpeg', quality=quality)
        field = UnischemaField(name='field_image', numpy_dtype=np.uint8, shape=size, codec=codec, nullable=False)
        actual_image = codec.decode(field, codec.encode(field, expected_image))
        # Cast to float before subtracting so that the uint8 difference does not wrap around.
        # np.float64 is used because the np.float alias was removed from NumPy in 1.24.
        errors[quality] = np.mean(np.abs(expected_image.astype(np.float64) - actual_image.astype(np.float64)))

    assert errors[10] > errors[99]
def test_cross_coding():
    """Encode with PIL, decode with the codec, and expect an identical image.

    Guards against a channel ordering error that previously occurred when the encoder and the decoder were
    different libraries.
    """
    shapes = [(300, 200), (300, 200, 3)]
    for size in shapes:
        dtype = np.uint8
        upper_bound = np.iinfo(dtype).max
        original = np.random.randint(0, upper_bound, size=size, dtype=np.uint8)

        png_codec = CompressedImageCodec('png')
        field = UnischemaField(name='field_image', numpy_dtype=dtype, shape=size, codec=png_codec,
                               nullable=False)

        # Produce the png bytes with PIL rather than with the codec under test.
        pil_image = Image.fromarray(original)
        png_buffer = io.BytesIO()
        pil_image.save(png_buffer, format='PNG')

        restored = png_codec.decode(field, png_buffer.getvalue())

        np.testing.assert_array_equal(original, restored)
        assert original.dtype == restored.dtype
def test_use_persisted_codec_and_not_provided_by_user(synthetic_dataset, reader_factory):
    """The reader must use the UnischemaField codec/shape stored with the dataset, not the one the user passes.

    This is what allows switching a field to a new codec while still being able to read old datasets that
    were written with the old one. The field passed to the reader here declares a different shape and codec,
    yet the row read back is expected to have the (32, 16, 3) shape.
    """
    user_supplied_field = UnischemaField(name='matrix_uint16',
                                         numpy_dtype=np.uint16,
                                         shape=(2, 3, 4),
                                         codec=CompressedImageCodec('png'),
                                         nullable=False)

    with reader_factory(synthetic_dataset.url, schema_fields=[user_supplied_field]) as reader:
        first_row = next(reader)

    assert first_row.matrix_uint16.shape == (32, 16, 3)
def test_invalid_image_size():
    """Codec can encode only (H, W) and (H, W, 3) images; any other array shape raises ValueError."""
    png_codec = CompressedImageCodec('png')
    field = UnischemaField(name='field_image', numpy_dtype=np.uint8, shape=(10, 10, 3), codec=png_codec,
                           nullable=False)

    # A 1-D array, a 2-channel image and a 4-D array, in that order.
    unsupported_shapes = [(10,), (10, 10, 2), (10, 10, 10, 10)]
    for bad_shape in unsupported_shapes:
        with pytest.raises(ValueError):
            png_codec.encode(field, np.zeros(bad_shape, dtype=np.uint8))
Beispiel #9
0
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

_DEFAULT_IMAGE_SIZE = (32, 16, 3)

TestSchema = Unischema('TestSchema', [
    UnischemaField('partition_key', np.unicode_,
                   (), ScalarCodec(StringType()), False),
    UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
    UnischemaField('id2', np.int32, (), ScalarCodec(ShortType()), False),
    UnischemaField('id_float', np.float64,
                   (), ScalarCodec(DoubleType()), False),
    UnischemaField('id_odd', np.bool_, (), ScalarCodec(BooleanType()), False),
    UnischemaField('python_primitive_uint8', np.uint8,
                   (), ScalarCodec(ShortType()), False),
    UnischemaField('image_png', np.uint8, _DEFAULT_IMAGE_SIZE,
                   CompressedImageCodec('png'), False),
    UnischemaField('matrix', np.float32, _DEFAULT_IMAGE_SIZE, NdarrayCodec(),
                   False),
    UnischemaField('decimal', Decimal,
                   (), ScalarCodec(DecimalType(10, 9)), False),
    UnischemaField('matrix_uint16', np.uint16, _DEFAULT_IMAGE_SIZE,
                   NdarrayCodec(), False),
    UnischemaField('matrix_string', np.string_, (
        None,
        None,
    ), NdarrayCodec(), False),
    UnischemaField('empty_matrix_string', np.string_,
                   (None, ), NdarrayCodec(), False),
    UnischemaField('matrix_nullable', np.uint16, _DEFAULT_IMAGE_SIZE,
                   NdarrayCodec(), True),
    UnischemaField('sensor_name', np.unicode_, (1, ), NdarrayCodec(), False),
Beispiel #10
0
    ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.etl.rowgroup_indexers import SingleFieldIndexer
from petastorm.etl.rowgroup_indexing import build_rowgroup_index
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row

# Shape shared by the image and fixed-size matrix fields of the test schema.
_DEFAULT_IMAGE_SIZE = (32, 16, 3)

# Schema of the synthetic test dataset. Each entry is
# UnischemaField(name, numpy_dtype, shape, codec, nullable).
#
# np.str_ / np.bytes_ are used instead of the np.unicode_ / np.string_ aliases, which were removed in NumPy 2.0.
# On Python 3 with NumPy 1.x the old aliases refer to the very same types.
TestSchema = Unischema('TestSchema', [
    UnischemaField('partition_key', np.str_, (), ScalarCodec(StringType()), False),
    UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
    # NOTE(review): numpy dtype is int32 while the Spark type is ShortType - confirm the mismatch is intentional.
    UnischemaField('id2', np.int32, (), ScalarCodec(ShortType()), False),
    UnischemaField('id_float', np.float64, (), ScalarCodec(DoubleType()), False),
    UnischemaField('id_odd', np.bool_, (), ScalarCodec(BooleanType()), False),
    UnischemaField('python_primitive_uint8', np.uint8, (), ScalarCodec(ShortType()), False),
    UnischemaField('image_png', np.uint8, _DEFAULT_IMAGE_SIZE, CompressedImageCodec('png'), False),
    UnischemaField('matrix', np.float32, _DEFAULT_IMAGE_SIZE, NdarrayCodec(), False),
    UnischemaField('decimal', Decimal, (), ScalarCodec(DecimalType(10, 9)), False),
    UnischemaField('matrix_uint16', np.uint16, _DEFAULT_IMAGE_SIZE, NdarrayCodec(), False),
    UnischemaField('matrix_string', np.bytes_, (None, None,), NdarrayCodec(), False),
    UnischemaField('empty_matrix_string', np.bytes_, (None,), NdarrayCodec(), False),
    UnischemaField('matrix_nullable', np.uint16, _DEFAULT_IMAGE_SIZE, NdarrayCodec(), True),
    UnischemaField('sensor_name', np.str_, (1,), NdarrayCodec(), False),
    UnischemaField('string_array_nullable', np.str_, (None,), NdarrayCodec(), True),
])


def _random_binary_string_gen(max_length):
    """Returns a single random string up to max_length specified length that may include \x00 character anywhere in the
    string"""
    size = random.randint(0, max_length)
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField

from pycarbon.core.carbon_dataset_metadata import materialize_dataset_carbon

from pycarbon.tests import DEFAULT_CARBONSDK_PATH

# Describes the fields of the hello-world dataset: an integer id, a fixed-size
# png-compressed image, and a 4-d uint8 array whose first and last dimensions are left unspecified (None).
HelloWorldSchema = Unischema('HelloWorldSchema', [
    UnischemaField(name='id',
                   numpy_dtype=np.int_,
                   shape=(),
                   codec=ScalarCodec(IntegerType()),
                   nullable=False),
    UnischemaField(name='image1',
                   numpy_dtype=np.uint8,
                   shape=(128, 256, 3),
                   codec=CompressedImageCodec('png'),
                   nullable=False),
    UnischemaField(name='array_4d',
                   numpy_dtype=np.uint8,
                   shape=(None, 128, 30, None),
                   codec=NdarrayCodec(),
                   nullable=False),
])


def row_generator(x):
    """Returns a single entry in the generated dataset. Return a bunch of random values as an example."""
    return {
        'id': x,
        'image1': np.random.randint(0, 255, dtype=np.uint8,
                                    size=(128, 256, 3)),
        'array_4d': np.random.randint(0,
                                      255,
                                      dtype=np.uint8,
                                      size=(4, 128, 30, 3))
#  Copyright (c) 2017-2018 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from pyspark.sql.types import StringType

from petastorm.codecs import ScalarCodec, CompressedImageCodec
from petastorm.unischema import Unischema, UnischemaField

# Schema of the imagenet dataset: two scalar string fields ('noun_id', 'text') and a png-compressed
# 3-channel image whose height and width are left unspecified (None).
#
# np.bytes_ is used instead of the np.string_ alias, which was removed in NumPy 2.0. On Python 3 with
# NumPy 1.x both names refer to the very same type.
ImagenetSchema = Unischema('ImagenetSchema', [
    UnischemaField('noun_id', np.bytes_, (), ScalarCodec(StringType()), False),
    UnischemaField('text', np.bytes_, (), ScalarCodec(StringType()), False),
    UnischemaField('image', np.uint8, (None, None, 3), CompressedImageCodec('png'), False),
])
def test_str_special_method():
    """str() of a CompressedImageCodec shows the codec format and quality it was constructed with."""
    rendered = str(CompressedImageCodec('png', 80))
    assert rendered == "CompressedImageCodec('png', 80)"