Code example #1
def test_encode_non_scalar_type_is_passed(non_scalar_value):
    codec = ScalarCodec(FloatType())
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError, match='Expected a scalar'):
        codec.encode(field, non_scalar_value)
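The `non_scalar_value` fixture is not shown in this excerpt. A minimal sketch of what it presumably supplies (a hypothetical parametrized fixture; the exact values are an assumption):

import numpy as np
import pytest

# Hypothetical fixture: values that are not scalars and should be rejected by ScalarCodec.encode.
@pytest.fixture(params=[np.zeros((2, 2)), [1, 2, 3], {'a': 1}])
def non_scalar_value(request):
    return request.param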
Code example #2
def test_bad_encoded_data_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    with pytest.raises(TypeError):
        codec.decode(field, codec.encode(field, np.asarray([10, 10])))
Code example #3
def test_bad_unischema_field_shape():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(1, ),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='must be an empty tuple'):
        codec.encode(field, np.int32(1))
Code example #4
def test_as_spark_schema_unspecified_codec_type_unknown_scalar_type_raises():
    """We have a limited list of scalar types we can automatically map from numpy (+Decimal) types to spark types.
    Make sure that a ValueError is raised if an unknown type is used."""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_vector_unspecified_codec', object, ()),
    ])

    with pytest.raises(ValueError, match='Was not able to map type'):
        TestSchema.as_spark_schema()
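For contrast, a sketch of the happy path: petastorm can map common numpy scalar types automatically, so the same call is expected to succeed when a known dtype is used (assumption: np.int32 is on the automatic mapping list, as the implicit string field in code example #10 suggests):

KnownSchema = Unischema('KnownSchema', [
    UnischemaField('int_field_implicit', np.int32, ()),
])
KnownSchema.as_spark_schema()  # no ValueError expected: np.int32 should map to a Spark integer type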
Code example #5
def test_bad_shape():
    codec = CompressedImageCodec('png')
    field = UnischemaField(name='field_image',
                           numpy_dtype=np.uint8,
                           shape=(10, 20),
                           codec=codec,
                           nullable=False)
    with pytest.raises(ValueError, match='Unexpected dimensions'):
        codec.encode(field, np.zeros((100, 200), dtype=np.uint8))
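A sketch of the corresponding happy path, assuming PNG compression is lossless for uint8 data so a correctly shaped image round-trips exactly:

image = np.random.randint(0, 255, size=(10, 20), dtype=np.uint8)  # matches field.shape
np.testing.assert_equal(codec.decode(field, codec.encode(field, image)), image)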
Code example #6
def test_nested_value():
    codec = NoopCodec(ArrayType(ArrayType(StringType())))
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.string_,
                           shape=(None, None),
                           codec=codec,
                           nullable=False)
    nested_array = [['a', 'b'], ['c'], ['d']]
    assert codec.decode(field, codec.encode(field,
                                            nested_array)) == nested_array
Code example #7
def test_add_field_transform():
    one_added = transform_schema(
        TestSchema,
        TransformSpec(lambda x: x,
                      edit_fields=[
                          UnischemaField('double2', np.float64, (), None,
                                         False)
                      ]))
    assert set(
        one_added.fields.keys()) == {'string', 'double', 'double2', 'int'}
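The `TestSchema` fixture referenced above is not part of this excerpt. A hypothetical definition consistent with the asserted field names ('string', 'double', 'int' before the transform adds 'double2'):

import numpy as np
from petastorm.unischema import Unischema, UnischemaField

TestSchema = Unischema('TestSchema', [
    UnischemaField('int', np.int32, (), None, False),
    UnischemaField('double', np.float64, (), None, False),
    UnischemaField('string', np.unicode_, (), None, False),
])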
Code example #8
def test_encode_scalar_bool():
    codec = ScalarCodec(BooleanType())
    field = UnischemaField(name='field_bool', numpy_dtype=np.bool_, shape=(), codec=codec, nullable=False)

    encoded = codec.encode(field, np.bool_(True))
    assert isinstance(encoded, bool)
    assert encoded

    encoded = codec.encode(field, np.bool_(False))
    assert not encoded
Code example #9
    def test_get_petastorm_column(self):
        col_name = 'frame_id'
        col = DataFrameColumn(col_name, ColumnType.INTEGER, False)
        petastorm_col = UnischemaField(col_name, np.int32, (),
                                       ScalarCodec(IntegerType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.FLOAT, True)
        petastorm_col = UnischemaField(col_name, np.float64, (),
                                       ScalarCodec(FloatType()), True)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, ColumnType.TEXT, False)
        petastorm_col = UnischemaField(col_name, np.str_, (),
                                       ScalarCodec(StringType()), False)
        self.assertEqual(SchemaUtils.get_petastorm_column(col), petastorm_col)

        col = DataFrameColumn(col_name, None, True, [10, 10])
        self.assertEqual(SchemaUtils.get_petastorm_column(col), None)
Code example #10
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    assert spark_schema.fields[0].name == 'int_field'

    assert spark_schema.fields[1].name == 'string_field'
    assert spark_schema.fields[1].dataType == StringType()

    assert spark_schema.fields[2].name == 'string_field_implicit'
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Code example #11
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_,
                       (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32,
                       (), ScalarCodec(ShortType()), False)
    ])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        reader_factory(synthetic_dataset.url,
                       schema_fields=BogusSchema.fields.values(),
                       shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))

    assert 'bogus_key' in str(e.value)
Code example #12
File: test_codec_scalar.py Project: wxrui/petastorm
def test_numeric_types(spark_numpy_types):
    spark_type, numpy_type = spark_numpy_types

    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int', numpy_dtype=numpy_type, shape=(), codec=codec, nullable=False)

    min_val, max_val = np.iinfo(numpy_type).min, np.iinfo(numpy_type).max

    assert codec.decode(field, codec.encode(field, numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field, numpy_type(max_val))) == max_val
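The `spark_numpy_types` fixture is not shown here; since the test calls np.iinfo, it presumably pairs Spark integer types with matching numpy integer dtypes. A hypothetical sketch:

import numpy as np
import pytest
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType

# Hypothetical parametrized fixture of (spark_type, numpy_type) pairs.
@pytest.fixture(params=[(ByteType, np.int8), (ShortType, np.int16),
                        (IntegerType, np.int32), (LongType, np.int64)])
def spark_numpy_types(request):
    return request.param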
Code example #13
def test_encode_scalar_int():
    codec = ScalarCodec(IntegerType())
    field = UnischemaField(name='field_int',
                           numpy_dtype=np.int32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, np.int32(42))
    assert isinstance(encoded, int)
    assert 42 == encoded
Code example #14
File: test_codecs.py Project: meremeev/petastorm
    def test_bad_dtype(self):
        codec = CompressedImageCodec('png')
        field = UnischemaField(name='field_image',
                               numpy_dtype=np.uint8,
                               shape=(10, 20),
                               codec=codec,
                               nullable=False)
        with self.assertRaises(ValueError) as e:
            codec.encode(field, np.zeros((100, 200), dtype=np.uint16))
        self.assertTrue('Unexpected type' in str(e.exception))
Code example #15
def test_unicode():
    codec = ScalarCodec(StringType())
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)

    assert codec.decode(field, codec.encode(field, 'abc')) == 'abc'
    assert codec.decode(field, codec.encode(field, '')) == ''
Code example #16
def test_scalar_codec_decimal():
    codec = ScalarCodec(DecimalType(4, 3))
    field = UnischemaField(name='field_decimal',
                           numpy_dtype=Decimal,
                           shape=(),
                           codec=codec,
                           nullable=False)

    value = Decimal('123.4567')
    assert codec.decode(field, codec.encode(field, value)) == value
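Note that Decimal('123.4567') does not fit DecimalType(4, 3)'s declared precision, yet the round trip is expected to succeed; presumably ScalarCodec serializes Decimal values as strings rather than enforcing the Spark type's precision. A hedged sketch of that assumed behavior:

encoded = codec.encode(field, Decimal('123.4567'))
assert encoded == '123.4567'  # assumption: Decimals are stored in string form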
Code example #17
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
Code example #18
def test_decode_numpy_scalar_when_codec_is_none():
    """Decoding a row that has a field with the codec set to None. The type should be deduced automatically
    from UnischemaField's numpy_dtype attribute"""

    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, ())])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Code example #19
def test_transform_spec_support_return_tensor(scalar_dataset, reader_factory):
    field1 = UnischemaField(name='abc', shape=(2, 3), numpy_dtype=np.float32)

    with pytest.raises(ValueError, match='field abc must be numpy array type'):
        ArrowReaderWorker._check_shape_and_ravel('xyz', field1)

    with pytest.raises(ValueError, match='field abc must be the shape'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 5)), field1)

    with pytest.raises(ValueError, match='field abc error: only support row major multi-dimensional array'):
        ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field1)

    assert (6,) == ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3)), field1).shape

    for partial_shape in [(2, None), (None,), (None, None)]:
        field_with_unknown_dim = UnischemaField(name='abc', shape=partial_shape, numpy_dtype=np.float32)
        with pytest.raises(ValueError, match='All dimensions of a shape.*must be constant'):
            ArrowReaderWorker._check_shape_and_ravel(np.zeros((2, 3), order='F'), field_with_unknown_dim)

    def preproc_fn1(x):
        return pd.DataFrame({
            'tensor_col_1': x['id'].map(lambda _: np.random.rand(2, 3)),
            'tensor_col_2': x['id'].map(lambda _: np.random.rand(3, 4, 5)),
        })

    edit_fields = [
        ('tensor_col_1', np.float32, (2, 3), False),
        ('tensor_col_2', np.float32, (3, 4, 5), False),
    ]

    # This spec will remove all input columns and return one new column 'tensor_col_1' with shape (2, 3)
    spec1 = TransformSpec(
        preproc_fn1,
        edit_fields=edit_fields,
        removed_fields=list(scalar_dataset.data[0].keys())
    )

    with reader_factory(scalar_dataset.url, transform_spec=spec1) as reader:
        sample = next(reader)._asdict()
        assert len(sample) == 2
        assert (2, 3) == sample['tensor_col_1'].shape[1:] and \
               (3, 4, 5) == sample['tensor_col_2'].shape[1:]
Code example #20
def test_encode_scalar_float():
    codec = ScalarCodec(FloatType())
    expected = np.random.random(()).astype(np.float32)
    field = UnischemaField(name='field_float',
                           numpy_dtype=np.float32,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, float)
    assert expected == encoded
Code example #21
File: test_codecs.py Project: aabbcc23/petastorm
def test_compressed_ndarray_codec():
    SHAPE = (10, 20, 30)
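    # Note: np.random.rand yields floats in [0, 1), so astype(np.int32) produces an
    # all-zero array; the round trip below still exercises the codec, just not with
    # varied values.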
    expected = np.random.rand(*SHAPE).astype(dtype=np.int32)
    codec = CompressedNdarrayCodec()
    field = UnischemaField(name='test_name',
                           numpy_dtype=np.int32,
                           shape=SHAPE,
                           codec=CompressedNdarrayCodec(),
                           nullable=False)
    np.testing.assert_equal(codec.decode(field, codec.encode(field, expected)),
                            expected)
Code example #22
def test_match_unischema_fields_legacy_warning():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    # Check that no warnings are shown if the legacy and the new way of filtering produce the same results.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['uint8'])
    assert not unexpected_warnings

    # uint8 and uint16 would have been matched using the old method, but not the new one
    with pytest.warns(UserWarning, match=r'schema_fields behavior has changed.*uint16, uint8'):
        assert match_unischema_fields(TestSchema, ['uint']) == []

    # Now, all fields will be matched, but in different order (legacy vs current). Make sure we don't issue a warning.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['int', 'uint8', 'uint16', 'int32'])
    assert not unexpected_warnings
Code example #23
File: test_codecs.py Project: meremeev/petastorm
    def test_scalar_codec_unicode(self):
        codec = ScalarCodec(StringType())
        field = UnischemaField(name='field_string',
                               numpy_dtype=np.unicode_,
                               shape=(),
                               codec=codec,
                               nullable=False)

        self.assertEqual(codec.decode(field, codec.encode(field, 'abc')),
                         'abc')
        self.assertEqual(codec.decode(field, codec.encode(field, '')), '')
Code example #24
def test_encode_scalar_string():
    codec = ScalarCodec(StringType())
    expected = 'surprise'
    field = UnischemaField(name='field_string',
                           numpy_dtype=np.unicode_,
                           shape=(),
                           codec=codec,
                           nullable=False)
    encoded = codec.encode(field, expected)
    assert isinstance(encoded, str)
    assert expected == encoded
Code example #25
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""

    MatrixSchema = Unischema('TestSchema', [
        UnischemaField('scalar', np.float64,
                       (), ScalarCodec(DoubleType()), False)
    ])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Code example #26
File: test_schema.py Project: georgia-tech-db/eva
    def test_get_petastorm_column_ndarray(self):
        expected_type = [np.int8, np.uint8, np.int16, np.int32, np.int64,
                         np.unicode_, np.bool_, np.float32, np.float64,
                         Decimal, np.str_, np.datetime64]
        col_name = 'frame_id'
        for array_type, np_type in zip(NdArrayType, expected_type):
            col = DataFrameColumn(col_name, ColumnType.NDARRAY, True,
                                  array_type, [10, 10])
            petastorm_col = UnischemaField(col_name, np_type, [10, 10],
                                           NdarrayCodec(), True)
            self.assertEqual(SchemaUtils.get_petastorm_column(col),
                             petastorm_col)
Code example #27
    @classmethod
    def setUpClass(cls):
        cls._TestField1a = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1b = UnischemaField('random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField1c = UnischemaField('Random', np.string_, (), ScalarCodec(StringType()), False)
        cls._TestField2a = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2b = UnischemaField('id', np.int32, (), ScalarCodec(ShortType()), False)
        cls._TestField2c = UnischemaField('ID', np.int32, (), ScalarCodec(ShortType()), False)
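A hypothetical companion test showing what these fixtures presumably back: UnischemaField instances with identical definitions compare equal, and field names are case-sensitive.

    def test_fields_equality(self):
        self.assertEqual(self._TestField1a, self._TestField1b)
        self.assertNotEqual(self._TestField1a, self._TestField1c)  # 'random' vs 'Random'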
Code example #28
File: test_codecs.py Project: aabbcc23/petastorm
def _test_scalar_type(spark_type, numpy_type, bits):
    codec = ScalarCodec(spark_type())
    field = UnischemaField(name='field_int',
                           numpy_dtype=numpy_type,
                           shape=(),
                           codec=codec,
                           nullable=False)

    min_val, max_val = -2**(bits - 1), 2**(bits - 1) - 1
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(min_val))) == min_val
    assert codec.decode(field, codec.encode(field,
                                            numpy_type(max_val))) == max_val
Code example #29
def test_ndarray_codec(codec_factory):
    SHAPE = (10, 20, 3)
    for dtype in NUMERIC_DTYPES:
        expected = np.random.rand(*SHAPE).astype(dtype=dtype)
        codec = codec_factory()
        field = UnischemaField(name='test_name',
                               numpy_dtype=dtype,
                               shape=SHAPE,
                               codec=codec,
                               nullable=False)
        actual = codec.decode(field, codec.encode(field, expected))
        np.testing.assert_equal(actual, expected)
        assert expected.dtype == actual.dtype
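The `codec_factory` and `NUMERIC_DTYPES` names come from the surrounding test module and are not shown here. A hypothetical sketch of both:

import numpy as np
import pytest
from petastorm.codecs import NdarrayCodec, CompressedNdarrayCodec

NUMERIC_DTYPES = [np.uint8, np.int16, np.int32, np.int64, np.float32, np.float64]

# Hypothetical fixture: the test runs once per ndarray codec class.
@pytest.fixture(params=[NdarrayCodec, CompressedNdarrayCodec])
def codec_factory(request):
    return request.param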
Code example #30
def test_use_persisted_codec_and_not_provided_by_user(synthetic_dataset,
                                                      reader_factory):
    """In order to start using new codec for some field while maintain the ability to read old datasets that were
    written using an old codec, we need to make sure we are using stored UnischemaField.codec object (that contains
    an old codec/shape)."""
    new_unischema_instance = UnischemaField('matrix_uint16', np.uint16,
                                            (2, 3, 4),
                                            CompressedImageCodec('png'), False)

    with reader_factory(synthetic_dataset.url,
                        schema_fields=[new_unischema_instance]) as reader:
        row = next(reader)
    assert row.matrix_uint16.shape == (32, 16, 3)