Ejemplo n.º 1
0
 def test_from_bytes(self):
     """Decode one sample buffer per primitive type via ``Conversions.from_byte_buffer``."""
     decode = Conversions.from_byte_buffer

     # Byte strings that are reused by more than one assertion below.
     int_1234_buffer = b'\xd2\x04\x00\x00'
     long_1e11_buffer = b'\x00\xe8vH\x17\x00\x00\x00'
     four_places = Decimal(".0001")

     # boolean
     self.assertEqual(False, decode(BooleanType.get(), b'\x00'))
     self.assertEqual(True, decode(BooleanType.get(), b'\x01'))

     # int and long
     self.assertEqual(1234, decode(IntegerType.get(), int_1234_buffer))
     self.assertEqual(
         1234, decode(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))

     # float and double are compared approximately
     self.assertAlmostEqual(
         1.2345, decode(FloatType.get(), b'\x19\x04\x9e?'), places=5)
     self.assertAlmostEqual(
         1.2345,
         decode(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))

     # date, time and both timestamp flavours are expected to decode to plain integers
     self.assertEqual(1234, decode(DateType.get(), int_1234_buffer))
     self.assertEqual(100000000000,
                      decode(TimeType.get(), long_1e11_buffer))
     self.assertEqual(100000000000,
                      decode(TimestampType.with_timezone(), long_1e11_buffer))
     self.assertEqual(100000000000,
                      decode(TimestampType.without_timezone(),
                             long_1e11_buffer))

     # string, uuid, fixed and binary
     self.assertEqual("foo", decode(StringType.get(), b'foo'))
     self.assertEqual(
         uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
         decode(UUIDType.get(),
                b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
     self.assertEqual(b'foo', decode(FixedType.of_length(3), b'foo'))
     self.assertEqual(b'foo', decode(BinaryType.get(), b'foo'))

     # decimals, including a negative value
     self.assertEqual(
         Decimal(123.45).quantize(Decimal(".01")),
         decode(DecimalType.of(5, 2), b'\x30\x39'))
     self.assertEqual(
         Decimal(123.4567).quantize(four_places),
         decode(DecimalType.of(5, 4), b'\x00\x12\xd6\x87'))
     self.assertEqual(
         Decimal(-123.4567).quantize(four_places),
         decode(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))
def test_primitive_types(primitive_type_test_parquet_file):
    """Compare the schema converted from the parquet fixture with the expected one."""
    wanted_columns = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ]
    wanted_schema = Schema(wanted_columns)

    converted_schema = convert_parquet_to_iceberg(
        primitive_type_test_parquet_file)
    compare_schema(wanted_schema, converted_schema)
Ejemplo n.º 3
0
def supported_primitives():
    """Return a struct type with one field per supported primitive type.

    Every field carries a distinct field id (id 109 is not used here).

    :return: the value of ``StructType.of`` applied to the list of fields
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        # The three decimal fields need their own ids; sharing a single id
        # (114) would make them indistinguishable in any id-based lookup.
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
Ejemplo n.º 4
0
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    """Read the primitive-type fixture with ParquetReader and compare it to the source table."""
    projected_columns = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ]
    expected_schema = Schema(projected_columns)

    # Open the fixture through the file-system abstraction used by the reader.
    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    row_filter = Expressions.always_true()
    reader = ParquetReader(parquet_input, expected_schema, {}, row_filter,
                           True)

    # The table the fixture is expected to contain.
    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    table_read = reader.read()
    assert table_read == source_table
Ejemplo n.º 5
0
class AvroToIceberg(object):
    """Translate Avro JSON schemas and Avro records into Iceberg schemas and rows.

    The ``convert_*`` helpers take an Avro field (a dict taken from the parsed
    schema JSON) together with a ``next_id`` value that is threaded through the
    recursion and handed back to the caller next to the converted type.  The
    ``get_field_from_*`` / ``read_avro_*`` helpers pull values out of decoded
    Avro rows according to an Iceberg schema.
    """

    # Keys looked up in Avro schema dictionaries.
    FIELD_ID_PROP = "field-id"
    FIELD_TYPE_PROP = "type"
    FIELD_NAME_PROP = "name"
    FIELD_LOGICAL_TYPE_PROP = "logicalType"
    FIELD_FIELDS_PROP = "fields"
    FIELD_ITEMS_PROP = "items"
    FIELD_ELEMENT_ID_PROP = "element-id"

    # Type names accepted by convert_str_type.
    AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
    AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")

    # Dispatch on the Python type of a field's "type" entry.  The lambdas defer
    # the lookup of ``AvroToIceberg`` to call time because the class name is
    # not bound yet while the class body is being executed.
    TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
                           dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
                           list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}

    # Dispatch on a complex type name, or on a logical type name when present.
    COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
                                   "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
                                   "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}

    # Avro primitive / logical type name -> Iceberg primitive type.
    PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
                                "bytes": BinaryType.get(),
                                "date": DateType.get(),
                                "double": DoubleType.get(),
                                "float": FloatType.get(),
                                "int": IntegerType.get(),
                                "long": LongType.get(),
                                "string": StringType.get(),
                                "time-millis": TimeType.get(),
                                "timestamp-millis": TimestampType.without_timezone()}

    # Row readers for nested Iceberg types; every other type id falls back to
    # get_field_from_primitive (see get_field_from_avro).
    PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
                     TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
                     TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}

    @staticmethod
    def convert_avro_schema_to_iceberg(avro_schema):
        """Convert a top-level Avro record schema into an Iceberg ``Schema``.

        :param avro_schema: dict whose "type" entry must be "record"
        :return: ``Schema`` built from the fields of the converted struct
        :raises RuntimeError: if the top-level type is not "record"
        """
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        # convert_type returns (type, optional, next_id); only the type is needed.
        struct = AvroToIceberg.convert_type(avro_schema, None)

        return Schema(struct[0].fields)

    @staticmethod
    def convert_record_type(avro_field, next_id=None):
        """Convert an Avro record into a struct type.

        :param avro_field: dict with "type" == "record" and a "fields" list
        :param next_id: value threaded through the conversion; defaults to the
            number of fields of this record when ``None``
        :return: tuple ``(StructType, next_id)``
        :raises RuntimeError: if the field type is not "record"
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)

        if avro_field_type != "record":
            raise RuntimeError("Field type muse be 'record': %s" % avro_field_type)

        fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)

        iceberg_fields = []
        if next_id is None:
            next_id = len(fields)
        for field in fields:
            iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
            iceberg_fields.append(iceberg_field)

        return StructType.of(iceberg_fields), next_id

    @staticmethod
    def convert_avro_field_to_iceberg(field, next_id):
        """Convert one Avro field into an Iceberg field or bare type.

        :param field: Avro field dict
        :param next_id: value threaded through the conversion
        :return: tuple ``(result, next_id)`` where ``result`` is the bare
            converted type when the field has no "field-id", otherwise a
            ``NestedField`` that is optional when the Avro type is a two-item
            union containing "null" and required otherwise
        """
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        field_id = field.get(AvroToIceberg.FIELD_ID_PROP)
        if field_id is None:
            return field_type, next_id

        field_name = field.get(AvroToIceberg.FIELD_NAME_PROP)
        if is_optional:
            return NestedField.optional(field_id, field_name, field_type), next_id
        else:
            return NestedField.required(field_id, field_name, field_type), next_id

    @staticmethod
    def convert_type(field, next_id=None):
        """Convert the "type" entry of an Avro field.

        The converter is chosen by the Python type of the "type" entry: a str
        names a type, a dict is a nested definition and a list is a union.

        :param field: Avro field dict
        :param next_id: value threaded through the conversion
        :return: tuple ``(iceberg_type, optional, next_id)``
        :raises RuntimeError: if the "type" entry is not a str, dict or list
        """
        avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)

        # Optionality must be read before the union is resolved to its
        # non-null branch.
        optional = AvroToIceberg.is_option_schema(avro_field_type)

        processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
        if processing_func is None:
            raise RuntimeError("No function found to process %s" % avro_field_type)

        iceberg_type, next_id = processing_func(field, next_id)

        return iceberg_type, optional, next_id

    @staticmethod
    def convert_str_type(avro_field, next_id=None):
        """Convert a field whose "type" entry is a type name.

        A "logicalType" entry, when present, takes precedence over the type
        name for the lookup.  For primitive types the returned type is
        ``None`` when the looked-up name has no entry in
        ``PRIMITIVE_FIELD_TYPE_MAP``.

        :param avro_field: Avro field dict
        :param next_id: value threaded through the conversion
        :return: tuple ``(iceberg_type, next_id)``
        :raises RuntimeError: if the type is not a str, is not a known type
            name, or is a complex type without a registered converter
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if not isinstance(avro_field_type, str):
            raise RuntimeError("Field type must be of type str: %s" % avro_field_type)

        lookup_name = avro_field_type if logical_type is None else logical_type

        if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
            return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(lookup_name), next_id

        elif avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
            processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(lookup_name)

            if processing_func is None:
                raise RuntimeError("No function found to process %s" % avro_field_type)

            return processing_func(avro_field, next_id)
        else:
            raise RuntimeError("Unknown type %s" % avro_field_type)

    @staticmethod
    def convert_complex_type(avro_field, next_id=None):
        """Convert a field whose "type" entry is itself a schema dict.

        :param avro_field: Avro field dict
        :param next_id: value threaded through the conversion
        :return: result of ``convert_avro_field_to_iceberg`` on the nested dict
        :raises RuntimeError: if the "type" entry is not a dict
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, dict):
            raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)

        return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)

    @staticmethod
    def convert_union_type(avro_field, next_id=None):
        """Convert a union of at most two branches by resolving its non-null branch.

        The caller's ``avro_field`` is left untouched so that the same schema
        dict can be converted more than once with identical results.

        :param avro_field: Avro field dict whose "type" entry is a list
        :param next_id: value threaded through the conversion
        :return: tuple ``(iceberg_type, next_id)``
        :raises RuntimeError: if the "type" entry is not a list, has more than
            two items, or has no branch other than "null"
        """
        union_branches = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(union_branches, list):
            raise RuntimeError("Union field type must be of type list: %s" % union_branches)

        if len(union_branches) > 2:
            raise RuntimeError("Cannot process unions larger than 2 items: %s" % union_branches)

        non_null_branches = [item for item in union_branches
                             if not (isinstance(item, str) and item == "null")]
        if not non_null_branches:
            # Without this guard the field would be re-dispatched to this very
            # method until the interpreter's recursion limit is hit.
            raise RuntimeError("Union must contain a non-null type: %s" % union_branches)

        # Work on a shallow copy: overwriting the caller's "type" entry would
        # erase the union, and with it the optionality, from the source schema.
        resolved_field = dict(avro_field)
        # When both branches are non-null the last one wins.
        resolved_field[AvroToIceberg.FIELD_TYPE_PROP] = non_null_branches[-1]

        iceberg_type, _, next_id = AvroToIceberg.convert_type(resolved_field, next_id)
        return iceberg_type, next_id

    @staticmethod
    def convert_array_type(avro_field, next_id=None):
        """Convert an Avro array of primitives into a list type.

        :param avro_field: dict with "type" == "array", an "items" type name
            and an "element-id"
        :param next_id: value threaded through the conversion
        :return: tuple ``(ListType, next_id)``
        :raises RuntimeError: if the type is not "array" or "items" is not a
            name found in ``PRIMITIVE_FIELD_TYPE_MAP``
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)
        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)

        # NOTE(review): only a list can be an option schema, and lists are
        # rejected just below, so this is always False on the success path.
        is_optional = AvroToIceberg.is_option_schema(items)

        if not (isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP):
            raise RuntimeError("Complex list types not yet implemented")
        item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP[items]

        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        else:
            return ListType.of_required(element_id, item_type), next_id

    @staticmethod
    def convert_map_type(avro_field, next_id=None):
        """Convert an Avro array carrying the "map" logical type into a map type.

        The "items" entry must list a field named "key" and a field named
        "value", each with a type name as its "type" entry.

        :param avro_field: dict with "type" == "array" and "logicalType" == "map"
        :param next_id: value threaded through the conversion
        :return: tuple ``(MapType, next_id)``; the map is always built with
            ``MapType.of_required``
        :raises RuntimeError: if the type or logical type does not match, a
            key/value type is not a type name, or the key or value field is
            missing
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if avro_field_type != "array" or avro_logical_type != "map":
            raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)

        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        key_field = None
        value_field = None
        for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, []):
            field_name = field.get(AvroToIceberg.FIELD_NAME_PROP)
            if field_name == "key":
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map keys not yet implemented")
                key_field = field
            elif field_name == "value":
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map values not yet imeplemented")
                value_field = field

        # Report a malformed map explicitly instead of failing on an unbound
        # local variable further down.
        if key_field is None or value_field is None:
            raise RuntimeError("Map items must define both 'key' and 'value' fields: %s" % items)

        key_id = key_field.get(AvroToIceberg.FIELD_ID_PROP)
        key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(key_field.get(AvroToIceberg.FIELD_TYPE_PROP))
        value_id = value_field.get(AvroToIceberg.FIELD_ID_PROP)
        value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(value_field.get(AvroToIceberg.FIELD_TYPE_PROP))

        return MapType.of_required(key_id, value_id, key_type, value_type), next_id

    @staticmethod
    def is_option_schema(field_type):
        """Return True when ``field_type`` is a two-item list containing "null"."""
        return isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type

    @staticmethod
    def read_avro_file(iceberg_schema, data_file):
        """Yield one dict per Avro row of ``data_file``, keyed by Iceberg field name.

        The file object obtained from ``data_file.new_fo()`` is closed when the
        generator finishes, fails, or is closed before being exhausted.

        :param iceberg_schema: schema whose top-level struct fields are read
        :param data_file: object providing ``new_fo()``
        """
        fo = data_file.new_fo()
        try:
            avro_reader = fastavro.reader(fo)
            for iceberg_row in AvroToIceberg.read_avro_row(iceberg_schema, avro_reader):
                yield iceberg_row
        finally:
            fo.close()

    @staticmethod
    def read_avro_row(iceberg_schema, avro_reader):
        """Yield one dict per row of ``avro_reader``, keyed by Iceberg field name.

        :param iceberg_schema: schema whose top-level struct fields are read
        :param avro_reader: iterable of decoded Avro rows
        """
        try:
            for avro_row in avro_reader:
                iceberg_row = {}
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        except StopIteration:
            return

    @staticmethod
    def get_field_from_avro(avro_row, field):
        """Read ``field`` from ``avro_row`` with the reader matching its type id.

        Type ids without an entry in ``PROCESS_FUNCS`` are read with
        ``get_field_from_primitive``.

        :raises RuntimeError: if the selected reader raises ``KeyError``
        """
        # dict.get with a default never raises KeyError, so the handler below
        # only ever sees a KeyError that escaped from the selected reader.
        reader = AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
                                                 AvroToIceberg.get_field_from_primitive)
        try:
            return reader(avro_row, field)
        except KeyError:
            raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)

    @staticmethod
    def get_field_from_primitive(avro_row, field):
        """Return ``avro_row[field.name]``, or ``None`` for a missing optional field.

        :raises RuntimeError: if the field is required but absent from the row
        """
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            return None

    @staticmethod
    def get_field_from_struct(avro_row, field):
        """Return a dict with every nested field of ``field`` read from ``avro_row[field.name]``."""
        return {nested_field.name: AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
                for nested_field in field.type.fields}

    @staticmethod
    def get_field_from_list(avro_row, field):
        """Return the list stored under ``field.name``, or ``None`` for a missing optional field.

        :raises RuntimeError: if the field is required but absent from the row
        """
        # Lists are returned as stored, exactly like primitive values.
        return AvroToIceberg.get_field_from_primitive(avro_row, field)

    @staticmethod
    def get_field_from_map(avro_row, field):
        """Rebuild a dict from the key/value records stored under ``field.name``.

        :return: dict mapping each entry's 'key' to its 'value', or ``None``
            for a missing optional field
        :raises RuntimeError: if the field is required but absent from the row
        """
        try:
            avro_value = avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            return None

        return {val['key']: val['value'] for val in avro_value}
Ejemplo n.º 6
0
import os
import random
import tempfile
import time

from iceberg.api import Files, PartitionSpec, Schema
from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType
from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties,
                          GenericManifestFile, SnapshotLogEntry, TableMetadata,
                          TableMetadataParser, TableOperations,
                          TableProperties)
from iceberg.exceptions import AlreadyExistsException, CommitFailedException
import pytest

# Schema with a single optional boolean field "b" (field id 1).
SCHEMA = Schema([
    NestedField.optional(1, "b", BooleanType.get()),
])
# Module-level dictionaries, empty at import time.
METADATA = {}
VERSIONS = {}


class LocalTableOperations(TableOperations):
    def current(self):
        """Not supported by this test implementation; always raises RuntimeError."""
        raise RuntimeError("Not implemented for tests")

    def refresh(self):
        """Not supported by this test implementation; always raises RuntimeError."""
        raise RuntimeError("Not implemented for tests")

    def commit(self, base, metadata):
        """Not supported by this test implementation; always raises RuntimeError.

        Both arguments are ignored.
        """
        raise RuntimeError("Not implemented for tests")

    def new_input_file(self, path):
Ejemplo n.º 7
0
    def test_byte_buffer_conversions(self):
        """Check the byte-buffer encoding of each primitive type in both directions.

        Every section calls ``self.assertConversion`` with a value, its type
        and the expected bytes, and then asserts that
        ``Literal.of(value)...to_byte_buffer()`` yields those same bytes.

        In the comments below a number in parentheses after a byte value is
        that same byte read as a signed 8-bit integer (e.g. 234 -> -22).
        """
        # booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
        self.assertConversion(False, BooleanType.get(), b'\x00')
        self.assertConversion(True, BooleanType.get(), b'\x01')
        self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
        self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())

        # integers are stored as 4 bytes in little-endian order
        # 84202 is 0...01|01001000|11101010 in binary
        # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
        self.assertConversion(84202, IntegerType.get(), bytes([234, 72, 1, 0]))
        self.assertEqual(bytes([234, 72, 1, 0]),
                         Literal.of(84202).to_byte_buffer())

        # longs are stored as 8 bytes in little-endian order
        # 200 is 0...0|11001000 in binary
        # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
        self.assertConversion(200, LongType.get(),
                              bytes([200, 0, 0, 0, 0, 0, 0, 0]))
        self.assertEqual(bytes([200, 0, 0, 0, 0, 0, 0, 0]),
                         Literal.of(200).to(LongType.get()).to_byte_buffer())

        # floats are stored as 4 bytes in little-endian order
        # floating point numbers are represented as sign * 2^exponent * mantissa
        # -4.5 is -1 * 2^2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
        # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64)
        self.assertConversion(-4.5, FloatType.get(), bytes([0, 0, 144, 192]))
        self.assertEqual(bytes([0, 0, 144, 192]),
                         Literal.of(-4.5).to_byte_buffer())

        # doubles are stored as 8 bytes in little-endian order
        # floating point numbers are represented as sign * 2^exponent * mantissa
        # 6.0 is 1 * 2^2 * 1.5 and encoded as 01000000|00011000|0...0
        # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
        self.assertConversion(6.0, DoubleType.get(),
                              bytes([0, 0, 0, 0, 0, 0, 24, 64]))
        self.assertEqual(bytes([0, 0, 0, 0, 0, 0, 24, 64]),
                         Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

        # dates are stored as days from 1970-01-01 in a 4-byte little-endian int
        # 1000 is 0...0|00000011|11101000 in binary
        # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
        self.assertConversion(1000, DateType.get(), bytes([232, 3, 0, 0]))
        self.assertEqual(bytes([232, 3, 0, 0]),
                         Literal.of(1000).to(DateType.get()).to_byte_buffer())

        # time is stored as microseconds from midnight in an 8-byte little-endian long
        # 10000 is 0...0|00100111|00010000 in binary
        # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
        self.assertConversion(10000, TimeType.get(),
                              bytes([16, 39, 0, 0, 0, 0, 0, 0]))
        self.assertEqual(
            bytes([16, 39, 0, 0, 0, 0, 0, 0]),
            Literal.of(10000).to(LongType.get()).to(
                TimeType.get()).to_byte_buffer())

        # timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
        # 400000 is 0...110|00011010|10000000 in binary
        # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
        self.assertConversion(400000, TimestampType.without_timezone(),
                              bytes([128, 26, 6, 0, 0, 0, 0, 0]))
        self.assertConversion(400000, TimestampType.with_timezone(),
                              bytes([128, 26, 6, 0, 0, 0, 0, 0]))
        self.assertEqual(
            bytes([128, 26, 6, 0, 0, 0, 0, 0]),
            Literal.of(400000).to(LongType.get()).to(
                TimestampType.without_timezone()).to_byte_buffer())
        self.assertEqual(
            bytes([128, 26, 6, 0, 0, 0, 0, 0]),
            Literal.of(400000).to(LongType.get()).to(
                TimestampType.with_timezone()).to_byte_buffer())

        # strings are stored as UTF-8 bytes (without length)
        # 'A' -> 65, 'B' -> 66, 'C' -> 67
        self.assertConversion("ABC", StringType.get(), bytes([65, 66, 67]))
        self.assertEqual(bytes([65, 66, 67]),
                         Literal.of("ABC").to_byte_buffer())

        # uuids are stored as 16-byte big-endian values
        # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
        # 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62,
        # 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
        # 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92),
        # 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
        # 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123),
        # 0xE7 -> 11100111 -> 231 (-25)
        self.assertConversion(
            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType.get(),
            bytes([
                247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183,
                133, 231
            ]))
        self.assertEqual(
            bytes([
                247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183,
                133, 231
            ]),
            Literal.of(uuid.UUID(
                "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())

        # fixed values are stored directly
        # 'a' -> 97, 'b' -> 98
        self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2),
                              bytes([97, 98]))
        self.assertEqual(bytes([97, 98]),
                         Literal.of(bytes("ab", "utf8")).to_byte_buffer())

        # binary values are stored directly
        # 'Z' -> 90
        self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(),
                              bytes([90]))
        self.assertEqual(bytes([90]),
                         Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

        # decimals are stored as unscaled values in the form of two's-complement big-endian binary,
        # using the minimum number of bytes for the values
        # 345 is 0...1|01011001 in binary
        # 00000001 -> 1, 01011001 -> 89
        self.assertConversion(
            Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2),
            bytes([1, 89]))
        self.assertEqual(
            bytes([1, 89]),
            Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

        # decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2
        # 1234567 is 00010010|11010110|10000111 in binary
        # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
        self.assertConversion(
            Decimal(123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
            bytes([18, 214, 135]))
        self.assertEqual(
            bytes([18, 214, 135]),
            Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

        # negative decimal to test two's complement
        # -1234567 is 11101101|00101001|01111001 in binary
        # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
        self.assertConversion(
            Decimal(-123.4567).quantize(Decimal(".0001")),
            DecimalType.of(7, 4), bytes([237, 41, 121]))
        self.assertEqual(
            bytes([237, 41, 121]),
            Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

        # a decimal whose unscaled value fits in a single byte
        # 11 is 00001011 in binary
        # 00001011 -> 11
        self.assertConversion(
            Decimal(0.011).quantize(Decimal(".001")), DecimalType.of(10, 3),
            bytes([11]))
        self.assertEqual(
            bytes([11]),
            Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())
Ejemplo n.º 8
0
                               FloatType,
                               IntegerType,
                               ListType,
                               LongType,
                               MapType,
                               NestedField,
                               StringType,
                               StructType,
                               TimestampType)
from iceberg.api.types import Type
import pyarrow as pa
from pyarrow.parquet import lib, ParquetFile

# Logger named after this module.
_logger = logging.getLogger(__name__)

# Maps a pyarrow type id (the ``lib.Type_*`` constants) to a factory that
# returns the corresponding Iceberg type.  Each factory takes one optional
# argument ``x``; the decimal, fixed-size-binary and timestamp factories read
# attributes from it (precision/scale, byte_width, tz) and therefore cannot be
# called with the default ``None``.
# NOTE(review): ``x`` is presumably the pyarrow DataType of the column -
# confirm against the call site.
arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(),
                  lib.Type_DATE32: lambda x=None: DateType.get(),
                  lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale),
                  lib.Type_DOUBLE: lambda x=None: DoubleType.get(),
                  lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width),
                  lib.Type_BINARY: lambda x=None: BinaryType.get(),
                  lib.Type_FLOAT: lambda x=None: FloatType.get(),
                  lib.Type_STRING: lambda x=None: StringType.get(),
                  lib.Type_INT32: lambda x=None: IntegerType.get(),
                  lib.Type_INT64: lambda x=None: LongType.get(),
                  # A timestamp without a tz maps to the zone-less Iceberg type.
                  lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone()
                                                      if x.tz is None
                                                      else TimestampType.with_timezone())
                  }

Ejemplo n.º 9
0
from iceberg.api.types import (BinaryType,
                               BooleanType,
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               LongType,
                               StringType,
                               TimestampType,
                               TimeType,
                               UUIDType)

# One instance of every primitive type; the parameterised types (decimal and
# fixed) appear several times with different parameters.
PRIMITIVES = [
    BinaryType.get(),
    BooleanType.get(),
    DateType.get(),
    DecimalType.of(9, 2),
    DecimalType.of(11, 2),
    DecimalType.of(9, 3),
    DoubleType.get(),
    FixedType.of_length(3),
    FixedType.of_length(4),
    FloatType.get(),
    IntegerType.get(),
    LongType.get(),
    StringType.get(),
    TimestampType.with_timezone(),
    TimestampType.without_timezone(),
    TimeType.get(),
    UUIDType.get(),
]