Python NestedField.optional Beispiele, iceberg.api.types.NestedField.optional Python Beispiele

Beispiel #1

0

Datei anzeigen

def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])

    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table

Beispiel #2

0

Datei anzeigen

Datei: conftest.py Projekt: yuhuali1989/incubator-iceberg

def schema():
    return Schema(NestedField.required(1, "id", IntegerType.get()),
                  NestedField.optional(2, "no_stats", IntegerType.get()),
                  NestedField.required(3, "required", StringType.get()),
                  NestedField.optional(4, "all_nulls", StringType.get()),
                  NestedField.optional(5, "some_nulls", StringType.get()),
                  NestedField.optional(6, "no_nulls", StringType.get()))

Beispiel #3

0

Datei anzeigen

def inc_man_spec():
    inc_schema = Schema(
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()))
    return (PartitionSpec.builder_for(inc_schema).with_spec_id(0).identity(
        "id").identity("all_nulls").identity("some_nulls").identity(
            "no_nulls").build())

Beispiel #4

0

Datei anzeigen

Datei: test_parquet_to_iceberg.py Projekt: shenodaguirguis/iceberg-1

def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(
            7, "struct_col",
            StructType.of([
                NestedField.optional(8, "f1", IntegerType.get()),
                NestedField.optional(9, "f2", StringType.get())
            ]))
    ])
    converted_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)

Beispiel #5

0

Datei anzeigen

Datei: data_file.py Projekt: rdsr/li-iceberg-rdsr

 def get_type(partition_type):
     return StructType.of([
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(107, "sort_columns",
                              ListType.of_required(112, IntegerType.get())),
         NestedField.optional(
             108, "column_sizes",
             MapType.of_required(117, 118, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             109, "value_counts",
             MapType.of_required(119, 120, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             110, "null_value_counts",
             MapType.of_required(121, 122, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             125, "lower_bounds",
             MapType.of_required(126, 127, IntegerType.get(),
                                 BinaryType.get())),
         NestedField.optional(
             128, "upper_bounds",
             MapType.of_required(129, 130, IntegerType.get(),
                                 BinaryType.get()))
     ]
                          # NEXT ID TO ASSIGN: 131
                          )

Beispiel #6

0

Datei anzeigen

Datei: conftest.py Projekt: rdsr/li-iceberg-rdsr

def supported_primitives():
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(114, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(114, "dec_38_10", DecimalType.of(38, 10))
    ])

Beispiel #7

0

Datei anzeigen

def test_compound_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table

Beispiel #8

0

Datei anzeigen

def rg_expected_schema():
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])

Beispiel #9

0

Datei anzeigen

def test_not_null(assert_and_unwrap):
    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))
    expr = unbound.bind(optional)
    bound = assert_and_unwrap(expr)
    assert Operation.NOT_NULL == bound.op
    assert 21 == bound.ref.field.field_id
    assert bound.lit is None

    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert Expressions.always_true() == unbound.bind(required)

Beispiel #10

0

Datei anzeigen

def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table

Beispiel #11

0

Datei anzeigen

    def convert_avro_field_to_iceberg(field, next_id):
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
            return field_type, next_id

        if is_optional:
            return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id
        else:
            return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id

Beispiel #12

0

Datei anzeigen

def test_schema_evolution_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table

Beispiel #13

0

Datei anzeigen

Datei: schema_parser.py Projekt: andrei-ionescu/iceberg

    def struct_from_dict(dict_obj):
        struct_fields = list()
        fields = dict_obj.get(SchemaParser.FIELDS)
        for field in fields:
            field_id = field.get(SchemaParser.ID)
            field_name = field.get(SchemaParser.NAME)
            field_type = SchemaParser.type_from_dict(
                field.get(SchemaParser.TYPE))

            if field.get(SchemaParser.REQUIRED):
                struct_fields.append(
                    NestedField.required(field_id, field_name, field_type))
            else:
                struct_fields.append(
                    NestedField.optional(field_id, field_name, field_type))

        return StructType.of(struct_fields)

Beispiel #14

0

Datei anzeigen

def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()

Beispiel #15

0

Datei anzeigen

Datei: conftest.py Projekt: yuhuali1989/incubator-iceberg

import io
import pickle
import uuid

from iceberg.api import DataFile
from iceberg.api.expressions import (BoundPredicate, Expressions,
                                     ExpressionVisitors, Literal, Operation,
                                     UnboundPredicate)
from iceberg.api.schema import Schema
from iceberg.api.struct_like import StructLike
from iceberg.api.types import (BinaryType, Conversions, DateType, DecimalType,
                               FixedType, IntegerType, NestedField, StringType,
                               TimestampType, TimeType)
import pytest

exp_schema = Schema(NestedField.optional(34, "a", IntegerType.get()))


class TestHelpers(object):
    @staticmethod
    def assert_all_references_bound(message, expr):
        ExpressionVisitors.visit(expr,
                                 TestHelpers.CheckReferencesBound(message))

    @staticmethod
    def assert_and_unwrap(expr, expected=None):
        if expected is not None:
            assert isinstance(expr, expected)
        else:
            assert isinstance(expr, BoundPredicate)

Beispiel #16

0

Datei anzeigen

# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
                               NestedField,
                               StringType,
                               StructType)
from iceberg.exceptions import ValidationException
from pytest import raises

STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
                       NestedField.required(14, "y", IntegerType.get()),
                       NestedField.optional(15, "z", IntegerType.get())])


def test_less_than(row_of):
    evaluator = exp.evaluator.Evaluator(STRUCT,
                                        exp.expressions.Expressions.less_than("x", 7))
    assert not evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))


def test_less_than_or_equal(row_of):
    evaluator = exp.evaluator.Evaluator(STRUCT,
                                        exp.expressions.Expressions.less_than_or_equal("x", 7))
    assert evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))
    assert not evaluator.eval(row_of((8, 8, None)))

Beispiel #17

0

Datei anzeigen

Datei: test_parquet_to_iceberg.py Projekt: shenodaguirguis/iceberg-1

def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    compare_schema(
        expected_schema,
        convert_parquet_to_iceberg(primitive_type_test_parquet_file))

Beispiel #18

0

Datei anzeigen

Datei: conftest.py Projekt: rdsr/li-iceberg-rdsr

def iceberg_full_read_projection_schema():
    return Schema([
        NestedField.required(0, "id", LongType.get()),
        NestedField.optional(1, "data", StringType.get())
    ])

Beispiel #19

0

Datei anzeigen

def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    if nullable:
        return NestedField.optional(field_id, field_name, field_type)
    else:
        return NestedField.required(field_id, field_name, field_type)

Beispiel #20

0

Datei anzeigen

Datei: conftest.py Projekt: rdsr/li-iceberg-rdsr

import os
import random
import tempfile
import time

from iceberg.api import Files, PartitionSpec, Schema
from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType
from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties,
                          GenericManifestFile, SnapshotLogEntry, TableMetadata,
                          TableMetadataParser, TableOperations,
                          TableProperties)
from iceberg.exceptions import AlreadyExistsException, CommitFailedException
import pytest

SCHEMA = Schema([NestedField.optional(1, "b", BooleanType.get())])
METADATA = dict()
VERSIONS = dict()


class LocalTableOperations(TableOperations):
    def current(self):
        raise RuntimeError("Not implemented for tests")

    def refresh(self):
        raise RuntimeError("Not implemented for tests")

    def commit(self, base, metadata):
        raise RuntimeError("Not implemented for tests")

    def new_input_file(self, path):

Beispiel #21

0

Datei anzeigen

def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table