def test_table_scan_honors_select_without_case_sensitivity(ts_table):
    scan1 = ts_table.new_scan().case_sensitive(False).select(["ID"])
    # order of refinements shouldn't matter
    scan2 = ts_table.new_scan().select(["ID"]).case_sensitive(False)

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])

    assert scan1.schema.as_struct() == expected_schema.as_struct()
    assert scan2.schema.as_struct() == expected_schema.as_struct()
Example #2
    def bind(self,
             struct: StructType,
             case_sensitive: bool = True) -> BoundReference:
        from iceberg.api import Schema
        schema = Schema(struct.fields)
        field = schema.find_field(self.name) if case_sensitive \
            else schema.case_insensitive_find_field(self.name)

        ValidationException.check(field is not None,
                                  "Cannot find field '%s' in struct: %s",
                                  (self.name, schema.as_struct()))

        return BoundReference(struct, field)
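The bind method above chooses between Schema's two lookup paths. A minimal sketch of that difference, using only classes already shown in these examples; the assumption that find_field returns None on a miss follows from the ValidationException check above:

from iceberg.api import Schema
from iceberg.api.types import IntegerType, NestedField, StringType

schema = Schema([NestedField.required(1, "id", IntegerType.get()),
                 NestedField.optional(2, "data", StringType.get())])

# exact-case lookup misses the upper-cased name
assert schema.find_field("ID") is None

# the case-insensitive lookup resolves "ID" to field id 1
assert schema.case_insensitive_find_field("ID").field_id == 1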
Example #3
def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])

    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table
Example #4
def expected_metadata_sorting():
    spec_schema = Schema(NestedField.required(1, "x", LongType.get()),
                         NestedField.required(2, "y", LongType.get()),
                         NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .with_spec_id(5) \
        .build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)

    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                                                   spec_id=spec.spec_id)])

    reversed_snapshot_log = list()
    metadata = TableMetadata(ops, None, "s3://bucket/test/location",
                             int(time.time()), 3, spec_schema, 5, [spec], {"property": "value"}, current_snapshot_id,
                             [previous_snapshot, current_snapshot], reversed_snapshot_log)

    reversed_snapshot_log.append(SnapshotLogEntry(current_snapshot.timestamp_millis, current_snapshot.snapshot_id))
    reversed_snapshot_log.append(SnapshotLogEntry(previous_snapshot.timestamp_millis, previous_snapshot.snapshot_id))

    return metadata
Example #5
    def convert_avro_schema_to_iceberg(avro_schema):
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        struct = AvroToIceberg.convert_type(avro_schema, None)

        return Schema(struct[0].fields)
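A hedged usage sketch: the input is a plain Avro record schema as a dict. The record and field names are hypothetical, and the "field-id" property (as well as FIELD_TYPE_PROP mapping to the Avro "type" key) is an assumption about how AvroToIceberg.convert_type recovers Iceberg field ids from Avro fields:

avro_schema = {"type": "record",
               "name": "example",
               "fields": [{"name": "id", "type": "long", "field-id": 1},
                          {"name": "data", "type": ["null", "string"], "field-id": 2}]}

# convert the Avro record schema into an iceberg Schema
iceberg_schema = AvroToIceberg.convert_avro_schema_to_iceberg(avro_schema)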
Example #6
def test_schema_evolution_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
Example #7
    def from_json(json_obj):
        if isinstance(json_obj, str):
            type_var = SchemaParser.type_from_dict(json.loads(json_obj))
        else:
            type_var = SchemaParser.type_from_dict(json_obj)

        if type_var is not None and type_var.is_nested_type() \
                and type_var.as_nested_type().is_struct_type():
            return Schema(type_var.as_nested_type().as_struct_type().fields)
        else:
            raise RuntimeError("Cannot create schema, not a struct type: %s" %
                               type_var)
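For reference, a schema in Iceberg's JSON representation parses through this method; the key names below ("id", "name", "required", "type") follow the Iceberg table spec, and the call assumes the SchemaParser shown here exposes from_json as a static method:

schema_json = ('{"type": "struct", "fields": ['
               '{"id": 1, "name": "id", "required": true, "type": "int"}, '
               '{"id": 2, "name": "data", "required": false, "type": "string"}]}')

schema = SchemaParser.from_json(schema_json)
assert schema.find_field("data").field_id == 2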
Example #8
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()))

    spec = PartitionSpec\
        .builder_for(spec_schema) \
        .identity("id")\
        .bucket("data", 16)\
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2}]}'
    assert expected == PartitionSpecParser.to_json(spec)
Example #9
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
Example #10
def test_column_upcast(primitive_type_test_file):
    expected_schema = Schema(
        [NestedField.required(1, "int_col", LongType.get())])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int32())]
    source_table = pa.table(
        pyarrow_array,
        schema=pa.schema([pa.field("int_col", pa.int64(), nullable=False)]))

    target_table = reader.read()
    assert source_table == target_table
Example #11
def arrow_to_iceberg(arrow_schema: pa.Schema) -> Schema:
    """
    Use an arrow schema, which contains the field_id metadata, to create an equivalent iceberg Schema

    Parameters
    ----------
    arrow_schema : pyarrow.Schema
        An Arrow schema with the parquet field_id metadata

    Returns
    -------
    iceberg.api.Schema
        returns an equivalent iceberg Schema based on the arrow schema read from the file
    """
    return Schema([get_field(col) for col in arrow_schema])
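A short usage sketch. It assumes the get_field helper reads the standard PARQUET:field_id key that Arrow's Parquet writer attaches to each field's metadata; the column names are hypothetical:

import pyarrow as pa

arrow_schema = pa.schema([
    pa.field("id", pa.int32(), nullable=False,
             metadata={b"PARQUET:field_id": b"1"}),
    pa.field("data", pa.string(), nullable=True,
             metadata={b"PARQUET:field_id": b"2"})
])

# build the equivalent iceberg Schema, preserving the field ids
iceberg_schema = arrow_to_iceberg(arrow_schema)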
Example #12
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(
            7, "struct_col",
            StructType.of([
                NestedField.optional(8, "f1", IntegerType.get()),
                NestedField.optional(9, "f2", StringType.get())
            ]))
    ])
    converted_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
Example #13
def rg_expected_schema():
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])
Example #14
def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    compare_schema(
        expected_schema,
        convert_parquet_to_iceberg(primitive_type_test_parquet_file))
Example #15
def prune_columns(file_schema: Schema, expected_schema: Schema) -> List[str]:
    """
    Given two Iceberg schemas, return a list of column names for all ids in the
    file schema that are projected in the expected schema

    Parameters
    ----------
    file_schema : iceberg.api.Schema
        An Iceberg schema of the file being read
    expected_schema : iceberg.api.Schema
        An Iceberg schema of the final projection
    Returns
    -------
    list
        The column names in the file that matched ids in the expected schema
    """
    return [column.name for column in file_schema.as_struct().fields
            if column.id in get_projected_ids(expected_schema)]
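A usage sketch with hypothetical schemas, assuming get_projected_ids collects the field ids present in the expected schema; only file columns whose ids survive the projection come back, in file order:

file_schema = Schema([NestedField.required(1, "id", IntegerType.get()),
                      NestedField.optional(2, "data", StringType.get()),
                      NestedField.optional(3, "extra", StringType.get())])
expected_schema = Schema([NestedField.required(1, "id", IntegerType.get()),
                          NestedField.optional(2, "data", StringType.get())])

assert prune_columns(file_schema, expected_schema) == ["id", "data"]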
Example #16
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
Example #17
def test_compound_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #18
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #19
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'
    assert expected == PartitionSpecParser.to_json(spec)
Example #20
def missing_spec_list():
    schema = Schema(NestedField.required(1, "x", LongType.get()),
                    NestedField.required(2, "y", LongType.get()),
                    NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec.builder_for(schema).identity("x").with_spec_id(6).build()
    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)

    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                                                   spec_id=spec.spec_id)])
    return TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3, schema, 6,
                         (spec,), {"property": "value"}, current_snapshot_id, [previous_snapshot, current_snapshot],
                         [])
Example #21
    def new_table_metadata(ops: TableOperations,
                           schema: Schema,
                           spec: PartitionSpec,
                           location: str,
                           properties: dict = None) -> "TableMetadata":
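        # re-assign every field id from a fresh counter so the new table's ids
        # are sequential and independent of how the caller numbered the schema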
        last_column_id = AtomicInteger(0)
        fresh_schema = assign_fresh_ids(schema,
                                        last_column_id.increment_and_get)

        spec_builder = PartitionSpec.builder_for(fresh_schema)
        for field in spec.fields:
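            # look each partition source column up by name so the spec can be
            # rebuilt against the schema whose ids were just re-assigned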
            src_name = schema.find_column_name(field.source_id)
            spec_builder.add(field.source_id,
                             fresh_schema.find_field(src_name).field_id,
                             field.name, str(field.transform))

        fresh_spec = spec_builder.build()
        properties = properties if properties is not None else dict()

        return TableMetadata(ops, None, location, int(time.time() * 1000),
                             last_column_id.get(), fresh_schema,
                             TableMetadata.INITIAL_SPEC_ID, [fresh_spec],
                             properties, -1, list(), list())
Example #22
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table
Example #23
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()
    ]

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
Example #24
    def wrap_file_schema(file_struct):
        return Schema(NestedField.required(0, "status", IntegerType.get()),
                      NestedField.required(1, "snapshot_id", LongType.get()),
                      NestedField.required(2, "data_file", file_struct))
Example #25
    def project_schema(part_type, columns):
        return ManifestEntry.wrap_file_schema(Schema(DataFile.get_type(part_type).fields)
                                              .select(columns)
                                              .as_struct())
Example #26
def base_scan_schema():
    return Schema([
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.required(2, "data", StringType.get())
    ])
Example #27
def test_table_scan_honors_select(ts_table):
    scan = ts_table.new_scan().select(["id"])

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])

    assert scan.schema.as_struct() == expected_schema.as_struct()
Example #28
import os
import random
import tempfile
import time

from iceberg.api import Files, PartitionSpec, Schema
from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType
from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties,
                          GenericManifestFile, SnapshotLogEntry, TableMetadata,
                          TableMetadataParser, TableOperations,
                          TableProperties)
from iceberg.exceptions import AlreadyExistsException, CommitFailedException
import pytest

SCHEMA = Schema([NestedField.optional(1, "b", BooleanType.get())])
METADATA = dict()
VERSIONS = dict()


class LocalTableOperations(TableOperations):
    def current(self):
        raise RuntimeError("Not implemented for tests")

    def refresh(self):
        raise RuntimeError("Not implemented for tests")

    def commit(self, base, metadata):
        raise RuntimeError("Not implemented for tests")

    def new_input_file(self, path):
        # the source snippet is truncated here; returning a local input file
        # via Files.local_input is a plausible completion for these tests
        return Files.local_input(path)
Example #29
def iceberg_full_read_projection_schema():
    return Schema([
        NestedField.required(0, "id", LongType.get()),
        NestedField.optional(1, "data", StringType.get())
    ])