Beispiel #1
0
def get_list_field(col_type: pa.ListType) -> ListType:
    if col_type.value_field.nullable:
        return ListType.of_optional(int(col_type.value_field.metadata[b"PARQUET:field_id"].decode("utf-8")),
                                    arrow_type_map[col_type.value_field.type.id](col_type.value_field.type))
    else:
        return ListType.of_required(col_type.value_field.metadata[b"PARQUET:field_id"].decode("utf-8"),
                                    arrow_type_map[col_type.value_field.type.id](col_type.value_field.type))
    def list_from_dict(dict_obj):
        elem_id = dict_obj.get(SchemaParser.ELEMENT_ID)
        elem_type = SchemaParser.type_from_dict(
            dict_obj.get(SchemaParser.ELEMENT))
        is_required = dict_obj.get(SchemaParser.REQUIRED)

        if is_required:
            return ListType.of_required(elem_id, elem_type)
        else:
            return ListType.of_optional(elem_id, elem_type)
Beispiel #3
0
 def get_type(partition_type):
     return StructType.of([
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(107, "sort_columns",
                              ListType.of_required(112, IntegerType.get())),
         NestedField.optional(
             108, "column_sizes",
             MapType.of_required(117, 118, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             109, "value_counts",
             MapType.of_required(119, 120, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             110, "null_value_counts",
             MapType.of_required(121, 122, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             125, "lower_bounds",
             MapType.of_required(126, 127, IntegerType.get(),
                                 BinaryType.get())),
         NestedField.optional(
             128, "upper_bounds",
             MapType.of_required(129, 130, IntegerType.get(),
                                 BinaryType.get()))
     ]
                          # NEXT ID TO ASSIGN: 131
                          )
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(
            7, "struct_col",
            StructType.of([
                NestedField.optional(8, "f1", IntegerType.get()),
                NestedField.optional(9, "f2", StringType.get())
            ]))
    ])
    converted_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
Beispiel #5
0
    def convert_array_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)
        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)

        is_optional = AvroToIceberg.is_option_schema(items)

        if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
            item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
            if item_type is None:
                raise RuntimeError("No mapping found for type %s" % items)
        else:
            raise RuntimeError("Complex list types not yet implemented")

        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        else:
            return ListType.of_required(element_id, item_type), next_id