def from_json(schema, json_obj):
    """Build a partition spec for ``schema`` from a JSON object.

    ``json_obj`` may be a JSON string (decoded with ``json.loads``) or an
    already-decoded value. The decoded value must be a dict; its
    ``PartitionSpecParser.FIELDS`` entry must be a list or tuple of dicts.
    Each field dict contributes one ``builder.add(source_id, name,
    transform)`` call, looked up via the ``PartitionSpecParser`` key
    constants.

    Raises:
        RuntimeError: if the decoded value is not a dict, the fields entry
            is not a list/tuple, or any field entry is not a dict.
    """
    parsed = json.loads(json_obj) if isinstance(json_obj, str) else json_obj
    if not isinstance(parsed, dict):
        raise RuntimeError(
            "Cannot parse partition spec, not an object: %s" % parsed)

    spec_id = parsed.get(PartitionSpecParser.SPEC_ID)
    spec_builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)

    field_entries = parsed.get(PartitionSpecParser.FIELDS)
    if not isinstance(field_entries, (list, tuple)):
        raise RuntimeError(
            "Cannot parse partition spec fields, not an array: %s" % field_entries)

    for entry in field_entries:
        if isinstance(entry, dict):
            spec_builder.add(entry.get(PartitionSpecParser.SOURCE_ID),
                             entry.get(PartitionSpecParser.NAME),
                             entry.get(PartitionSpecParser.TRANSFORM))
        else:
            raise RuntimeError(
                "Cannot parse partition field, not an object: %s" % entry)

    return spec_builder.build()
def from_json_fields(schema, spec_id, json_obj):
    """Build a partition spec with id ``spec_id`` from a JSON fields value.

    A builder for ``schema`` is created first. ``json_obj`` is decoded with
    ``json.loads`` when it is a string and used as-is otherwise. Validation
    and field construction are delegated to
    ``PartitionSpecParser.__build_from_json_fields``, whose result is
    returned.
    """
    spec_builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)
    fields = json.loads(json_obj) if isinstance(json_obj, str) else json_obj
    return PartitionSpecParser.__build_from_json_fields(spec_builder, fields)
def inc_man_spec():
    """Return a partition spec (spec id 0) over a four-column schema.

    The schema has a required integer ``id`` (field id 1) and three optional
    string columns ``all_nulls``, ``some_nulls`` and ``no_nulls`` (field ids
    4, 5, 6). Every column is added to the spec with an identity transform,
    in that order.
    """
    columns = [
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    ]
    inc_schema = Schema(*columns)

    spec_builder = PartitionSpec.builder_for(inc_schema).with_spec_id(0)
    # Chain the identity() calls one column at a time, keeping each returned
    # builder so the call sequence matches a fluent chain.
    for column_name in ("id", "all_nulls", "some_nulls", "no_nulls"):
        spec_builder = spec_builder.identity(column_name)
    return spec_builder.build()
def from_json(schema, json_obj):
    """Build a partition spec for ``schema`` from a JSON object.

    ``json_obj`` may be a JSON string (decoded with ``json.loads``) or an
    already-decoded value; the decoded value must be a dict. The spec id is
    read from the ``PartitionSpecParser.SPEC_ID`` key and the fields from
    ``PartitionSpecParser.FIELDS``; the fields value is handed to
    ``PartitionSpecParser.__build_from_json_fields`` together with the
    builder.

    Raises:
        RuntimeError: if the decoded value is not a dict.
    """
    parsed = json.loads(json_obj) if isinstance(json_obj, str) else json_obj
    if not isinstance(parsed, dict):
        raise RuntimeError("Cannot parse partition spec, not an object: %s" % parsed)

    spec_id = parsed.get(PartitionSpecParser.SPEC_ID)
    spec_builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)
    field_entries = parsed.get(PartitionSpecParser.FIELDS)
    return PartitionSpecParser.__build_from_json_fields(spec_builder, field_entries)
def new_table_metadata(ops, schema, spec, location):
    """Create the initial ``TableMetadata`` for a new table.

    Column ids are reassigned through ``assign_fresh_ids``, using an
    ``AtomicInteger`` counter whose final value is stored as the last column
    id. The partition spec is then rebuilt against the fresh schema: each
    partition field's source column is looked up by name in the original
    schema and re-resolved in the fresh schema, so the rebuilt spec refers to
    the fresh ids.

    Args:
        ops: table operations object, passed through to ``TableMetadata``.
        schema: the schema supplied by the caller (ids are not reused).
        spec: the partition spec defined against ``schema``.
        location: table location, passed through to ``TableMetadata``.

    Returns:
        A ``TableMetadata`` with no properties, no snapshots, a current
        snapshot id of -1 and the rebuilt spec as its only spec.
    """
    last_column_id = AtomicInteger(0)
    fresh_schema = assign_fresh_ids(schema, last_column_id.increment_and_get)

    spec_builder = PartitionSpec.builder_for(fresh_schema)
    for field in spec.fields:
        src_name = schema.find_column_name(field.source_id)
        # add() takes (source id, name, transform string): pass the fresh
        # field's id and the partition field's name, not the objects
        # themselves, and read the `transform` attribute.
        spec_builder.add(fresh_schema.find_field(src_name).field_id,
                         field.name,
                         str(field.transform))
    fresh_spec = spec_builder.build()

    return TableMetadata(ops, None, location,
                         int(time.time() * 1000),  # creation timestamp in milliseconds
                         last_column_id.get(), fresh_schema,
                         TableMetadata.INITIAL_SPEC_ID, [fresh_spec],
                         dict(), -1, list(), list())
def from_json_fields(schema, spec_id, json_obj):
    """Build a partition spec with id ``spec_id`` from a JSON array of fields.

    ``json_obj`` may be a JSON string (decoded with ``json.loads``) or an
    already-decoded value. The decoded value must be a list of dicts; each
    dict contributes one ``builder.add(source_id, name, transform)`` call,
    looked up via the ``PartitionSpecParser`` key constants.

    Raises:
        RuntimeError: if the decoded value is not a list, or if any element
            of it is not a dict. The message names the offending value.
    """
    builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)

    if isinstance(json_obj, str):
        json_obj = json.loads(json_obj)

    if not isinstance(json_obj, list):
        # Wrap in a 1-tuple so a tuple value is formatted rather than being
        # unpacked as %-format arguments (which would raise TypeError).
        raise RuntimeError(
            "Cannot parse partition spec fields, not an array: %s" % (json_obj,))

    for item in json_obj:
        if not isinstance(item, dict):
            # Report the element that failed, not the whole array.
            raise RuntimeError(
                "Cannot parse partition field, not an object: %s" % (item,))
        builder.add(item.get(PartitionSpecParser.SOURCE_ID),
                    item.get(PartitionSpecParser.NAME),
                    item.get(PartitionSpecParser.TRANSFORM))

    return builder.build()
def missing_spec_list():
    """Return a ``TableMetadata`` with one identity spec and two snapshots.

    The schema has three required long columns ``x``, ``y``, ``z`` (ids
    1-3); the spec is an identity partition on ``x`` with spec id 6. Two
    ``BaseSnapshot`` objects are created, each with a single
    ``GenericManifestFile`` pointing at a local input path; the second
    snapshot uses the first snapshot's id as its parent. The RNG is seeded
    with 1234 before the previous snapshot id is derived from the current
    time.

    NOTE(review): ``ops`` is not a parameter; it is resolved from an
    enclosing scope -- confirm it is defined at module level.
    """
    columns = [NestedField.required(field_id, column_name, LongType.get())
               for field_id, column_name in ((1, "x"), (2, "y"), (3, "z"))]
    schema = Schema(*columns)
    spec = PartitionSpec.builder_for(schema).identity("x").with_spec_id(6).build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_manifest = GenericManifestFile(
        file=Files.local_input("file:/tmp/manfiest.1.avro"),
        spec_id=spec.spec_id)
    previous_snapshot = BaseSnapshot(
        ops, previous_snapshot_id, None,
        timestamp_millis=previous_snapshot_id,
        manifests=[previous_manifest])

    current_snapshot_id = int(time.time())
    current_manifest = GenericManifestFile(
        file=Files.local_input("file:/tmp/manfiest.2.avro"),
        spec_id=spec.spec_id)
    current_snapshot = BaseSnapshot(
        ops, current_snapshot_id, previous_snapshot_id,
        timestamp_millis=current_snapshot_id,
        manifests=[current_manifest])

    snapshots = [previous_snapshot, current_snapshot]
    return TableMetadata(ops, None, "s3://bucket/test/location",
                         int(time.time()), 3, schema, 6, (spec,),
                         {"property": "value"}, current_snapshot_id,
                         snapshots, [])
def new_table_metadata(ops: TableOperations, schema: Schema, spec: PartitionSpec, location: str,
                       properties: dict = None) -> "TableMetadata":
    """Create the initial ``TableMetadata`` for a new table.

    Column ids are reassigned through ``assign_fresh_ids`` with an
    ``AtomicInteger`` counter, and the partition spec is rebuilt against the
    fresh schema. For every partition field, the source column name is
    looked up in the original schema and resolved again in the fresh schema;
    the builder receives the original source id, the fresh field's id, the
    partition field name and the transform rendered with ``str``.

    Args:
        ops: table operations object, passed through to ``TableMetadata``.
        schema: schema supplied by the caller.
        spec: partition spec defined against ``schema``.
        location: table location, passed through to ``TableMetadata``.
        properties: optional table properties; an empty dict is used when
            ``None``.

    Returns:
        A ``TableMetadata`` with a current snapshot id of -1, two empty
        lists as its trailing arguments, and the rebuilt spec as its only
        spec.
    """
    column_counter = AtomicInteger(0)
    fresh_schema = assign_fresh_ids(schema, column_counter.increment_and_get)

    fresh_builder = PartitionSpec.builder_for(fresh_schema)
    for part_field in spec.fields:
        column_name = schema.find_column_name(part_field.source_id)
        fresh_field = fresh_schema.find_field(column_name)
        fresh_builder.add(part_field.source_id,
                          fresh_field.field_id,
                          part_field.name,
                          str(part_field.transform))
    fresh_spec = fresh_builder.build()

    if properties is None:
        properties = dict()

    created_at_millis = int(time.time() * 1000)
    return TableMetadata(ops, None, location, created_at_millis,
                         column_counter.get(), fresh_schema,
                         TableMetadata.INITIAL_SPEC_ID, [fresh_spec],
                         properties, -1, list(), list())
def test_to_json_conversion():
    """Check ``str(spec)`` for single-field partition specs.

    One spec is built per transform/column combination over a ten-column
    schema, and each spec's string form is compared with its expected
    rendering. All specs are constructed before any assertion runs.
    """
    spec_schema = Schema(
        NestedField.required(1, "i", IntegerType.get()),
        NestedField.required(2, "l", LongType.get()),
        NestedField.required(3, "d", DateType.get()),
        NestedField.required(4, "t", TimeType.get()),
        NestedField.required(5, "ts", TimestampType.without_timezone()),
        NestedField.required(6, "dec", DecimalType.of(9, 2)),
        NestedField.required(7, "s", StringType.get()),
        NestedField.required(8, "u", UUIDType.get()),
        NestedField.required(9, "f", FixedType.of_length(3)),
        NestedField.required(10, "b", BinaryType.get()),
    )

    def new_builder():
        # Every case starts from its own builder over the shared schema.
        return PartitionSpec.builder_for(spec_schema)

    # (spec, expected string form) pairs, kept side by side.
    cases = [
        (new_builder().identity("i").build(), "[\n i: identity(1)\n]"),
        (new_builder().identity("l").build(), "[\n l: identity(2)\n]"),
        (new_builder().identity("d").build(), "[\n d: identity(3)\n]"),
        (new_builder().identity("t").build(), "[\n t: identity(4)\n]"),
        (new_builder().identity("ts").build(), "[\n ts: identity(5)\n]"),
        (new_builder().identity("dec").build(), "[\n dec: identity(6)\n]"),
        (new_builder().identity("s").build(), "[\n s: identity(7)\n]"),
        (new_builder().identity("u").build(), "[\n u: identity(8)\n]"),
        (new_builder().identity("f").build(), "[\n f: identity(9)\n]"),
        (new_builder().identity("b").build(), "[\n b: identity(10)\n]"),
        (new_builder().bucket("i", 128).build(), "[\n i_bucket: bucket[128](1)\n]"),
        (new_builder().bucket("l", 128).build(), "[\n l_bucket: bucket[128](2)\n]"),
        (new_builder().bucket("d", 128).build(), "[\n d_bucket: bucket[128](3)\n]"),
        (new_builder().bucket("t", 128).build(), "[\n t_bucket: bucket[128](4)\n]"),
        (new_builder().bucket("ts", 128).build(), "[\n ts_bucket: bucket[128](5)\n]"),
        (new_builder().bucket("dec", 128).build(), "[\n dec_bucket: bucket[128](6)\n]"),
        (new_builder().bucket("s", 128).build(), "[\n s_bucket: bucket[128](7)\n]"),
        (new_builder().year("d").build(), "[\n d_year: year(3)\n]"),
        (new_builder().month("d").build(), "[\n d_month: month(3)\n]"),
        (new_builder().day("d").build(), "[\n d_day: day(3)\n]"),
        (new_builder().year("ts").build(), "[\n ts_year: year(5)\n]"),
        (new_builder().month("ts").build(), "[\n ts_month: month(5)\n]"),
        (new_builder().day("ts").build(), "[\n ts_day: day(5)\n]"),
        (new_builder().hour("ts").build(), "[\n ts_hour: hour(5)\n]"),
        (new_builder().truncate("i", 10).build(), "[\n i_truncate: truncate[10](1)\n]"),
        (new_builder().truncate("l", 10).build(), "[\n l_truncate: truncate[10](2)\n]"),
        (new_builder().truncate("dec", 10).build(), "[\n dec_truncate: truncate[10](6)\n]"),
        (new_builder().truncate("s", 10).build(), "[\n s_truncate: truncate[10](7)\n]"),
        (new_builder().add(6, "dec_bucket", "bucket[16]").build(), "[\n dec_bucket: bucket[16](6)\n]"),
    ]

    for spec, expected_spec_str in cases:
        assert str(spec) == expected_spec_str
def test_partition_spec(self):
    """Check that partition specs survive ``TestHelpers.round_trip_serialize``.

    One single-field spec is built per transform/column combination over a
    ten-column schema (identity, bucket, date/time transforms, truncate, and
    two specs using an "unsupported" transform string). All specs are built
    first; each is then compared for equality with its round-tripped copy.
    """
    schema = Schema(
        NestedField.required(1, "i", IntegerType.get()),
        NestedField.required(2, "l", LongType.get()),
        NestedField.required(3, "d", DateType.get()),
        NestedField.required(4, "t", TimeType.get()),
        NestedField.required(5, "ts", TimestampType.without_timezone()),
        NestedField.required(6, "dec", DecimalType.of(9, 2)),
        NestedField.required(7, "s", StringType.get()),
        NestedField.required(8, "u", UUIDType.get()),
        NestedField.required(9, "f", FixedType.of_length(3)),
        NestedField.required(10, "b", BinaryType.get()),
    )

    def new_builder():
        # Every spec starts from its own builder over the shared schema.
        return PartitionSpec.builder_for(schema)

    all_columns = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")

    specs = [new_builder().identity(column).build() for column in all_columns]
    specs += [new_builder().bucket(column, 128).build() for column in all_columns]
    specs += [
        new_builder().year("d").build(),
        new_builder().month("d").build(),
        new_builder().day("d").build(),
        new_builder().year("ts").build(),
        new_builder().month("ts").build(),
        new_builder().day("ts").build(),
        new_builder().hour("ts").build(),
    ]
    specs += [new_builder().truncate(column, 10).build()
              for column in ("i", "l", "dec", "s")]
    specs += [
        new_builder().add_without_field_id(6, "dec_unsupported", "unsupported").build(),
        new_builder().add(6, 1111, "dec_unsupported", "unsupported").build(),
    ]

    for original in specs:
        self.assertEqual(original, TestHelpers.round_trip_serialize(original))