def test_overlap_keyspace(store, metadata_version):
    """Datasets whose UUIDs share a prefix must only report their own keys.

    Two datasets are written whose UUIDs differ only by a trailing ``_ext``.
    For each of them the storage keys, the partitions and the partition
    indices are checked against what was written for that dataset alone.
    """
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    dataset_uuids = (dataset_uuid1, dataset_uuid2)
    table = "core"
    partition0_label = "location=L-0/data"

    def _partition0_key(dataset_uuid):
        # Single place that defines the storage key of the only partition.
        return "{}/{}/{}.parquet".format(dataset_uuid, table, partition0_label)

    for dataset_uuid in dataset_uuids:
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # write the dataset metadata, one partition file and the table schema
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(_partition0_key(dataset_uuid), b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            table,
        )

    for dataset_uuid in dataset_uuids:
        partition0_key = _partition0_key(dataset_uuid)
        expected_partitions = {partition0_label: {"files": {table: partition0_key}}}
        expected_indices = {"location": {"L-0": [partition0_label]}}

        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, table),
            partition0_key,
        ]

        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from
    store; the partition values contain characters that appear quoted in the
    expected partition labels.
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"

    # raw partition value -> expected (quoted) partition label
    label_by_value = {
        "München": "location=M%C3%BCnchen/data",
        "å\\ øß": "location=%C3%A5%5C%20%C3%B8%C3%9F/data",
    }
    key_by_value = {
        value: create_partition_key(
            dataset_uuid, "core", [("location", value)], "data.parquet"
        )
        for value in label_by_value
    }

    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        label: {"files": {"core": key_by_value[value]}}
        for value, label in label_by_value.items()
    }
    expected_indices = {
        "location": {value: [label] for value, label in label_by_value.items()}
    }

    for storage_key in key_by_value.values():
        store.put(storage_key, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()

    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
def test_compat_old_rw_path(df_all_types, store):
    """Schemas written via the old write path stay readable and compatible."""

    def _is_kept(column):
        # strip down DF since some column types weren't supported before anyway
        if column.startswith("array_"):  # array types (always null)
            return False
        if column == "unicode":  # unicode type (always null)
            return False
        # 8, 16 and 32 bit types are casted to 64 bit
        return not any(width in column for width in ("8", "16", "32"))

    kept_columns = [c for c in df_all_types.columns if _is_kept(c)]
    df = df_all_types[kept_columns]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    # new schema write path
    new_schema = make_meta(df, origin="df")
    store_schema_metadata(
        schema=new_schema,
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
def test_empty_dataframe_from_schema(df_all_types):
    """An empty frame built from a schema matches the emptied reference frame."""
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema)

    expected_df = df_all_types.loc[[]]
    expected_df["date"] = pd.Series([], dtype="datetime64[ns]")

    # column-name prefix -> dtype the expected empty column is rebuilt with
    dtype_by_prefix = (("float", float), ("int", int), ("uint", np.uint64))
    for column in expected_df.columns:
        for prefix, dtype in dtype_by_prefix:
            if column.startswith(prefix):
                expected_df[column] = pd.Series([], dtype=dtype)

    pdt.assert_frame_equal(actual_df, expected_df)
def test_store_schema_metadata(store, df_all_types):
    """The stored ``_common_metadata`` file holds the expected arrow schema."""
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()

    # (column name, expected arrow type), in the expected schema order
    expected_columns = [
        ("array_float32", pa.list_(pa.float64())),
        ("array_float64", pa.list_(pa.float64())),
        ("array_int16", pa.list_(pa.int64())),
        ("array_int32", pa.list_(pa.int64())),
        ("array_int64", pa.list_(pa.int64())),
        ("array_int8", pa.list_(pa.int64())),
        ("array_uint16", pa.list_(pa.uint64())),
        ("array_uint32", pa.list_(pa.uint64())),
        ("array_uint64", pa.list_(pa.uint64())),
        ("array_uint8", pa.list_(pa.uint64())),
        ("array_unicode", pa.list_(pa.string())),
        ("bool", pa.bool_()),
        ("byte", pa.binary()),
        ("date", pa.date32()),
        ("datetime64", pa.timestamp("us")),
        ("float32", pa.float64()),
        ("float64", pa.float64()),
        ("int16", pa.int64()),
        ("int32", pa.int64()),
        ("int64", pa.int64()),
        ("int8", pa.int64()),
        ("null", pa.null()),
        ("uint16", pa.uint64()),
        ("uint32", pa.uint64()),
        ("uint64", pa.uint64()),
        ("uint8", pa.uint64()),
        ("unicode", pa.string()),
    ]
    if not ARROW_LARGER_EQ_0130:
        expected_columns.append(("__index_level_0__", pa.int64()))

    fields = [pa.field(name, arrow_type) for name, arrow_type in expected_columns]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
def test_load_from_store_with_indices(store):
    """``location_id`` is only among the loaded indices with ``load_all_indices=True``."""
    embedded_index = {
        "1": ["part_1"],
        "2": ["part_1"],
        "100": ["part_1"],
        "34": ["part_1"],
    }
    partition_files = {
        "core_data": "dataset_uuid/table/location_id=1/part_1.parquet"
    }
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {"product_id=1/part_1": {"files": partition_files}},
        "indices": {"product_id": embedded_index},
    }
    serialized_meta = simplejson.dumps(meta_dct).encode("utf-8")
    store.put("uuid.by-dataset-metadata.json", serialized_meta)

    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

    # default load
    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    # load including all indices
    dmd = DatasetMetadata.load_from_store(
        store=store, uuid="uuid", load_all_indices=True
    )
    assert "location_id" in dmd.indices
def test_load_partition_indices_types(store):
    """The partition index loaded for an integer column is typed as ``int64``."""
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    table = "table"
    index_name = "location_id"
    index_value = 1

    partition_label = "{index_name}={index_value}/part_1".format(
        index_name=index_name, index_value=index_value
    )
    partition_file = "{dataset_uuid}/{table}/location_id=1/part_1.parquet".format(
        dataset_uuid=dataset_uuid, table=table
    )
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": dataset_uuid,
        "partitions": {partition_label: {"files": {table: partition_file}}},
    }
    metadata_key = "{dataset_uuid}.by-dataset-metadata.json".format(
        dataset_uuid=dataset_uuid
    )
    store.put(metadata_key, simplejson.dumps(meta_dct).encode())

    schema_df = pd.DataFrame({index_name: pd.Series([index_value], dtype=int)})
    store_schema_metadata(
        make_meta(schema_df, origin="core"),
        dataset_uuid,
        store,
        table,
    )

    dmd = DatasetMetadata.load_from_store(store=store, uuid=dataset_uuid)
    dmd = dmd.load_partition_indices()

    assert len(dmd.indices) == 1
    assert "location_id" in dmd.indices
    assert isinstance(dmd.indices["location_id"], PartitionIndex)

    idx = dmd.indices["location_id"]
    assert idx.dtype == pa.int64()
    assert idx.query(1) == ["location_id=1/part_1"]
def test_parse_nested_input_schema_compatible_but_different():
    """Input parses even though the schemas are not identical but compatible."""
    # One frame holds only a null in "A", the other a string.
    column_values = ([None], ["str"])
    df_input = [
        [{"data": {"table": pd.DataFrame({"A": values})}} for values in column_values]
    ]

    mp = parse_input_to_metapartition(df_input, metadata_version=4)

    reference_df = pd.DataFrame({"A": ["str"]})
    expected_schema = make_meta(reference_df, origin="expected")
    assert mp.table_meta["table"] == expected_schema
def test_load_partition_keys(store):
    """``partition_keys`` of the loaded dataset equals ``["index", "index2"]``."""
    partitions = {
        "part_1": {"files": {"core_data": "uuid/table/index=1/index2=2/file.parquet"}},
        "part_2": {
            "files": {"core_data": "uuid/table/index=1/index2=2/file2.parquet"}
        },
    }
    indices = {
        "product_id": {
            "1": ["part_1"],
            "2": ["part_2"],
            "100": ["part_1", "part_2"],
            "34": ["part_1"],
        },
        "location_id": {
            "1": ["part_1"],
            "2": ["part_2"],
            "3": ["part_1"],
            "4": ["part_2"],
        },
    }
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": partitions,
        "indices": indices,
    }
    store.put(
        "uuid.by-dataset-metadata.json", simplejson.dumps(meta_dct).encode("utf-8")
    )

    df = pd.DataFrame(
        {"index": [1], "index2": [1], "product_id": [1], "location_id": [1]}
    )
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store, "core_data")

    dmd = DatasetMetadata.load_from_store("uuid", store)
    assert dmd.partition_keys == ["index", "index2"]
def test_reconstruct_index_duplicates(store):
    """Loading yields ``index_col == 2`` (the value in the storage key) for every row."""
    serializer = ParquetSerializer()
    stored_df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = serializer.store(store, key_prefix, stored_df)

    schema = make_meta(stored_df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp_kwargs = {
        "label": "dontcare",
        "file": key,
        "metadata_version": 4,
        "schema": schema,
        "partition_keys": ["index_col"],
    }
    mp = MetaPartition(**mp_kwargs).load_dataframes(store)
    df_actual = mp.data

    expected_columns = OrderedDict()
    expected_columns["index_col"] = [2, 2]
    expected_columns["column"] = list("ab")
    df_expected = pd.DataFrame(expected_columns)

    pdt.assert_frame_equal(df_actual, df_expected)
def test_read_table_meta(store):
    """The schema of a dataset loaded from a dict equals the stored table schema."""
    df1 = pd.DataFrame(
        {
            "location_id": pd.Series([1], dtype=int),
            "x": pd.Series([True], dtype=bool),
        }
    )
    schema1 = make_meta(df1, origin="1")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")

    partition_files = {"table1": "dataset_uuid/table1/location_id=1/part_1.parquet"}
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {"location_id=1/part_1": {"files": partition_files}},
    }

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.schema == schema1
def test_to_dict(metadata_version):
    """``MetaPartition.to_dict`` returns the constructor values plus defaults."""
    df = pd.DataFrame({"A": [1]})
    schema = make_meta(df, origin="test")

    expected_dct = {
        "label": "label_1",
        "data": df,
        "file": "file",
        "indices": {"test": [1, 2, 3]},
        "metadata_version": metadata_version,
        "schema": schema,
        # values that were not passed to the constructor
        "partition_keys": [],
        "logical_conjunction": None,
        "table_name": SINGLE_TABLE,
    }

    mp = MetaPartition(
        label="label_1",
        file="file",
        data=df,
        indices={"test": [1, 2, 3]},
        metadata_version=metadata_version,
        schema=schema,
    )
    mp_dct = mp.to_dict()

    assert mp_dct == expected_dct
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    """The date index column is reconstructed from the storage key on load."""
    serializer = ParquetSerializer()
    # If the parquet file does include the primary index col, still use the
    # reconstructed index and ignore the content of the file
    stored_dates = [date(2018, 6, 1), date(2018, 6, 1)]
    df = pd.DataFrame({"index_col": stored_dates, "column": list("ab")})

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = serializer.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )
    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data

    dt_constructor = date if dates_as_object else datetime
    expected_value = dt_constructor(2018, 6, 2)
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [expected_value, dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)
def test_validate_compatible_other_pandas(df_all_types, remove_metadata, ignore_pandas):
    """Schemas that differ only in the recorded pandas version are validated together."""

    def _schema_for_pandas_version(version):
        # Rewrite the "pandas_version" entry of the schema's pandas metadata.
        base = make_meta(df_all_types, origin=version)
        metadata = base.metadata
        pandas_metadata = simplejson.loads(metadata[b"pandas"].decode("utf8"))
        pandas_metadata["pandas_version"] = version
        metadata[b"pandas"] = simplejson.dumps(pandas_metadata).encode("utf8")
        wrapped = SchemaWrapper(pa.schema(base, metadata), version)
        return wrapped.remove_metadata() if remove_metadata else wrapped

    schema1 = make_meta(df_all_types, origin="all")
    schema2 = _schema_for_pandas_version("0.19.0")
    schema3 = _schema_for_pandas_version("0.99.0")

    must_fail_first = remove_metadata and not ignore_pandas
    if must_fail_first:
        # This should fail as long as we have the metadata attached
        with pytest.raises(ValueError):
            validate_compatible(
                [schema1, schema2, schema3], ignore_pandas=ignore_pandas
            )
        schema1 = schema1.remove_metadata()

    validate_compatible([schema1, schema2, schema3], ignore_pandas=ignore_pandas)
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    (two partition columns: ``location`` and ``product``).
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    filename = "{}.parquet".format(suffix)

    partition0_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-0"), ("product", "P-0")], filename
    )
    partition1_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-1"), ("product", "P-0")], filename
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}

    label0 = "location=L-0/product=P-0/{}".format(suffix)
    label1 = "location=L-1/product=P-0/{}".format(suffix)
    expected_partitions = {
        label0: {"files": {"core": partition0_core}},
        label1: {"files": {"core": partition1_core}},
    }
    expected_indices = {
        "location": {"L-0": [label0], "L-1": [label1]},
        "product": {"P-0": [label0, label1]},
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    schema_df = pd.DataFrame({"location": ["L-0"], "product": ["P-0"]})
    store_schema_metadata(
        make_meta(schema_df, origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions

    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {}
    for column, index in dmd_dict["indices"].items():
        sorted_result[column] = {
            value: sorted(labels) for value, labels in index.items()
        }
    assert sorted_result == expected_indices
def df_all_types_schema(df_all_types):
    """Return the schema ``make_meta`` derives from ``df_all_types``."""
    schema = make_meta(df_all_types, origin="df_all_types")
    return schema
def df_all_types_empty_schema(df_all_types):
    """Return the schema of ``df_all_types`` after dropping the row labelled 0.

    Asserts that the frame is empty once that row is removed.
    """
    without_first_row = df_all_types.drop(0)
    assert without_first_row.empty
    schema = make_meta(without_first_row, origin="df_empty")
    return schema
def test_diff_schemas(df_all_types):
    """Compare the text returned by ``_diff_schemas`` with the expected diff.

    The diff is taken between the schema of ``df_all_types`` and the schema
    of a modified copy of that frame.
    """
    # Prepare a schema with one missing, one additional and one changed column
    df2 = df_all_types.drop(columns=df_all_types.columns[0])
    df2["new_col"] = pd.Series(df_all_types["bool"])
    df2["int16"] = df2["int16"].astype(float)
    df2 = df2.reset_index(drop=True)

    schema2 = make_meta(df2, origin="2")
    schema1 = make_meta(df_all_types, origin="1")
    diff = _diff_schemas(schema1, schema2)
    # NOTE(review): both expected literals are reproduced exactly as they
    # appear in this file, i.e. each on a single line. They read like
    # unified-diff output, so their original line breaks and indentation were
    # presumably lost - restore them from the authoritative test before
    # relying on this assertion.
    expected_arrow_diff = """Arrow schema: @@ -1,5 +1,3 @@ -array_float32: list<item: double> - child 0, item: double array_float64: list<item: double> child 0, item: double array_int16: list<item: int64> @@ -26,10 +24,11 @@ datetime64: timestamp[ns] float32: double float64: double -int16: int64 +int16: double int32: int64 int64: int64 int8: int64 +new_col: bool null: null uint16: uint64 uint32: uint64 """
    expected_pandas_diff = """Pandas_metadata: @@ -3,12 +3,7 @@ 'name': None, 'numpy_type': 'object', 'pandas_type': 'unicode'}], - 'columns': [{'field_name': 'array_float32', - 'metadata': None, - 'name': 'array_float32', - 'numpy_type': 'object', - 'pandas_type': 'list[float64]'}, - {'field_name': 'array_float64', + 'columns': [{'field_name': 'array_float64', 'metadata': None, 'name': 'array_float64', 'numpy_type': 'object', @@ -91,8 +86,8 @@ {'field_name': 'int16', 'metadata': None, 'name': 'int16', - 'numpy_type': 'int64', - 'pandas_type': 'int64'}, + 'numpy_type': 'float64', + 'pandas_type': 'float64'}, {'field_name': 'int32', 'metadata': None, 'name': 'int32', @@ -108,6 +103,11 @@ 'name': 'int8', 'numpy_type': 'int64', 'pandas_type': 'int64'}, + {'field_name': 'new_col', + 'metadata': None, + 'name': 'new_col', + 'numpy_type': 'bool', + 'pandas_type': 'bool'}, {'field_name': 'null', 'metadata': None, 'name': 'null',"""

    # The reported diff is the arrow part directly followed by the pandas part.
    assert diff == expected_arrow_diff + expected_pandas_diff
def test_pickle(df_all_types):
    """A schema survives a pickle round trip unchanged."""
    obj1 = make_meta(df_all_types, origin="df_all_types")
    obj2 = pickle.loads(pickle.dumps(obj1))
    assert obj1 == obj2
def generate_mp():
    """Build a ``MetaPartition`` with a random label and the all-types schema."""
    random_label = uuid.uuid4().hex
    alltypes_schema = make_meta(get_dataframe_alltypes(), origin="alltypes")
    return MetaPartition(
        label=random_label,
        schema=alltypes_schema,
        file="fakefile",
    )
def test_empty_dataframe_from_schema_columns(df_all_types):
    """Only the requested columns appear in the empty frame, in the given order."""
    columns = ["uint64", "int64"]
    schema = make_meta(df_all_types, origin="1")

    actual_df = empty_dataframe_from_schema(schema, columns)

    expected_df = df_all_types.loc[[], ["uint64", "int64"]]
    pdt.assert_frame_equal(actual_df, expected_df)
def test_query_indices_external(store, metadata_version):
    """Query a dataset whose ``product_id`` index lives in a separate parquet file."""
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    index_key = (
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet"
    )

    meta_dct = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
        "partitions": {
            "part_1": {"files": {"core_data": "file.parquest"}},
            "part_2": {"files": {"core_data": "file2.parquest"}},
        },
        "indices": {
            # external index: referenced by its storage key
            "product_id": index_key,
            # embedded index
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(meta_dct).encode("utf-8"),
    )

    partition_lists = [["part_1"], ["part_2"], ["part_1", "part_2"], ["part_1"]]
    df = pd.DataFrame(
        {
            "product_id": [1, 2, 100, 34],
            "partition": [np.array(labels, dtype=object) for labels in partition_lists],
        }
    )
    schema = pa.schema(
        [
            pa.field("partition", pa.list_(pa.string())),
            pa.field("product_id", pa.int64()),
        ]
    )
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(index_key, buf.getvalue().to_pybytes())

    store_schema_metadata(
        make_meta(df, origin="core"),
        dataset_uuid,
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]

    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2, something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]}
    )
    result = dmd.query(
        indices=[additional_index], another_column="1", product_id=2, location_id=2
    )
    assert result == ["part_2"]
def test_schema_dataframe_rountrip(index, df_all_types):
    """A schema and the schema of the empty frame built from it are compatible."""
    indexed_df = pd.DataFrame(df_all_types, index=index)
    schema = make_meta(indexed_df, origin="1")

    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    roundtrip_schema = make_meta(actual_df, origin="2")

    validate_compatible([schema, roundtrip_schema])
def time_make_meta(self):
    """Run ``make_meta`` once on ``self.df``; the result is discarded."""
    frame = self.df
    make_meta(frame, origin="df")
def test_reorder(df_all_types):
    """Reversing the column order of the input frame does not change the schema."""
    reversed_columns = reversed(df_all_types.columns)
    df2 = df_all_types.copy().reindex(reversed_columns, axis=1)

    expected = make_meta(df_all_types, origin="df_all_types")
    actual = make_meta(df2, origin="df2")

    assert expected == actual
def test_unicode_col():
    """``make_meta`` accepts a frame with a non-ASCII column name."""
    frame = pd.DataFrame({"fö": [1]})
    make_meta(frame, origin="df")
def setup(self, num_schemas):
    """Store the all-types frame and ``num_schemas`` deep copies of its schema."""
    self.df = get_dataframe_alltypes()
    base_schema = make_meta(self.df, origin="df")

    copies = []
    for _ in range(num_schemas):
        copies.append(deepcopy(base_schema))
    self.schemas = copies
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    filename = "{}.parquet".format(partition_suffix)

    partition0_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-0"), ("product", "P-0")], filename
    )
    partition1_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-1"), ("product", "P-0")], filename
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}

    label0 = "location=L-0/product=P-0/{}".format(partition_suffix)
    label1 = "location=L-1/product=P-0/{}".format(partition_suffix)
    expected_partitions = {
        label0: {"files": {"core": partition0_core}},
        label1: {"files": {"core": partition1_core}},
    }
    expected_indices = {
        "location": {"L-0": [label0], "L-1": [label1]},
        "product": {"P-0": [label0, label1]},
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    schema_df = pd.DataFrame({"location": ["L-0"], "product": ["P-0"]})
    store_schema_metadata(
        make_meta(schema_df, origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and should not interfere with the
    # indices and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        garbage_keys = (
            "this_should_not_exist{}".format(suffix),
            "{}/this_should_not_exist{}".format(dataset_uuid, suffix),
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core", suffix),
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix
            ),
        )
        for garbage_key in garbage_keys:
            store.put(garbage_key, b"ignore me")

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions

    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {}
    for column, index in dmd_dict["indices"].items():
        sorted_result[column] = {
            value: sorted(labels) for value, labels in index.items()
        }
    assert sorted_result == expected_indices
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    filename = "{}.parquet".format(partition_suffix)

    def _schema_with_int_columns(int_columns, origin):
        # Two integer columns followed by a string "location" column.
        columns = {name: pd.Series([1], dtype=int) for name in int_columns}
        columns["location"] = pd.Series(["str"])
        return make_meta(pd.DataFrame(columns), origin=origin)

    partition0_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-0")], filename
    )
    partition1_core = create_partition_key(
        dataset_uuid, "core", [("location", "L-1")], filename
    )
    partition0_ext = create_partition_key(
        dataset_uuid, "extension", [("location", "L-0")], filename
    )
    partition1_ext = create_partition_key(
        dataset_uuid, "extension", [("location", "L-1")], filename
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}

    label0 = "location=L-0/{}".format(partition_suffix)
    label1 = "location=L-1/{}".format(partition_suffix)
    expected_partitions = {
        label0: {"files": {"core": partition0_core, "extension": partition0_ext}},
        label1: {"files": {"core": partition1_core, "extension": partition1_ext}},
    }
    expected_indices = {"location": {"L-0": [label0], "L-1": [label1]}}

    # put the dataset metadata and two partitions for two tables each to store
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    for partition_key in (
        partition0_core,
        partition1_core,
        partition0_ext,
        partition1_ext,
    ):
        store.put(partition_key, b"test")

    stored_df = pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]})
    store_schema_metadata(
        make_meta(stored_df, origin="stored"),
        dataset_uuid,
        store,
        "core",
    )

    # instantiate metadata to write table metadata
    core_schema = _schema_with_int_columns(("column_0", "column_1"), "core")
    extension_schema = _schema_with_int_columns(
        ("column_77", "column_78"), "extension"
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")

    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    # reload metadata to use table metadata
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
def __init__(
    self,
    label: Optional[str],
    file: Optional[str] = None,
    table_name: str = SINGLE_TABLE,
    data: Optional[pd.DataFrame] = None,
    indices: Optional[Dict[Any, Any]] = None,
    metadata_version: Optional[int] = None,
    schema: Optional[SchemaWrapper] = None,
    partition_keys: Optional[Sequence[str]] = None,
    logical_conjunction: Optional[List[Tuple[Any, str, Any]]] = None,
):
    """
    Initialize the :mod:`kartothek.io` base class MetaPartition.

    The `MetaPartition` is used as a wrapper around the kartothek
    `Partition` and primarily deals with dataframe manipulations,
    in- and output to store.

    The :class:`kartothek.io_components.metapartition` is immutable, i.e. all member
    functions will return a new MetaPartition object where the new
    attribute is changed

    Parameters
    ----------
    label
        partition label
    file
        Reference to the file in store. A falsy value (e.g. an empty
        string) is stored as ``None``.
    table_name
        Name of the table. It is stored on the instance and is part of the
        ``origin`` of a schema that is derived from `data`.
    data
        The materialized in-memory DataFrame of the partition. If it is
        given without `schema`, the schema is derived from it.
    indices
        Kartothek index dictionary, keyed by column. Values that are plain
        dictionaries are wrapped into ``ExplicitSecondaryIndex`` objects;
        all other values are kept as they are. The passed dictionary itself
        is not modified.
    metadata_version
        The metadata version. Defaults to
        ``naming.DEFAULT_METADATA_VERSION``; the effective value is checked
        with ``verify_metadata_version``.
    schema
        The table schema.
    partition_keys
        The dataset partition keys
    logical_conjunction
        A logical conjunction to assign to the MetaPartition. By assigning
        this, the MetaPartition will only be able to load data respecting
        this conjunction.
    """
    if metadata_version is None:
        self.metadata_version = naming.DEFAULT_METADATA_VERSION
    else:
        self.metadata_version = metadata_version
    verify_metadata_version(self.metadata_version)

    self.schema = schema
    self.table_name = table_name
    if data is not None and schema is None:
        self.schema = make_meta(
            data, origin=f"{table_name}/{label}", partition_keys=partition_keys
        )

    # Normalize into a fresh dictionary so the mapping owned by the caller is
    # never altered as a side effect of constructing a MetaPartition.
    normalized_indices = {
        column: (
            ExplicitSecondaryIndex(column=column, index_dct=index)
            if isinstance(index, dict)
            else index
        )
        for column, index in (indices or {}).items()
    }

    self.logical_conjunction = logical_conjunction
    self.metapartitions = [
        {
            "label": label,
            "data": data,
            "file": file or None,
            "indices": normalized_indices,
            "logical_conjunction": logical_conjunction,
        }
    ]
    self.partition_keys = partition_keys or []