def test_read_or_predicates(store_factory, partition_on):
    # https://github.com/JDASoftwareGroup/kartothek/issues/295
    dataset_uuid = "test"
    df = pd.DataFrame({"A": range(10), "B": ["A", "B"] * 5, "C": range(-10, 0)})

    store_dataframes_as_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        dfs=[df],
        partition_on=partition_on,
    )

    # Predicates are given in disjunctive normal form: the inner lists are
    # AND-connected, the outer list OR-connects them.
    df1 = read_table(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        predicates=[[("A", "<", 3)], [("A", ">", 5)], [("B", "==", "non-existent")]],
    )

    df2 = read_table(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        predicates=[[("A", "<", 3)], [("A", ">", 5)]],
    )

    expected = pd.DataFrame(
        data={
            "A": [0, 1, 2, 6, 7, 8, 9],
            "B": ["A", "B", "A", "A", "B", "A", "B"],
            "C": [-10, -9, -8, -4, -3, -2, -1],
        },
    )

    # The additional predicate on a non-existent value must not change the result.
    pd.testing.assert_frame_equal(df1, df2)
    pd.testing.assert_frame_equal(expected, df2)
def assert_target_ktk_readable(tgt_store, tgt_ds):
    """
    Try to read the target dataset using high level KTK functionality
    """
    df_result = read_table(store=tgt_store, dataset_uuid=tgt_ds)
    assert df_result is not None
    assert len(df_result) == 10

    df_result = read_table(
        store=tgt_store, dataset_uuid=tgt_ds, predicates=[[("bool", "==", True)]]
    )
    assert len(df_result) == 5

    df_result = read_table(
        store=tgt_store, dataset_uuid=tgt_ds, predicates=[[("bytes", "==", b"2")]]
    )
    assert len(df_result) == 1
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False

    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(
        store=store, dataset_uuid=dataset.uuid, data=new_data
    )
    new_partition = [{"label": new_metapartition.label, "data": [("core", None)]}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True

    actual = read_table(store=store, table="core", dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(df_expected, actual)
def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False

    new_metapartition = write_single_partition(
        store=store, dataset_uuid=dataset.uuid, data=df
    )
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[new_metapartition],
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True

    actual = read_table(store=store, dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(df_expected, actual)
def test_commit_dataset_from_metapartition(dataset_function, store):
    new_data = [
        pd.DataFrame(
            OrderedDict(
                [
                    ("P", [5]),
                    ("L", [5]),
                    ("TARGET", [5]),
                    ("DATE", [datetime.date(2016, 3, 23)]),
                ]
            )
        )
    ]
    new_partition = write_single_partition(
        store=store, dataset_uuid=dataset_function.uuid, data=new_data
    )
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store
    )
    # Cannot assert equal since the metadata is differently ordered
    assert pre_commit_dataset == dataset_function

    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset != dataset_function
    assert updated_dataset.uuid == dataset_function.uuid
    assert len(updated_dataset.partitions) == len(dataset_function.partitions) + 1

    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(
        uuid=updated_dataset.uuid, store=store
    )
    assert loaded_dataset == updated_dataset

    # Read the data and check whether the rows above are included.
    # This checks whether all necessary information was updated in the header
    # (e.g. files attributes of the partitions)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                (
                    "DATE",
                    [
                        datetime.date(2016, 3, 23),
                        datetime.date(2010, 1, 1),
                        datetime.date(2009, 12, 31),
                    ],
                ),
                ("L", [5, 1, 2]),
                ("P", [5, 1, 2]),
                ("TARGET", [5, 1, 2]),
            ]
        )
    )
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)

    assert_frame_equal(df_expected, actual)
def _read_table(*args, **kwargs):
    kwargs.pop("dispatch_by", None)
    res = read_table(*args, **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
def _read_table(*args, **kwargs):
    if "tables" in kwargs:
        kwargs.pop("tables")
    res = read_table(*args, table="core", **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
def _read_table(*args, **kwargs):
    if "tables" in kwargs:
        param_tables = kwargs.pop("tables")
        kwargs["table"] = param_tables
    res = read_table(*args, **kwargs)

    if len(res):
        # Array split conserves dtypes
        return np.array_split(res, len(res))
    else:
        return [res]
def test_read_table_with_columns(dataset, store_session):
    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        columns=["P", "L"],
    )
    expected_df = pd.DataFrame({"P": [1, 2], "L": [1, 2]})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)
    expected_df = expected_df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=False, check_like=True)
def test_non_default_table_name_roundtrip(store_factory, bound_store_dataframes):
    df = pd.DataFrame({"A": [1]})
    bound_store_dataframes(
        [df], store=store_factory, dataset_uuid="dataset_uuid", table_name="foo"
    )

    # All data files (everything except the index files) should carry the custom table name
    for k in store_factory():
        if k.endswith(".parquet") and "indices" not in k:
            assert "foo" in k

    result = read_table(dataset_uuid="dataset_uuid", store=store_factory)
    pdt.assert_frame_equal(df, result)
def test_read_table_simple_list_for_cols_cats(dataset, store_session):
    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table=SINGLE_TABLE,
        columns=["P", "L"],
        categoricals=["P", "L"],
    )
    expected_df = pd.DataFrame({"P": [1, 2], "L": [1, 2]})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)
    expected_df = expected_df.sort_values(by="P").reset_index(drop=True)
    expected_df = expected_df.astype("category")

    pdt.assert_frame_equal(df, expected_df, check_dtype=False, check_like=True)
def test_update_respects_ktk_cube_dataset_ids(
    driver, function_store, ktk_cube_dataset_ids
):
    df_source, cube = _write_cube(function_store)
    df_ex = _extend_cube(cube, function_store)

    remove_conditions = C("p") == 0

    # This implicitly also tests that `data={}` behaves as expected and still deletes
    # partitions as requested via ktk_cube_dataset_ids and remove_conditions
    result = driver(
        data={},
        remove_conditions=remove_conditions,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=ktk_cube_dataset_ids,
    )
    assert set(result) == ktk_cube_dataset_ids

    df_read = query_cube(cube, function_store)[0]

    # expected result: df_source left joined with df_ex; choosing the subset of p != 0
    # from each that is in `ktk_cube_dataset_ids`:
    if "source" in ktk_cube_dataset_ids:
        df_source = df_source.loc[df_source["p"] != 0]
    if "ex" in ktk_cube_dataset_ids:
        df_ex = df_ex.loc[df_ex["p"] != 0]
    df_expected = df_source.merge(df_ex[["x", "a"]], how="left", on="x")
    df_expected = df_expected[sorted(df_expected.columns)]
    pd.testing.assert_frame_equal(df_read, df_expected)

    # test "ex" separately, because the test above based on the *left* merge does not
    # tell us much about "ex" in case the partitions were removed from "source"
    df_ex_read = read_table(cube.ktk_dataset_uuid("ex"), function_store)
    if "ex" in ktk_cube_dataset_ids:
        assert set(df_ex_read["p"]) == {1}
    else:
        assert set(df_ex_read["p"]) == {0, 1}
def test_read_table_eager(dataset, store_session, use_categoricals):
    if use_categoricals:
        categories = ["P"]
    else:
        categories = None

    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        categoricals=categories,
    )
    expected_df = pd.DataFrame(
        {
            "P": [1, 2],
            "L": [1, 2],
            "TARGET": [1, 2],
            "DATE": [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)],
        }
    )
    if categories:
        expected_df = expected_df.astype({"P": "category"})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=True, check_like=True)
def test_read_table_eager(dataset, store_session, use_categoricals):
    if use_categoricals:
        categories = {"core": ["P"]}
    else:
        categories = None

    df = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="core",
        categoricals=categories,
    )
    expected_df = pd.DataFrame(
        {
            "P": [1, 2],
            "L": [1, 2],
            "TARGET": [1, 2],
            "DATE": pd.to_datetime(
                [datetime.date(2010, 1, 1), datetime.date(2009, 12, 31)]
            ),
        }
    )
    if categories:
        expected_df = expected_df.astype({"P": "category"})

    # No stability of partitions
    df = df.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df, expected_df, check_dtype=True, check_like=True)

    df_2 = read_table(store=store_session, dataset_uuid="dataset_uuid", table="helper")
    expected_df_2 = pd.DataFrame({"P": [1, 2], "info": ["a", "b"]})

    assert isinstance(df_2, pd.DataFrame)

    # No stability of partitions
    df_2 = df_2.sort_values(by="P").reset_index(drop=True)

    pdt.assert_frame_equal(df_2, expected_df_2, check_dtype=True, check_like=True)

    df_3 = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="helper",
        predicates=[[("P", "==", 2)]],
    )
    expected_df_3 = pd.DataFrame({"P": [2], "info": ["b"]})

    assert isinstance(df_3, pd.DataFrame)

    pdt.assert_frame_equal(df_3, expected_df_3, check_dtype=True, check_like=True)

    df_4 = read_table(
        store=store_session,
        dataset_uuid="dataset_uuid",
        table="helper",
        predicates=[[("info", "==", "a")]],
    )
    expected_df_4 = pd.DataFrame({"P": [1], "info": ["a"]})

    assert isinstance(df_4, pd.DataFrame)

    pdt.assert_frame_equal(df_4, expected_df_4, check_dtype=True, check_like=True)
from functools import partial
from tempfile import TemporaryDirectory

from storefact import get_store_from_url

from kartothek.io.eager import read_table

dataset_dir = TemporaryDirectory()

store_factory = partial(
    get_store_from_url,
    "hfs:///Users/1019021/Learn/python-python-parquet/resources",
)

print(
    read_table(
        "order_proposals_a6e8aef43203", store_factory, table="order_proposals"
    )
)
from functools import partial

from storefact import get_store_from_url

from kartothek.io.eager import read_table
from kartothek.io.iter import read_dataset_as_dataframes__iterator
from kartothek.io.dask.delayed import read_dataset_as_delayed

import pandas as pd
import dask

percent_encoded_sas_token = ""

store_factory = partial(
    get_store_from_url,
    f"hazure://*****:*****@ktkfiles?use_sas=true&create_if_missing=false",
)

# Approach 1
# read all at once
df = read_table(
    dataset_uuid="order_proposals_a6e8aef43203",
    store=store_factory,
    table="order_proposals",
)
# write aggregated df to disk
df.to_parquet('sample.python-parquet', engine='pyarrow')

# Approach 2
# read iteratively
df_frames = pd.DataFrame()
for partition_index, df_dict in enumerate(
    read_dataset_as_dataframes__iterator(
        dataset_uuid="order_proposals_a6e8aef43203", store=store_factory
    )
):
    # print(f"Partition #{partition_index}")
    for table_name, table_df in df_dict.items():
        # print(f"Table: {table_name}. Data: \n{table_df}")
        df_frames = df_frames.append(table_df)

# write aggregated df to disk
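# Note (not part of the original snippet): Approach 2 stops at the comment above
# without actually writing df_frames. A minimal completion mirroring Approach 1
# might look like the line below; the file name is illustrative only.
df_frames.to_parquet("sample_iterative.parquet", engine="pyarrow")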