def read_table(
    dataset_uuid: Optional[str] = None,
    store=None,
    table: Optional[str] = SINGLE_TABLE,
    columns: Optional[Dict[str, List[str]]] = None,
    concat_partitions_on_primary_index: bool = False,
    predicate_pushdown_to_io: bool = True,
    categoricals: Optional[Dict[str, List[str]]] = None,
    label_filter: Optional[Callable] = None,
    dates_as_object: bool = False,
    predicates: Optional[List[List[Tuple[str, str, Any]]]] = None,
    factory: Optional[DatasetFactory] = None,
) -> pd.DataFrame:
    """
    A utility function to load a single table with multiple partitions as a
    single dataframe in one go. Mostly useful for smaller tables or datasets
    where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting
    dataframe.

    Parameters
    ----------
    table: str
        The table to be loaded.
    columns: List[str]
        The columns to be loaded.
    categoricals: List[str]
        A list of column names which should be retrieved as `pandas.Categorical`.

    Returns
    -------
    pandas.DataFrame
        A pandas.DataFrame holding the data of the requested columns.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')
        >>> df = read_table('dataset_uuid', store, 'core')
    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and "
            "will be removed in the next major release.",
            DeprecationWarning,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    # Append the empty frame so the concat result always carries the full set of
    # columns and dtypes, even when no partition matched the predicates.
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df

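# ---------------------------------------------------------------------------
# Hedged usage sketch for read_table (illustration only, not library code).
# It assumes a writable local store and uses kartothek's eager write API to
# create a small single-table dataset first; the dataset uuid, column names
# and the temporary-directory store are made up for this example.
# ---------------------------------------------------------------------------
def _read_table_usage_sketch():
    import tempfile

    import pandas as pd
    import storefact
    from kartothek.io.eager import read_table, store_dataframes_as_dataset

    store = storefact.get_store_from_url("hfs://" + tempfile.mkdtemp())
    df = pd.DataFrame({"part": ["a", "a", "b"], "value": [1, 2, 3]})

    # Write the dataframe as a one-table dataset (stored under the default
    # single-table name "table"), partitioned by the "part" column.
    store_dataframes_as_dataset(
        store=store, dataset_uuid="demo_dataset", dfs=[df], partition_on=["part"]
    )

    # Read it back as a single dataframe. Predicates are an OR-connected list
    # of AND-connected lists of (column, op, value) tuples.
    result = read_table(
        dataset_uuid="demo_dataset",
        store=store,
        table="table",
        predicates=[[("part", "==", "a")]],
    )
    return result
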
def plan_query(
    conditions,
    cube,
    datasets,
    dimension_columns,
    partition_by,
    payload_columns,
    store,
):
    """
    Plan cube query execution.

    .. important::
        If the intention does not contain a partition-by, the query is
        partitioned by the cube's partition columns to speed up the query on
        parallel backends. In that case, the backend must concatenate and check
        the resulting dataframes before passing them to the user.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by``
        should be returned.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store to query from.

    Returns
    -------
    intent: QueryIntention
        Query intention.
    empty_df: pandas.DataFrame
        Empty DataFrame representing the output types.
    groups: Tuple[QueryGroup]
        Tuple of query groups. May be empty.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets(
            cube=cube, store=store, filter_ktk_cube_dataset_ids=datasets
        )
    else:
        datasets = check_datasets(datasets, cube)

    datasets = {
        ktk_cube_dataset_id: ds.load_partition_indices()
        for ktk_cube_dataset_id, ds in datasets.items()
    }

    indexed_columns = _get_indexed_columns(datasets)

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions=conditions,
        payload_columns=payload_columns,
        indexed_columns=indexed_columns,
    )

    datasets = _load_required_explicit_indices(datasets, intention, store)

    restrictive_dataset_ids = _determine_restrictive_dataset_ids(
        cube=cube, datasets=datasets, intention=intention
    )
    load_columns = _dermine_load_columns(
        cube=cube, datasets=datasets, intention=intention
    )
    datasets = _filter_relevant_datasets(datasets=datasets, load_columns=load_columns)

    empty_df = {
        ktk_cube_dataset_id: _reduce_empty_dtype_sizes(
            empty_dataframe_from_schema(
                schema=ds.table_meta[SINGLE_TABLE],
                columns=sorted(
                    get_dataset_columns(ds) & set(load_columns[ktk_cube_dataset_id])
                ),
            )
        )
        for ktk_cube_dataset_id, ds in datasets.items()
    }

    # Merge the per-dataset empty frames on their shared columns into a single
    # empty frame that describes the dtypes of the final query output.
    empty_df_single = empty_df[cube.seed_dataset].copy()
    for k, df in empty_df.items():
        if k == cube.seed_dataset:
            continue
        if empty_df_single is None:
            empty_df_single = df.copy()
        else:
            empty_df_single = empty_df_single.merge(df)
    empty_df_single = empty_df_single[list(intention.output_columns)]

    groups = regroup(
        intention,
        cube=cube,
        datasets=datasets,
        empty_df=empty_df,
        indexed_columns=indexed_columns,
        load_columns=load_columns,
        restrictive_dataset_ids=restrictive_dataset_ids,
    )
    return intention, empty_df_single, groups

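# ---------------------------------------------------------------------------
# Standalone pandas sketch (illustration only, not library code) of the
# empty-frame merge performed in plan_query above: merging per-dataset empty
# frames on their shared columns yields one empty frame that carries the
# union of output columns and their types. Column names are made up.
# ---------------------------------------------------------------------------
def _empty_frame_merge_sketch():
    import numpy as np
    import pandas as pd

    seed = pd.DataFrame(
        {"x": pd.Series([], dtype=np.int64), "p": pd.Series([], dtype=object)}
    )
    enrich = pd.DataFrame(
        {"x": pd.Series([], dtype=np.int64), "v": pd.Series([], dtype=np.float64)}
    )

    # pd.merge joins on the shared column "x"; the result is still empty but
    # exposes the combined set of output columns.
    combined = seed.merge(enrich)
    assert list(combined.columns) == ["x", "p", "v"]
    return combined
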
def test_schema_dataframe_roundtrip(index, df_all_types):
    df = pd.DataFrame(df_all_types, index=index)

    schema = make_meta(df, origin="1")
    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    validate_compatible([schema, make_meta(actual_df, origin="2")])

def test_empty_dataframe_from_schema_columns(df_all_types):
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema, ["uint64", "int64"])

    expected_df = df_all_types.loc[[], ["uint64", "int64"]]
    pdt.assert_frame_equal(actual_df, expected_df)

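# ---------------------------------------------------------------------------
# Conceptual sketch (not kartothek's implementation) of what the two tests
# above exercise: deriving an empty, correctly typed dataframe from an Arrow
# schema, as make_meta / empty_dataframe_from_schema do. The "date" field is
# made up; "uint64" and "int64" follow the column names used in the tests.
# ---------------------------------------------------------------------------
def _empty_dataframe_from_arrow_schema_sketch():
    import pyarrow as pa

    schema = pa.schema(
        [("uint64", pa.uint64()), ("int64", pa.int64()), ("date", pa.date32())]
    )
    empty_df = schema.empty_table().to_pandas(date_as_object=True)

    assert len(empty_df) == 0
    assert list(empty_df.columns) == ["uint64", "int64", "date"]
    return empty_df
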
def read_table(
    dataset_uuid=None,
    store=None,
    table=None,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a
    single dataframe in one go. Mostly useful for smaller tables or datasets
    where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting
    dataframe.

    Parameters
    ----------
    table: str
        The table to be loaded.
    columns: List[str]
        The columns to be loaded.
    categoricals: List[str]
        A list of column names which should be retrieved as `pandas.Categorical`.

    Returns
    -------
    pandas.DataFrame
        A pandas.DataFrame holding the data of the requested columns.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')
        >>> df = read_table('dataset_uuid', store, 'core')
    """
    if table is None:
        raise TypeError("Parameter `table` is not optional.")
    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    # Append the empty frame so the concat result always carries the full set of
    # columns and dtypes, even when no partition matched the predicates.
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df