def apply_postwrite_checks(datasets, cube, store, existing_datasets):
    """
    Apply sanity checks that can only be done after Kartothek has written its datasets.

    Parameters
    ----------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that just got written.
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    store: Union[Callable[[], simplekv.KeyValueStore], simplekv.KeyValueStore]
        KV store.
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that were present before the write procedure started.

    Returns
    -------
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that just got written.

    Raises
    ------
    ValueError
        If a sanity check failed.
    """
    try:
        empty_datasets = {
            ktk_cube_dataset_id
            for ktk_cube_dataset_id, ds in datasets.items()
            if len(ds.partitions) == 0
        }
        if empty_datasets:
            raise ValueError(
                "Cannot write empty datasets: {empty_datasets}".format(
                    empty_datasets=", ".join(sorted(empty_datasets))
                )
            )

        datasets_to_check = copy(existing_datasets)
        datasets_to_check.update(datasets)
        check_datasets(datasets_to_check, cube)
    except Exception as e:
        _rollback_transaction(
            existing_datasets=existing_datasets, new_datasets=datasets, store=store
        )
        raise MultiTableCommitAborted(
            "Post commit check failed. Operation rolled back."
        ) from e
    return datasets
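
# --- Usage sketch (illustrative, not part of the kartothek source) ----------------------
# How the post-write check is meant to be wired into a write pipeline: the freshly written
# datasets are validated against the cube spec and the datasets that existed before the
# commit; on failure the transaction is rolled back and MultiTableCommitAborted is raised.
# The helper name `finalize_cube_write` and its arguments are assumptions for illustration.


def finalize_cube_write(written_datasets, cube, store_factory, existing_datasets):
    """Validate a finished cube write; on failure the store is already rolled back."""
    try:
        return apply_postwrite_checks(
            datasets=written_datasets,
            cube=cube,
            store=store_factory,
            existing_datasets=existing_datasets,
        )
    except MultiTableCommitAborted:
        # apply_postwrite_checks has already restored the pre-write state at this point;
        # re-raise so callers can report the aborted commit.
        raise
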
def discover_datasets(
    cube: Cube,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that belong to a given cube.

    Parameters
    ----------
    cube
        Cube specification.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.

    Raises
    ------
    ValueError
        In case no valid cube could be discovered.
    """
    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids
    )
    result = discover_datasets_unchecked(
        cube.uuid_prefix, store, filter_ktk_cube_dataset_ids
    )
    if filter_ktk_cube_dataset_ids is not None:
        if isinstance(filter_ktk_cube_dataset_ids, str):
            filter_ktk_cube_dataset_ids = {filter_ktk_cube_dataset_ids}
        else:
            filter_ktk_cube_dataset_ids = set(filter_ktk_cube_dataset_ids)
        missing = filter_ktk_cube_dataset_ids - set(result.keys())
        if missing:
            raise ValueError(
                "Could not find the following requested datasets: {missing}".format(
                    missing=", ".join(sorted(missing))
                )
            )
    check_datasets(result, cube)
    return result
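
# --- Usage sketch (illustrative, not part of the kartothek source) ----------------------
# How the dataset filter is meant to be used: restrict discovery to a subset of the cube's
# datasets and fail loudly if a requested dataset is absent. The "enrich" dataset id and
# the helper name `select_seed_and_enrich` are assumptions for illustration only.


def select_seed_and_enrich(cube, store_factory):
    """Return the seed dataset plus a hypothetical "enrich" dataset of the cube.

    Raises ValueError if either requested dataset does not exist in the store.
    """
    return discover_datasets(
        cube=cube,
        store=store_factory,
        filter_ktk_cube_dataset_ids={cube.seed_dataset, "enrich"},
    )
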
def discover_cube(
    uuid_prefix: str,
    store: Union[Callable[[], KeyValueStore], KeyValueStore],
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Tuple[Cube, Dict[str, DatasetMetadata]]:
    """
    Recover cube information from store.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    cube: Cube
        Cube specification.
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets.
    """
    datasets = discover_datasets_unchecked(
        uuid_prefix, store, filter_ktk_cube_dataset_ids
    )

    seed_candidates = {
        ktk_cube_dataset_id
        for ktk_cube_dataset_id, ds in datasets.items()
        if ds.metadata.get(
            KTK_CUBE_METADATA_KEY_IS_SEED, ds.metadata.get("klee_is_seed", False)
        )
    }
    if len(seed_candidates) == 0:
        raise ValueError(
            'Could not find seed dataset for cube "{uuid_prefix}".'.format(
                uuid_prefix=uuid_prefix
            )
        )
    elif len(seed_candidates) > 1:
        raise ValueError(
            'Found multiple possible seed datasets for cube "{uuid_prefix}": {seed_candidates}'.format(
                uuid_prefix=uuid_prefix,
                seed_candidates=", ".join(sorted(seed_candidates)),
            )
        )
    seed_dataset = list(seed_candidates)[0]

    seed_ds = datasets[seed_dataset]
    dimension_columns = seed_ds.metadata.get(
        KTK_CUBE_METADATA_DIMENSION_COLUMNS,
        seed_ds.metadata.get("klee_dimension_columns"),
    )
    if dimension_columns is None:
        raise ValueError(
            'Could not recover dimension columns from seed dataset ("{seed_dataset}") of cube "{uuid_prefix}".'.format(
                seed_dataset=seed_dataset, uuid_prefix=uuid_prefix
            )
        )

    # Datasets written with new kartothek versions (after merge of PR#7747) always set
    # KTK_CUBE_METADATA_PARTITION_COLUMNS and "klee_timestamp_column" in the metadata.
    # Older versions of ktk_cube do not write these; instead, these columns are inferred
    # from the actual partitioning: partition_columns are all but the last partition key.
    #
    # TODO: once we're sure we have re-written all kartothek cubes, the code in the
    # branch `if partition_columns is None` below can be removed.
    #
    # Read the now unused timestamp column just to make sure we can still read older cubes.
    #
    # TODO: once all cubes are re-created and don't use the timestamp column anymore,
    # remove the timestamp column handling entirely.
    partition_columns = seed_ds.metadata.get(
        KTK_CUBE_METADATA_PARTITION_COLUMNS,
        seed_ds.metadata.get("klee_partition_columns"),
    )
    timestamp_column = seed_ds.metadata.get("klee_timestamp_column")

    if partition_columns is None:
        # Infer the partition columns and timestamp column from the actual partitioning.
        partition_keys = seed_ds.partition_keys
        if len(partition_keys) == 0:
            raise ValueError(
                'Seed dataset ("{seed_dataset}") has no partition keys.'.format(  # type: ignore # noqa
                    seed_dataset=seed_dataset,
                    partition_keys=", ".join(partition_keys),
                )
            )
        elif len(partition_keys) < 2:
            raise ValueError(
                (
                    'Seed dataset ("{seed_dataset}") has only a single partition key ({partition_key}) '
                    "but should have at least 2."
                ).format(seed_dataset=seed_dataset, partition_key=partition_keys[0])
            )
        partition_columns = partition_keys[:-1]
        timestamp_column = partition_keys[-1]

    index_columns = set()
    for ds in datasets.values():
        index_columns |= set(ds.indices.keys()) - (
            set(dimension_columns) | set(partition_columns) | {timestamp_column}
        )

    # We only support the default timestamp column in the compat code.
    if (timestamp_column is not None) and (timestamp_column != "KLEE_TS"):
        raise NotImplementedError(
            f"Can only read old cubes if the timestamp column is 'KLEE_TS', but '{timestamp_column}' was detected."
        )

    cube = Cube(
        uuid_prefix=uuid_prefix,
        dimension_columns=dimension_columns,
        partition_columns=partition_columns,
        index_columns=index_columns,
        seed_dataset=seed_dataset,
    )

    datasets = check_datasets(datasets, cube)
    return cube, datasets
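
# --- Usage sketch (illustrative, not part of the kartothek source) ----------------------
# Recovering a cube spec plus all of its datasets from a store. The UUID prefix "my_cube"
# and the helper name `recover_cube` are assumptions for illustration; `store_factory`
# stands for anything accepted as a KV store above (a store or a callable returning one).


def recover_cube(store_factory, uuid_prefix="my_cube"):
    """Return the reconstructed Cube spec and its datasets, or fail with ValueError."""
    cube, datasets = discover_cube(uuid_prefix=uuid_prefix, store=store_factory)
    # `cube` now carries the recovered dimension/partition/index columns and the seed
    # dataset name; `datasets` maps ktk_cube dataset ids to DatasetMetadata objects.
    return cube, datasets
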
def plan_query(
    conditions,
    cube,
    datasets,
    dimension_columns,
    partition_by,
    payload_columns,
    store,
):
    """
    Plan cube query execution.

    .. important::
        If the intention does not contain a partition-by, this will partition by the cube
        partition columns to speed up the query on parallel backends. In that case, the
        backend must concat and check the resulting DataFrames before passing them to the
        user.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Union[None, Iterable[str], Dict[str, kartothek.core.dataset.DatasetMetadata]]
        Datasets to query, must all be part of the cube.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query, may result in projection.
    partition_by: Optional[Iterable[str]]
        By which column logical partitions should be formed.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store to query from.

    Returns
    -------
    intent: QueryIntention
        Query intention.
    empty_df: pandas.DataFrame
        Empty DataFrame representing the output types.
    groups: Tuple[QueryGroup, ...]
        Tuple of query groups. May be empty.
    """
    if callable(store):
        store = store()

    if not isinstance(datasets, dict):
        datasets = discover_datasets(
            cube=cube, store=store, filter_ktk_cube_dataset_ids=datasets
        )
    else:
        datasets = check_datasets(datasets, cube)

    datasets = {
        ktk_cube_dataset_id: ds.load_partition_indices()
        for ktk_cube_dataset_id, ds in datasets.items()
    }
    indexed_columns = _get_indexed_columns(datasets)

    intention = determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions=conditions,
        payload_columns=payload_columns,
        indexed_columns=indexed_columns,
    )

    datasets = _load_required_explicit_indices(datasets, intention, store)

    restrictive_dataset_ids = _determine_restrictive_dataset_ids(
        cube=cube, datasets=datasets, intention=intention
    )
    load_columns = _dermine_load_columns(
        cube=cube, datasets=datasets, intention=intention
    )
    datasets = _filter_relevant_datasets(datasets=datasets, load_columns=load_columns)

    empty_df = {
        ktk_cube_dataset_id: _reduce_empty_dtype_sizes(
            empty_dataframe_from_schema(
                schema=ds.schema,
                columns=sorted(
                    get_dataset_columns(ds) & set(load_columns[ktk_cube_dataset_id])
                ),
            )
        )
        for ktk_cube_dataset_id, ds in datasets.items()
    }

    empty_df_single = empty_df[cube.seed_dataset].copy()
    for k, df in empty_df.items():
        if k == cube.seed_dataset:
            continue
        if empty_df_single is None:
            empty_df_single = df.copy()
        else:
            empty_df_single = empty_df_single.merge(df)
    empty_df_single = empty_df_single[list(intention.output_columns)]

    groups = regroup(
        intention,
        cube=cube,
        datasets=datasets,
        empty_df=empty_df,
        indexed_columns=indexed_columns,
        load_columns=load_columns,
        restrictive_dataset_ids=restrictive_dataset_ids,
    )
    return intention, empty_df_single, groups
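
# --- Usage sketch (illustrative, not part of the kartothek source) ----------------------
# Planning a query over an entire cube. The helper name `plan_full_cube_scan` is an
# assumption for illustration; a real backend would then execute the returned groups
# (typically in parallel) and, per the docstring above, concatenate and check the
# per-group DataFrames itself before handing them to the user.


def plan_full_cube_scan(cube, store_factory, conditions=None):
    """Plan a query over all cube datasets; returns (intention, typed empty frame, groups)."""
    return plan_query(
        conditions=conditions,
        cube=cube,
        datasets=None,           # not a dict: all cube datasets are discovered from the store
        dimension_columns=None,  # no projection, keep all dimension columns
        partition_by=None,       # backend falls back to the cube partition columns
        payload_columns=None,    # no explicit payload column selection
        store=store_factory,
    )
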