import itertools
from copy import copy
from functools import reduce
from typing import Dict, Set, Tuple, Union, cast

from kartothek.core.cube.conditions import Conjunction
from kartothek.core.cube.cube import Cube
from kartothek.core.dataset import DatasetMetadata
from kartothek.core.index import ExplicitSecondaryIndex, IndexBase, PartitionIndex
from kartothek.utils.ktk_adapters import get_dataset_columns

# QueryIntention and the _process_dimension_columns / _process_partition_by /
# _process_payload / _test_condition_types helpers are defined elsewhere in
# the same module and are assumed to be in scope here.

def determine_intention(
    cube,
    datasets,
    dimension_columns,
    partition_by,
    conditions,
    payload_columns,
    indexed_columns,
):
    """
    Determine and check user intention during the query process.

    Parameters
    ----------
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    dimension_columns: Optional[Iterable[str]]
        Dimension columns of the query; may result in a projection.
    partition_by: Optional[Iterable[str]]
        Columns by which logical partitions should be formed.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    payload_columns: Optional[Iterable[str]]
        Which columns apart from ``dimension_columns`` and ``partition_by`` should be returned from the query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    intention: QueryIntention
        Checked and filled-in intention of the user.
    """
    all_available_columns = set(
        itertools.chain.from_iterable(
            [get_dataset_columns(ds) for ds in datasets.values()]))

    dimension_columns = _process_dimension_columns(
        dimension_columns=dimension_columns, cube=cube)
    partition_by = _process_partition_by(
        partition_by=partition_by,
        cube=cube,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )

    conditions_pre, conditions_post = _process_conditions(
        conditions=conditions,
        cube=cube,
        datasets=datasets,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )

    payload_columns = _process_payload(
        payload_columns=payload_columns,
        all_available_columns=all_available_columns,
        cube=cube,
    )
    output_columns = tuple(
        sorted(
            set(partition_by)
            | set(dimension_columns)
            | set(payload_columns)
            | set(cube.partition_columns)))

    return QueryIntention(
        dimension_columns=dimension_columns,
        partition_by=partition_by,
        conditions_pre=conditions_pre,
        conditions_post=conditions_post,
        output_columns=output_columns,
    )
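

# A minimal usage sketch, not part of the original module: it shows how
# determine_intention might be called. Deriving indexed_columns from
# ``ds.indices`` is an assumption made here for illustration only.
def _example_determine_intention(cube, datasets):
    # Indexed columns per ktk_cube dataset ID, derived from the dataset
    # metadata itself.
    indexed_columns = {
        ktk_cube_dataset_id: set(ds.indices.keys())
        for ktk_cube_dataset_id, ds in datasets.items()
    }
    # Passing None everywhere leaves the defaults to the ``_process_*``
    # helpers (assumed behavior, based on the signatures above).
    return determine_intention(
        cube=cube,
        datasets=datasets,
        dimension_columns=None,
        partition_by=None,
        conditions=None,
        payload_columns=None,
        indexed_columns=indexed_columns,
    )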


def _check_indices(datasets: Dict[str, DatasetMetadata], cube: Cube) -> None:
    """
    Check if required indices are present in given datasets.

    For all datasets, the primary indices must be equal to ``ds.partition_keys``. For the seed dataset, secondary
    indices for all dimension columns except those in ``cube.suppress_index_on`` are expected.

    Additional indices are accepted and will not be reported as an error.

    Parameters
    ----------
    datasets
        Datasets.
    cube
        Cube specification.

    Raises
    ------
    ValueError: In case indices are broken.
    """
    for ktk_cube_dataset_id in sorted(datasets.keys()):
        ds = datasets[ktk_cube_dataset_id]
        primary_indices = ds.partition_keys
        columns = get_dataset_columns(ds)
        secondary_indices = set()
        any_indices = set(cube.index_columns) & columns

        if ktk_cube_dataset_id == cube.seed_dataset:
            secondary_indices |= set(cube.dimension_columns) - set(
                cube.suppress_index_on)

        for types_untyped, elements in (
            ((PartitionIndex,), primary_indices),
            ((ExplicitSecondaryIndex,), secondary_indices),
            ((ExplicitSecondaryIndex, PartitionIndex), any_indices),
        ):
            types = cast(Tuple[type, ...], types_untyped)

            tname = " or ".join(t.__name__ for t in types)

            # it seems that partition indices are not always present (e.g. for empty datasets), so add partition keys to
            # the set
            indices = cast(Dict[str, Union[IndexBase, str]], copy(ds.indices))
            if PartitionIndex in types:
                for pk in ds.partition_keys:
                    if pk not in indices:
                        indices[pk] = "dummy"

            for e in sorted(elements):
                if e not in indices:
                    raise ValueError(
                        '{tname} "{e}" is missing in dataset "{ktk_cube_dataset_id}".'
                        .format(tname=tname,
                                e=e,
                                ktk_cube_dataset_id=ktk_cube_dataset_id))

                idx = indices[e]
                t2 = type(idx)
                tname2 = t2.__name__
                if (idx != "dummy") and (not isinstance(idx, types)):
                    raise ValueError(
                        '"{e}" in dataset "{ktk_cube_dataset_id}" is of type {tname2} but should be {tname}.'
                        .format(
                            tname=tname,
                            tname2=tname2,
                            e=e,
                            ktk_cube_dataset_id=ktk_cube_dataset_id,
                        ))
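

# Illustrative sketch with assumed values, not from the original module: for
# a cube with dimension columns ("x", "y") and partition column "p", every
# dataset must carry a PartitionIndex on "p", and the seed dataset must
# additionally carry an ExplicitSecondaryIndex on "x" and "y" (unless they
# are listed in ``cube.suppress_index_on``).
def _example_check_indices(datasets):
    cube = Cube(
        dimension_columns=["x", "y"],
        partition_columns=["p"],
        uuid_prefix="example_cube",
        seed_dataset="seed",
    )
    # Raises ValueError if an index is missing or has the wrong type.
    _check_indices(datasets, cube)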


def _process_conditions(conditions, cube, datasets, all_available_columns,
                        indexed_columns):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, Conjunction]
        Conditions to be applied during the load process.

    Raises
    ------
    TypeError: In case of a wrong type.
    ValueError: In case condition columns are missing from the cube.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}"
            .format(missing=", ".join(sorted(missing))))
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items()
            if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two,
                                                     filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns)
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items()
            if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two,
                                                      filtered)

    return conditions_pre, conditions_post
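

# Sketch of the pre/post split using hypothetical column names: a condition
# on an indexed column can be evaluated from index data alone ("pre"), while
# a condition on a plain payload column must wait until load time ("post").
# ``C`` is kartothek's condition shorthand; its import path is assumed here.
def _example_process_conditions(cube, datasets, indexed_columns):
    from kartothek.core.cube.conditions import C

    conditions = (C("x") > 0) & (C("payload") == "a")
    all_available_columns = set(
        itertools.chain.from_iterable(
            get_dataset_columns(ds) for ds in datasets.values()))
    # "x > 0" lands in conditions_pre for datasets where "x" is indexed;
    # "payload == 'a'" lands in conditions_post for datasets containing it.
    return _process_conditions(
        conditions=conditions,
        cube=cube,
        datasets=datasets,
        all_available_columns=all_available_columns,
        indexed_columns=indexed_columns,
    )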