Example #1
def test_empty_real(self):
    conj = Conjunction([])
    assert conj.conditions == ()
    assert str(conj) == ""
    assert conj.columns == set()
    assert conj.predicate is None
    assert conj.split_by_column() == {}
Example #2
def test_empty_pseudo(self):
    cond = InIntervalCondition("x")
    conj = Conjunction([cond])
    assert conj.conditions == (cond,)
    assert str(conj) == "(x.in_interval(None, None))"
    assert conj.columns == set()
    assert conj.predicate is None
    assert conj.split_by_column() == {}
Example #3
import numpy as np

from kartothek.core.cube.conditions import (
    Conjunction,
    InIntervalCondition,
    IsInCondition,
)


def apply_condition_unsafe(df, cond):
    # For the sparse_outer test set, the test_df has the wrong datatype because we cannot encode missing integer data
    # in pandas.
    #
    # The condition will not be applicable to the DF because the DF has floats while the conditions have ints. We fix
    # that by modifying the condition.
    #
    # In case there is no missing data because of the right conditions, kartothek will return integer data.
    # assert_frame_equal will then complain about this. So in case there is no missing data, let's recover the correct
    # dtype here.

    if not isinstance(cond, Conjunction):
        cond = Conjunction(cond)

    float_cols = {col for col in df.columns if df[col].dtype == float}

    # convert int to float conditions
    cond2 = Conjunction([])
    for col, conj in cond.split_by_column().items():
        if col in float_cols:
            parts = []
            for part in conj.conditions:
                if isinstance(part, IsInCondition):
                    part = IsInCondition(column=part.column,
                                         value=tuple(
                                             (float(v) for v in part.value)))
                elif isinstance(part, InIntervalCondition):
                    part = InIntervalCondition(
                        column=part.column,
                        start=float(part.start),
                        stop=float(part.stop),
                    )
                else:
                    part = part.__class__(column=part.column,
                                          value=float(part.value))
                parts.append(part)
            conj = Conjunction(parts)
        cond2 &= conj

    # apply conditions
    df = cond2.filter_df(df).reset_index(drop=True)

    # convert float columns to int columns
    for col in df.columns:
        if df[col].notnull().all():
            dtype = df[col].dtype
            if dtype == np.float64:
                dtype = np.int64
            elif dtype == np.float32:
                dtype = np.int32
            elif dtype == np.float16:
                dtype = np.int16

            df[col] = df[col].astype(dtype)

    return df
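The comment block above describes a round trip: integer columns with missing data are upcast to float by pandas, so integer-valued conditions are rewritten to floats before filtering, and fully populated float columns are cast back to integers afterwards. A minimal usage sketch of that behaviour, assuming the definition above plus pandas/numpy and kartothek's condition classes; the column names and values are made up for illustration:

# Usage sketch (assumption: the kartothek condition classes behave as used
# above; the data below is made up for illustration).
import numpy as np
import pandas as pd

from kartothek.core.cube.conditions import IsInCondition

# "x" holds integers that pandas upcast to float64 because of a missing value.
df = pd.DataFrame({"x": [1.0, 2.0, np.nan], "y": [10.0, 20.0, 30.0]})

# The integer-valued condition is rewritten to floats before filtering ...
cond = IsInCondition(column="x", value=(1, 2))
result = apply_condition_unsafe(df, cond)

# ... and, since the filtered frame no longer contains missing data, the
# remaining float64 columns are cast back to int64.
print(result.dtypes)  # expected: x int64, y int64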
def prepare_metapartitions_for_removal_action(cube, store, conditions,
                                              ktk_cube_dataset_ids,
                                              existing_datasets):
    """
    Prepare MetaPartition to express removal of given data range from cube.

    The MetaPartition must still be written using ``mp.store_dataframes(...)`` and added to the Dataset using a
    kartothek update method.

    Parameters
    ----------
    cube: kartothek.core.cube.cube.Cube
        Cube spec.
    store: Union[simplekv.KeyValueStore, Callable[[], simplekv.KeyValueStore]]
        Store.
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied, optional. Defaults to "entire cube".
    ktk_cube_dataset_ids: Optional[Union[Iterable[str], str]]
        Ktk_cube dataset IDs to apply the remove action to, optional. Defaults to "all".
    existing_datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Existing datasets.

    Returns
    -------
    metapartitions: Dict[str, Tuple[kartothek.core.dataset.DatasetMetadata,
            kartothek.io_components.metapartition.MetaPartition, List[Dict[str, Any]]]]
        MetaPartitions that should be written to and updated in the kartothek datasets as well as the ``delete_scope`` for
        kartothek.
    """
    conditions = Conjunction(conditions)
    conditions_split = conditions.split_by_column()
    if set(conditions_split.keys()) - set(cube.partition_columns):
        raise ValueError(
            "Can only remove partitions with conditions concerning the cube's physical partition columns."
        )

    ktk_cube_dataset_ids = converter_str_set_optional(ktk_cube_dataset_ids)
    if ktk_cube_dataset_ids is not None:
        unknown_dataset_ids = ktk_cube_dataset_ids - set(
            existing_datasets.keys())
        if unknown_dataset_ids:
            raise ValueError("Unknown ktk_cube_dataset_ids: {}".format(
                ", ".join(sorted(unknown_dataset_ids))))
    else:
        ktk_cube_dataset_ids = set(existing_datasets.keys())

    metapartitions = {}
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        ds = existing_datasets[ktk_cube_dataset_id]
        ds = ds.load_partition_indices()
        mp = _prepare_mp_empty(ds)

        if not ds.partition_keys:
            # no partition keys --> delete all
            delete_scope = [{}]
        else:

            df_partitions = get_partition_dataframe(dataset=ds, cube=cube)
            df_partitions = df_partitions.drop_duplicates()
            local_condition = reduce(
                lambda a, b: a & b,
                (cond for col, cond in conditions_split.items()
                 if col in df_partitions.columns),
                Conjunction([]),
            )
            df_partitions = local_condition.filter_df(df_partitions)

            delete_scope = df_partitions.to_dict(orient="records")

        metapartitions[ktk_cube_dataset_id] = (ds, mp, delete_scope)

    return metapartitions
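For every dataset that has partition keys, the function above ANDs together the per-column conditions that refer to that dataset's physical partition columns, filters the deduplicated partition DataFrame with the result, and turns each surviving row into one ``delete_scope`` entry. A standalone sketch of that reduction, assuming the kartothek condition classes behave as used above; the partition values and column names are made up for illustration:

# Sketch of the local_condition/delete_scope step (assumption: the kartothek
# condition classes behave as used above; the data below is made up).
from functools import reduce

import pandas as pd

from kartothek.core.cube.conditions import Conjunction, IsInCondition

# Partition values of a hypothetical dataset partitioned by "p" and "q".
df_partitions = pd.DataFrame(
    {"p": [1, 1, 2, 2], "q": [10, 20, 10, 20]}
).drop_duplicates()

# Per-column conjunctions, as returned by Conjunction.split_by_column().
conditions_split = {"p": Conjunction([IsInCondition(column="p", value=(2,))])}

# Same reduction as above: AND together the conditions whose columns are
# physical partition columns of this dataset.
local_condition = reduce(
    lambda a, b: a & b,
    (cond for col, cond in conditions_split.items() if col in df_partitions.columns),
    Conjunction([]),
)

delete_scope = local_condition.filter_df(df_partitions).to_dict(orient="records")
print(delete_scope)  # expected: [{'p': 2, 'q': 10}, {'p': 2, 'q': 20}]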
def _process_conditions(
    conditions, cube, datasets, all_available_columns, indexed_columns
):
    """
    Process and check given query conditions.

    Parameters
    ----------
    conditions: Union[None, Condition, Iterable[Condition], Conjunction]
        Conditions that should be applied.
    cube: Cube
        Cube specification.
    datasets: Dict[str, kartothek.core.dataset.DatasetMetadata]
        Datasets that are present.
    all_available_columns: Set[str]
        All columns that are available for query.
    indexed_columns: Dict[str, Set[str]]
        Indexed columns per ktk_cube dataset ID.

    Returns
    -------
    conditions_pre: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied based on the index data alone.
    conditions_post: Dict[str, kartothek.core.cube.conditions.Conjunction]
        Conditions to be applied during the load process.

    Raises
    ------
    TypeError: In case of a wrong type.
    """
    conditions = Conjunction(conditions)

    condition_columns = conditions.columns
    missing = condition_columns - all_available_columns
    if missing:
        raise ValueError(
            "Following condition columns are required but are missing from the cube: {missing}".format(
                missing=", ".join(sorted(missing))
            )
        )
    _test_condition_types(conditions, datasets)

    conditions_split = conditions.split_by_column()

    conditions_pre = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = indexed_columns[ktk_cube_dataset_id]
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_pre[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    conditions_post = {}
    for ktk_cube_dataset_id, ds in datasets.items():
        candidate_cols = (get_dataset_columns(ds) & condition_columns) - set(
            cube.partition_columns
        )
        if not candidate_cols:
            continue

        filtered = [
            conj for col, conj in conditions_split.items() if col in candidate_cols
        ]
        if not filtered:
            continue

        conditions_post[ktk_cube_dataset_id] = reduce(Conjunction.from_two, filtered)

    return conditions_pre, conditions_post
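Both loops above follow the same pattern: select the per-column conjunctions whose column falls into the candidate set (indexed columns for ``conditions_pre``, non-partition payload columns for ``conditions_post``) and fold them into a single Conjunction via ``Conjunction.from_two``. A simplified sketch of that split for one hypothetical dataset, assuming the condition classes behave as used above; the column names and candidate set are made up, and the two candidate sets are treated as disjoint purely for brevity:

# Sketch of the pre/post split for a single hypothetical dataset (assumption:
# the kartothek condition classes behave as used above; data is made up, and
# the candidate sets are treated as disjoint only to keep the sketch short).
from functools import reduce

from kartothek.core.cube.conditions import Conjunction, IsInCondition

# Per-column conjunctions, as produced by Conjunction.split_by_column().
conditions_split = {
    "country": Conjunction([IsInCondition(column="country", value=("DE", "US"))]),
    "year": Conjunction([IsInCondition(column="year", value=(2020,))]),
    "price": Conjunction([IsInCondition(column="price", value=(9,))]),
}

# Suppose "country" and "year" are covered by indices, while "price" is only
# available in the payload data of this dataset.
indexed_cols = {"country", "year"}

pre = [conj for col, conj in conditions_split.items() if col in indexed_cols]
post = [conj for col, conj in conditions_split.items() if col not in indexed_cols]

conditions_pre = reduce(Conjunction.from_two, pre)    # evaluated on index data alone
conditions_post = reduce(Conjunction.from_two, post)  # evaluated during the load

print(conditions_pre.columns)   # expected: {'country', 'year'}
print(conditions_post.columns)  # expected: {'price'}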