Exemple #1
0
    def ktk_dataset_uuid(self, ktk_cube_dataset_id):
        """
        Build the full Kartothek dataset UUID from a cube-local dataset ID.

        The result is ``self.uuid_prefix``, the cube UUID separator and the
        given dataset ID, joined in that order.

        Parameters
        ----------
        ktk_cube_dataset_id: str
            Dataset ID w/o prefix

        Returns
        -------
        ktk_dataset_uuid: str
            Prefixed dataset UUID for Kartothek.

        Raises
        ------
        ValueError
            If ``ktk_cube_dataset_id`` does not pass the UUID validation.

        Notes
        -----
        Non-string input is rejected by ``converter_str`` before validation
        runs; the exception type is whatever that converter raises
        (NOTE(review): presumably ``TypeError`` rather than ``ValueError`` --
        confirm against the converter).
        """
        # Normalize first so that the validator and the formatter both see
        # the converted value.
        dataset_id = converter_str(ktk_cube_dataset_id)
        _validator_uuid_freestanding("ktk_cube_dataset_id", dataset_id)
        return "{}{}{}".format(
            self.uuid_prefix,
            KTK_CUBE_UUID_SEPERATOR,
            dataset_id,
        )
Exemple #2
0
def get_dataset_columns(dataset):
    """
    Collect the usable column names of a Kartothek_Cube-compatible dataset.

    Columns whose name starts with ``"__"`` as well as the ``"KLEE_TS"``
    column are left out; every remaining name is passed through
    ``converter_str``.

    Parameters
    ----------
    dataset: kartothek.core.dataset.DatasetMetadata
        Dataset to get the columns from.

    Returns
    -------
    columns: Set[str]
        Usable columns.
    """
    usable = set()
    for name in get_dataset_schema(dataset).names:
        # Skip double-underscore-prefixed columns and the KLEE_TS column.
        if name.startswith("__") or name == "KLEE_TS":
            continue
        usable.add(converter_str(name))
    return usable
Exemple #3
0
def prepare_data_for_ktk(df,
                         ktk_cube_dataset_id,
                         cube,
                         existing_payload,
                         partition_on,
                         consume_df=False):
    """
    Turn a user-provided DataFrame into a MetaPartition for Kartothek.

    The input is validated first. A missing or empty DataFrame short-circuits
    to a label-less sentinel MetaPartition. Otherwise the data is copied,
    sorted by the partition and dimension columns it contains, checked for
    duplicates, has its column names converted, and is finally partitioned
    and indexed.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.
        If set, all rows and columns of the passed-in object are dropped
        in place once a working copy has been taken.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload,
                  partition_on)

    # Nothing to store: hand back a sentinel that only carries the
    # partition keys.
    if df is None or df.empty:
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    caller_df = df
    df = caller_df.copy()
    if consume_df:
        # The caller still holds a reference to its DataFrame, so empty that
        # object in place instead of merely dropping our local name.
        caller_df.drop(
            columns=caller_df.columns,
            index=caller_df.index,
            inplace=True,
        )
    original_columns = list(df.columns)
    present_columns = set(original_columns)

    # Sort keys: partition columns first, then dimension columns, restricted
    # to the ones this DataFrame actually contains.
    sort_keys = []
    for candidate in itertools.chain(cube.partition_columns,
                                     cube.dimension_columns):
        if candidate in present_columns:
            sort_keys.append(candidate)

    # Normalize value order (and reset the index).
    df = sort_dataframe(df=df, columns=sort_keys)

    # Reject duplicate cells.
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # Check + convert the column names to unicode strings.
    renamed_columns = {
        name: converter_str(name)
        for name in original_columns
    }
    df.rename(columns=renamed_columns, inplace=True)

    # Wrap the data in a MetaPartition for easier handling; the local
    # DataFrame name is not needed afterwards.
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # Split the data along the partition columns.
    mp = mp.partition_on(list(partition_on))

    # Partitioning breaks the row index, so reset it for every partition.
    for part in mp:
        part.data.reset_index(drop=True, inplace=True)
        del part

    # Determine the columns to index: explicit index columns present in the
    # data, plus (for the seed dataset only) all dimension columns that are
    # not suppressed, minus the partition columns.
    index_columns = set(cube.index_columns).intersection(present_columns)
    if ktk_cube_dataset_id == cube.seed_dataset:
        index_columns.update(
            set(cube.dimension_columns) - set(cube.suppress_index_on))
    index_columns.difference_update(set(partition_on))

    return mp.build_indices(index_columns)
Exemple #4
0
def test_str_rejects_none():
    """``converter_str(None)`` must raise ``TypeError`` with a fixed message."""
    expected_message = "Object of type NoneType is not a string: None"
    with pytest.raises(TypeError) as excinfo:
        converter_str(None)
    assert str(excinfo.value) == expected_message
Exemple #5
0
def test_str_fail(obj, msg):
    """``converter_str(obj)`` must raise ``TypeError`` whose text equals ``msg``."""
    with pytest.raises(TypeError) as excinfo:
        converter_str(obj)
    raised_message = str(excinfo.value)
    assert raised_message == msg
Exemple #6
0
def test_str_ok(obj, expected):
    """``converter_str(obj)`` must return a value equal to ``expected``."""
    assert converter_str(obj) == expected