Example #1
def quick_concat(dfs, dimension_columns, partition_columns):
    """
    Fast version of::

        pd.concat(
            dfs,
            ignore_index=True,
            sort=False,
        ).sort_values(dimension_columns + partition_columns).reset_index(drop=True)

    if inputs are presorted.

    Parameters
    ----------
    dfs: Iterable[pandas.DataFrame]
        DataFrames to concatenate.
    dimension_columns: Iterable[str]
        Dimension columns in correct order.
    partition_columns: Iterable[str]
        Partition columns in correct order.

    Returns
    -------
    df: pandas.DataFrame
        Concatenated result.
    """
    return sort_dataframe(
        df=concat_dataframes(dfs),
        columns=list(dimension_columns) + list(partition_columns),
    )
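
The docstring above already states the exact pandas baseline; the following minimal, self-contained sketch spells out that equivalence using only pandas (the frames and column names are illustrative, not taken from the original code):

import pandas as pd

# Two frames that are already sorted by dimension column "x" and partition column "p".
df1 = pd.DataFrame({"x": [1, 2], "p": [0, 0], "v": ["a", "b"]})
df2 = pd.DataFrame({"x": [1, 3], "p": [1, 1], "v": ["c", "d"]})

# Baseline from the docstring: concatenate, sort by dimension + partition columns,
# then reset the index.
expected = (
    pd.concat([df1, df2], ignore_index=True, sort=False)
    .sort_values(["x", "p"])
    .reset_index(drop=True)
)

# quick_concat([df1, df2], dimension_columns=["x"], partition_columns=["p"])
# is expected to return a frame equal to `expected`, just faster for presorted inputs.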
Example #2
def _load_partition_dfs(cube, group, partition_mps, store):
    """
    Load partition DataFrames for seed, restrictive, and other data.

    The information about the merge strategy (seed, restrictive, others) is taken from ``group``.

    Parameters
    ----------
    cube: Cube
        Cube spec.
    group: QueryGroup
        Query group.
    partition_mps: Dict[str, Iterable[MetaPartition]]
        MetaPartitions for every dataset in this partition.
    store: simplekv.KeyValueStore
        Store to load data from.

    Returns
    -------
    df_seed: pandas.DataFrame
        Seed data.
    dfs_restrict: List[pandas.DataFrame]
        Restrictive data (for inner join).
    dfs_other: List[pandas.DataFrame]
        Other data (for left join).
    """
    df_seed = None
    dfs_restrict = []
    dfs_other = []

    for ktk_cube_dataset_id, empty in group.empty_df.items():
        mps = partition_mps.get(ktk_cube_dataset_id, [])
        df = _load_all_mps(
            mps=mps,
            store=store,
            load_columns=list(group.load_columns[ktk_cube_dataset_id]),
            predicates=group.predicates.get(ktk_cube_dataset_id, None),
            empty=empty,
        )

        # de-duplicate and sort data
        # PERF: keep order of dimensionality identical to group.dimension_columns
        df_cols = set(df.columns)
        dimensionality = [c for c in group.dimension_columns if c in df_cols]
        df = sort_dataframe(df=df, columns=dimensionality)

        df = drop_sorted_duplicates_keep_last(df, dimensionality)

        if ktk_cube_dataset_id == cube.seed_dataset:
            assert df_seed is None
            df_seed = df
        elif ktk_cube_dataset_id in group.restrictive_dataset_ids:
            dfs_restrict.append(df)
        else:
            dfs_other.append(df)

    assert df_seed is not None
    return df_seed, dfs_restrict, dfs_other
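
The returned triple is meant to be combined downstream: per the docstring, restrictive DataFrames participate via inner joins and the remaining ones via left joins. A minimal sketch of that combination pattern in plain pandas (the join key and frames are illustrative, not taken from the original code):

import pandas as pd

# Illustrative data: the seed frame defines which cells exist.
df_seed = pd.DataFrame({"x": [1, 2, 3], "seed_val": ["a", "b", "c"]})
dfs_restrict = [pd.DataFrame({"x": [1, 2], "r_val": [10, 20]})]  # inner join: restricts rows
dfs_other = [pd.DataFrame({"x": [2, 3], "o_val": [0.2, 0.3]})]   # left join: never drops rows

df = df_seed
for df_r in dfs_restrict:
    df = df.merge(df_r, on="x", how="inner")
for df_o in dfs_other:
    df = df.merge(df_o, on="x", how="left")

# The result keeps only cells present in the restrictive data, but retains all of
# those even when the "other" data has no match (NaN-filled).
print(df)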
Example #3
def prepare_data_for_ktk(df,
                         ktk_cube_dataset_id,
                         cube,
                         existing_payload,
                         partition_on,
                         consume_df=False):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition; may be a sentinel (i.e. empty and without a label).

    Raises
    ------
    ValueError
        If the data fails any of the sanity checks.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload,
                  partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns,
                     index=df_orig.index,
                     inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col for col in itertools.chain(cube.partition_columns,
                                       cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(
            cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp
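
The normalization step above builds the sort keys from the partition columns followed by the dimension columns, keeping only the columns actually present in the frame, and then checks for duplicate cells. A minimal sketch of that step with plain pandas (column names are illustrative, not taken from the original code):

import itertools

import pandas as pd

partition_columns = ["p"]
dimension_columns = ["x", "y"]

df = pd.DataFrame({"x": [2, 1, 1], "p": [0, 1, 0], "v": [1.0, 2.0, 3.0]})  # no "y" column

# Same key construction as above: partition columns first, then dimension
# columns, restricted to columns that exist in the frame.
df_columns_set = set(df.columns)
sort_keys = [
    col
    for col in itertools.chain(partition_columns, dimension_columns)
    if col in df_columns_set
]  # -> ["p", "x"]

df = df.sort_values(sort_keys).reset_index(drop=True)

# Roughly what the duplicate-cell check rejects (assumption based on the
# "check duplicate cells" comment above): two rows sharing the same sort-key values.
assert not df.duplicated(subset=sort_keys).any()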
Example #4
def test_sort_dataframe(df, columns):
    expected = df.sort_values(columns).reset_index(drop=True)
    actual = sort_dataframe(df, columns)
    pdt.assert_frame_equal(actual, expected)
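
The same assertion spelled out with a concrete frame instead of the ``df``/``columns`` fixtures (the import path of ``sort_dataframe`` is an assumption about the surrounding project layout):

import pandas as pd
import pandas.testing as pdt

# Import path is an assumption, not confirmed by the excerpt above.
from kartothek.utils.pandas import sort_dataframe

df = pd.DataFrame({"x": [3, 1, 2], "p": [0, 1, 0]})
columns = ["p", "x"]

# sort_dataframe must match the plain pandas sort-and-reset-index baseline.
expected = df.sort_values(columns).reset_index(drop=True)
actual = sort_dataframe(df, columns)
pdt.assert_frame_equal(actual, expected)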