def ktk_dataset_uuid(self, ktk_cube_dataset_id):
    """
    Get Kartothek dataset UUID for given dataset ID, so the prefix is included.

    Parameters
    ----------
    ktk_cube_dataset_id: str
        Dataset ID w/o prefix.

    Returns
    -------
    ktk_dataset_uuid: str
        Prefixed dataset UUID for Kartothek.

    Raises
    ------
    TypeError
        If ``ktk_cube_dataset_id`` is not a string.
    ValueError
        If ``ktk_cube_dataset_id`` is not a valid UUID.
    """
    ktk_cube_dataset_id = converter_str(ktk_cube_dataset_id)
    _validator_uuid_freestanding("ktk_cube_dataset_id", ktk_cube_dataset_id)
    return "{uuid_prefix}{sep}{ktk_cube_dataset_id}".format(
        uuid_prefix=self.uuid_prefix,
        sep=KTK_CUBE_UUID_SEPERATOR,
        ktk_cube_dataset_id=ktk_cube_dataset_id,
    )
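# Illustrative usage sketch, not part of the library: the Cube construction
# below follows the signature shown in kartothek's documentation, and
# "my_cube"/"enrich" are made-up names. With kartothek's "++" separator the
# composed UUID would be "my_cube++enrich".
def _example_ktk_dataset_uuid():
    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="my_cube",
    )
    # "my_cube" + KTK_CUBE_UUID_SEPERATOR + "enrich" -> "my_cube++enrich";
    # a dataset ID that is not a valid UUID string raises instead
    return cube.ktk_dataset_uuid("enrich")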
def get_dataset_columns(dataset):
    """
    Get columns present in a Kartothek_Cube-compatible Kartothek dataset.

    Parameters
    ----------
    dataset: kartothek.core.dataset.DatasetMetadata
        Dataset to get the columns from.

    Returns
    -------
    columns: Set[str]
        Usable columns.
    """
    # ignore internal ("__"-prefixed) columns and the legacy KLEE_TS
    # timestamp column
    return {
        converter_str(col)
        for col in get_dataset_schema(dataset).names
        if not col.startswith("__") and col != "KLEE_TS"
    }
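# Minimal sketch of the filter above (the schema names are made up): internal
# "__"-prefixed columns and KLEE_TS are dropped, everything else is kept.
def _example_get_dataset_columns_filter():
    names = ["x", "y", "value", "__seed_base", "KLEE_TS"]
    # -> {"x", "y", "value"}
    return {col for col in names if not col.startswith("__") and col != "KLEE_TS"}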
def prepare_data_for_ktk(
    df, ktk_cube_dataset_id, cube, existing_payload, partition_on, consume_df=False
):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload, partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns, index=df_orig.index, inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col
        for col in itertools.chain(cube.partition_columns, cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp
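# Hedged end-to-end sketch: the cube and DataFrame below are made-up examples
# (the "geodata" naming follows kartothek's documentation style) and assume
# kartothek's documented Cube signature and its default seed dataset ID. Shows
# the seed data being partitioned on "country" while the input is consumed.
def _example_prepare_data_for_ktk():
    import pandas as pd

    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["city", "day"],
        partition_columns=["country"],
        uuid_prefix="geodata",
    )
    df = pd.DataFrame(
        {
            "country": ["DE", "DE"],
            "city": ["Berlin", "Hamburg"],
            "day": [1, 1],
            "temperature": [10.0, 12.0],
        }
    )
    mp = prepare_data_for_ktk(
        df=df,
        ktk_cube_dataset_id=cube.seed_dataset,
        cube=cube,
        existing_payload=set(),
        partition_on=cube.partition_columns,
        consume_df=True,  # df is emptied in-place afterwards
    )
    # mp carries one partition per "country" value plus indices for the
    # dimension columns; an empty df would instead yield a label-less sentinel
    return mp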
def test_str_rejects_none():
    with pytest.raises(TypeError) as exc:
        converter_str(None)
    assert str(exc.value) == "Object of type NoneType is not a string: None"
def test_str_fail(obj, msg):
    # `obj` and `msg` are supplied via @pytest.mark.parametrize
    # (case list omitted here)
    with pytest.raises(TypeError) as exc:
        converter_str(obj)
    assert str(exc.value) == msg
def test_str_ok(obj, expected):
    # `obj` and `expected` are supplied via @pytest.mark.parametrize
    # (case list omitted here)
    actual = converter_str(obj)
    assert actual == expected
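# Hypothetical parametrization for the generic tests above; the real case
# lists live in the test module's decorators. A plain str passing through
# unchanged is the safest assumption about converter_str's behavior.
@pytest.mark.parametrize(
    "obj,expected",
    [("foo", "foo")],
)
def test_str_ok_example(obj, expected):
    assert converter_str(obj) == expected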