Example #1
    def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
        """
        Store the index as a parquet file

        If compatible, the new key name will be the name stored under the attribute `index_storage_key`.
        If this attribute is None, a new key will be generated in the format

            `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

        where the timestamp has nanosecond accuracy and is created upon Index object initialization.

        Parameters
        ----------
        store:
            Object that implements `.put(key, data)` to write data.
        dataset_uuid:
            Unique ID of the dataset in question.
        """
        storage_key = None

        if (self.index_storage_key is not None and dataset_uuid
                and dataset_uuid in self.index_storage_key):
            storage_key = self.index_storage_key
        if storage_key is None:
            storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
                dataset_uuid=dataset_uuid,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
                column=quote(self.column),
                timestamp=quote(self.creation_time.isoformat()),
            )

        table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf)

        store.put(storage_key, buf.getvalue().to_pybytes())
        return storage_key
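
A minimal usage sketch (not part of the library code): it assumes simplekv's in-memory DictStore and the ExplicitSecondaryIndex constructor from kartothek.core.index; the dataset UUID and index contents are made up.

from simplekv.memory import DictStore
from kartothek.core.index import ExplicitSecondaryIndex

store = DictStore()
index = ExplicitSecondaryIndex(
    column="country",
    index_dct={"DE": ["partition_0"], "US": ["partition_1"]},  # value -> partition labels
)
key = index.store(store, dataset_uuid="my_dataset")
# e.g. "my_dataset/indices/country/<timestamp>.by-dataset-index.parquet"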
Example #2
def _copy_naive(
    key_mappings: Dict[str, str],
    src_store: KeyValueStore,
    tgt_store: KeyValueStore,
    md_transformed: Optional[Dict[str, DatasetMetadata]] = None,
):
    """
    Copies a list of items from one KV store to another.
    Parameters
    ----------
    key_mappings: Dict[str, str]
        Mapping of source key names to target key names. May be equal if a key will
        not be renamed.
    src_store: simplekv.KeyValueStore
        Source KV store–
    tgt_store: simplekv.KeyValueStore
        Target KV store
    md_transformed: Dict[str, DatasetMetadata]
        Mapping containing {target dataset uuid: modified target metadata} values which will be written
        directly instead of being copied
    """
    for src_key, tgt_key in key_mappings.items():
        if (md_transformed is not None) and (tgt_key in md_transformed):
            item = md_transformed.get(tgt_key).to_json()  # type: ignore
        else:
            item = src_store.get(src_key)
        tgt_store.put(tgt_key, item)
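
A hypothetical call of this helper with two in-memory simplekv stores; the keys below are illustrative only.

from simplekv.memory import DictStore

src = DictStore()
tgt = DictStore()
src.put("uuid/table/part_0.parquet", b"<parquet bytes>")
src.put("uuid.by-dataset-metadata.json", b"{}")

_copy_naive(
    key_mappings={
        "uuid/table/part_0.parquet": "uuid/table/part_0.parquet",              # copied as-is
        "uuid.by-dataset-metadata.json": "new_uuid.by-dataset-metadata.json",  # renamed
    },
    src_store=src,
    tgt_store=tgt,
)
assert tgt.get("new_uuid.by-dataset-metadata.json") == b"{}"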
Example #3
    def load(self, store: KeyValueStore):
        """
        Load an external index into memory. Returns a new index object that
        contains the index dictionary. Returns itself if the index is internal
        or an already loaded index.

        Parameters
        ----------
        store: Object
            Object that implements the .get method for file/object loading.

        Returns
        -------
        index: kartothek.core.index.ExplicitSecondaryIndex
        """
        if self.loaded:
            return self

        index_buffer = store.get(self.index_storage_key)
        index_dct, column_type = _parquet_bytes_to_dict(
            self.column, index_buffer)

        return ExplicitSecondaryIndex(
            column=self.column,
            index_dct=index_dct,
            dtype=column_type,
            index_storage_key=self.index_storage_key,
            normalize_dtype=False,
        )
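
A sketch of the load round trip, assuming an index previously persisted with .store() as in Example #1 and that an "unloaded" index can be constructed from just its storage key (an assumption based on the code above):

from simplekv.memory import DictStore
from kartothek.core.index import ExplicitSecondaryIndex

store = DictStore()
storage_key = ExplicitSecondaryIndex(
    column="country", index_dct={"DE": ["partition_0"]}
).store(store, dataset_uuid="my_dataset")

# An unloaded index only knows where its parquet file lives.
unloaded = ExplicitSecondaryIndex(column="country", index_storage_key=storage_key)
loaded = unloaded.load(store)        # reads the parquet file and parses index_dct
assert loaded.loaded
assert loaded.load(store) is loaded  # already loaded -> returns itself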
Example #4
def store_schema_metadata(
    schema: SchemaWrapper,
    dataset_uuid: str,
    store: KeyValueStore,
    table: str = SINGLE_TABLE,
) -> str:
    """
    Store schema and metadata to store.

    Parameters
    ----------
    schema
        Schema information for DataFrame/table.
    dataset_uuid
        Unique ID of the dataset in question.
    store
        Object that implements `.put(key, data)` to write data.
    table
        Table to write metadata for.

    Returns
    -------
    key: str
        Key to which the metadata was written.
    """
    key = _get_common_metadata_key(dataset_uuid=dataset_uuid, table=table)
    return store.put(key, _schema2bytes(schema.internal()))
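
A sketch of storing the common metadata for a table; make_meta (assumed here to live in kartothek.core.common_metadata and to build a SchemaWrapper from a pandas DataFrame) and all names below are illustrative assumptions.

import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.common_metadata import make_meta  # assumed helper

store = DictStore()
schema = make_meta(pd.DataFrame({"a": [1], "b": ["x"]}), origin="example")
key = store_schema_metadata(
    schema=schema,
    dataset_uuid="my_dataset",
    store=store,
    table="table",
)
# `key` is the common-metadata key produced by _get_common_metadata_key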
Example #5
    def load_from_store(
        uuid: str,
        store: KeyValueStore,
        load_schema: bool = True,
        load_all_indices: bool = False,
    ) -> "DatasetMetadata":
        """
        Load a dataset from storage.

        Parameters
        ----------
        uuid: str or unicode
            UUID of the dataset.
        store: Object
            Object that implements the .get method for file/object loading.
        load_schema: bool
            Load table schema
        load_all_indices: bool
            Load all registered indices into memory.

        Returns
        -------
        dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
            Parsed metadata.
        """
        key1 = naming.metadata_key_from_uuid(uuid)
        try:
            value = store.get(key1)
            metadata = load_json(value)
        except KeyError:
            key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
            try:
                value = store.get(key2)
                metadata = msgpack.unpackb(value)
            except KeyError:
                raise KeyError(
                    "Dataset does not exist. Tried {} and {}".format(
                        key1, key2))

        ds = DatasetMetadata.load_from_dict(metadata,
                                            store,
                                            load_schema=load_schema)
        if load_all_indices:
            ds = ds.load_all_indices(store)
        return ds
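
A usage sketch, assuming a dataset with this UUID has already been written to the given location; the store URL and UUID are illustrative, and storefact is used only as one common way to construct a simplekv store.

from storefact import get_store_from_url
from kartothek.core.dataset import DatasetMetadata

store = get_store_from_url("hfs:///tmp/kartothek_data")  # local filesystem store
ds = DatasetMetadata.load_from_store(
    uuid="my_dataset",
    store=store,
    load_schema=True,
    load_all_indices=False,
)
print(ds.uuid, sorted(ds.partitions))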
Example #6
    def restore_dataframe(
        store: KeyValueStore,
        key: str,
        filter_query: Optional[str] = None,
        columns: Optional[Iterable[str]] = None,
        predicate_pushdown_to_io: Any = None,
        categories: Optional[Iterable[str]] = None,
        predicates: Optional[PredicatesType] = None,
        date_as_object: Any = None,
        **kwargs,
    ):
        check_predicates(predicates)
        compression: Optional[str]
        if key.endswith(".csv.gz"):
            compression = "gzip"
        elif key.endswith(".csv"):
            compression = None

        if (not columns) and (columns is not None):
            # pd.read_csv does not seem to support proper reads w/o columns (it returns a DF w/o any row)
            columns = None
            project_to_no_cols = True
        else:
            project_to_no_cols = False

        dtype: Optional[Dict[str, str]]
        if categories:
            dtype = {cat: "category" for cat in categories}
        else:
            dtype = None

        try:
            df = pd.read_csv(
                BytesIO(store.get(key)),
                compression=compression,
                sep=";",
                encoding="utf-8",
                usecols=columns,
                dtype=dtype,
            )
            if project_to_no_cols:
                df = df[[]]
            if len(df) == 0:
                # in that case, Pandas decided to use a weird index type, let's fix that
                df.index = pd.RangeIndex(start=0, stop=0, step=1)
        except EmptyDataError:
            df = pd.DataFrame()

        if predicates:
            return filter_df_from_predicates(df, predicates)
        else:
            return filter_df(df, filter_query)
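
A hedged usage sketch: it assumes the method above is exposed as a staticmethod on kartothek.serialization.CsvSerializer (both the class name and the binding are assumptions) and uses the ';'-separated, UTF-8 encoded layout the reader above expects.

from simplekv.memory import DictStore
from kartothek.serialization import CsvSerializer  # assumed home of the method above

store = DictStore()
store.put("my_dataset/table/part_0.csv", b"a;b\n1;x\n2;y\n")

df = CsvSerializer.restore_dataframe(
    store=store,
    key="my_dataset/table/part_0.csv",
    columns=["a"],
    predicates=[[("a", ">", 1)]],  # kartothek predicate format: OR of ANDs of (col, op, value)
)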
Example #7
    def storage_keys(uuid: str, store: KeyValueStore) -> List[str]:
        """
        Retrieve all keys that belong to the given dataset.

        Parameters
        ----------
        uuid:
            UUID of the dataset.
        store:
            Object that implements the .iter_keys method for key retrieval.

        Returns
        -------
        keys:
            Sorted list of storage keys.
        """
        start_markers = ["{}.".format(uuid), "{}/".format(uuid)]
        return list(
            sorted(k for k in store.iter_keys(uuid) if any(
                k.startswith(marker) for marker in start_markers)))
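
A sketch of listing a dataset's keys, assuming the method above is the staticmethod DatasetMetadata.storage_keys; the stored keys below are made up.

from simplekv.memory import DictStore
from kartothek.core.dataset import DatasetMetadata

store = DictStore()
store.put("my_dataset.by-dataset-metadata.json", b"{}")
store.put("my_dataset/table/part_0.parquet", b"<parquet bytes>")
store.put("other_dataset/table/part_0.parquet", b"<parquet bytes>")  # different dataset, not listed

DatasetMetadata.storage_keys("my_dataset", store)
# ['my_dataset.by-dataset-metadata.json', 'my_dataset/table/part_0.parquet']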
Example #8
def read_schema_metadata(dataset_uuid: str, store: KeyValueStore,
                         table: str) -> SchemaWrapper:
    """
    Read schema and metadata from store.

    Parameters
    ----------
    dataset_uuid
        Unique ID of the dataset in question.
    store
        Object that implements `.get(key)` to read data.
    table
        Table to read metadata for.

    Returns
    -------
    schema: Schema
        Schema information for DataFrame/table.
    """
    key = _get_common_metadata_key(dataset_uuid=dataset_uuid, table=table)
    return SchemaWrapper(_bytes2schema(store.get(key)), key)
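
Reading back what Example #4 stored, under the same make_meta assumption as there.

import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.common_metadata import make_meta  # assumed helper

store = DictStore()
schema = make_meta(pd.DataFrame({"a": [1]}), origin="example")
store_schema_metadata(schema, "my_dataset", store, table="table")

restored = read_schema_metadata("my_dataset", store, table="table")
# `restored` wraps the deserialized pyarrow schema plus the key it was read from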
Example #9
    def _restore_dataframe(
        store: KeyValueStore,
        key: str,
        filter_query: Optional[str] = None,
        columns: Optional[Iterable[str]] = None,
        predicate_pushdown_to_io: bool = True,
        categories: Optional[Iterable[str]] = None,
        predicates: Optional[PredicatesType] = None,
        date_as_object: bool = False,
    ) -> pd.DataFrame:
        check_predicates(predicates)
        # If we want to do columnar access we can benefit from partial reads
        # otherwise a full read en bloc is the better option.
        if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
            with pa.BufferReader(store.get(key)) as reader:
                table = pq.read_pandas(reader, columns=columns)
        else:
            if HAVE_BOTO and isinstance(store, BotoStore):
                # Parquet and seeks on S3 currently leak connections thus
                # we omit column projection to the store.
                reader = pa.BufferReader(store.get(key))
            else:
                reader = store.open(key)
                # Buffer at least 4 MB in requests. This is chosen because the default block size of the Azure
                # storage client is 4MB.
                reader = BlockBuffer(reader, 4 * 1024 * 1024)
            try:
                parquet_file = ParquetFile(reader)
                if predicates and parquet_file.metadata.num_rows > 0:
                    # We need to calculate different predicates for predicate
                    # pushdown and the later DataFrame filtering. This is required
                    # e.g. in the case where we have an `in` predicate as this has
                    # different normalized values.
                    columns_to_io = _columns_for_pushdown(columns, predicates)
                    predicates_for_pushdown = _normalize_predicates(
                        parquet_file, predicates, True
                    )
                    predicates = _normalize_predicates(parquet_file, predicates, False)
                    tables = _read_row_groups_into_tables(
                        parquet_file, columns_to_io, predicates_for_pushdown
                    )

                    if len(tables) == 0:
                        table = _empty_table_from_schema(parquet_file)
                    else:
                        table = pa.concat_tables(tables)
                else:
                    # ARROW-5139 Column projection with empty columns returns a table w/out index
                    if columns == []:
                        # Create an arrow table with expected index length.
                        df = (
                            parquet_file.schema.to_arrow_schema()
                            .empty_table()
                            .to_pandas(date_as_object=date_as_object)
                        )
                        index = pd.Int64Index(
                            pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows)
                        )
                        df = pd.DataFrame(df, index=index)
                        # convert back to table to keep downstream code untouched by this patch
                        table = pa.Table.from_pandas(df)
                    else:
                        table = pq.read_pandas(reader, columns=columns)
            finally:
                reader.close()

        if columns is not None:
            missing_columns = set(columns) - set(table.schema.names)
            if missing_columns:
                raise ValueError(
                    "Columns cannot be found in stored dataframe: {missing}".format(
                        missing=", ".join(sorted(missing_columns))
                    )
                )

        table = _reset_dictionary_columns(table, exclude=categories)
        df = table.to_pandas(categories=categories, date_as_object=date_as_object)
        df.columns = df.columns.map(ensure_unicode_string_type)
        if predicates:
            df = filter_df_from_predicates(
                df, predicates, strict_date_types=date_as_object
            )
        else:
            df = filter_df(df, filter_query)
        if columns is not None:
            return df.reindex(columns=columns, copy=False)
        else:
            return df
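
A sketch of the read path with predicate pushdown. The parquet bytes are written with the same pyarrow pattern used in Example #1; placing this staticmethod on kartothek.serialization.ParquetSerializer is an assumption.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from simplekv.memory import DictStore
from kartothek.serialization import ParquetSerializer  # assumed home of the method above

store = DictStore()
buf = pa.BufferOutputStream()
pq.write_table(pa.Table.from_pandas(pd.DataFrame({"a": [1, 2, 3]})), buf)
store.put("my_dataset/table/part_0.parquet", buf.getvalue().to_pybytes())

df = ParquetSerializer._restore_dataframe(
    store=store,
    key="my_dataset/table/part_0.parquet",
    columns=["a"],
    predicates=[[("a", "in", [1, 3])]],  # row groups are skipped where the predicate cannot match
)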