def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If the dataset UUID is part of the existing attribute `index_storage_key`,
    that key is reused. Otherwise a new key is generated in the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp is in nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store:
        Object that implements `.put(key, data)` to write data.
    dataset_uuid:
        Unique ID of the dataset in question.

    Returns
    -------
    storage_key: str
        Key under which the index was stored.
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
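# Usage sketch (not part of the library): storing an ExplicitSecondaryIndex into an
# in-memory simplekv store. The import path follows the `load` docstring below; the
# example column, values and dataset UUID are made up, and index_dct is assumed to map
# index values to the partition labels that contain them.
from simplekv.memory import DictStore
from kartothek.core.index import ExplicitSecondaryIndex

store = DictStore()
index = ExplicitSecondaryIndex(
    column="country",
    index_dct={"DE": ["part_1"], "US": ["part_2", "part_3"]},
)

# index_storage_key is None here, so a fresh key of the form
# "my_dataset_uuid/indices/country/<quoted timestamp>.by-dataset-index.parquet"
# is generated and returned.
key = index.store(store, dataset_uuid="my_dataset_uuid")
print(key)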
def _copy_naive(
    key_mappings: Dict[str, str],
    src_store: KeyValueStore,
    tgt_store: KeyValueStore,
    md_transformed: Optional[Dict[str, DatasetMetadata]] = None,
):
    """
    Copy a list of items from one KV store to another.

    Parameters
    ----------
    key_mappings: Dict[str, str]
        Mapping of source key names to target key names. Source and target may
        be identical if a key is not renamed.
    src_store: simplekv.KeyValueStore
        Source KV store.
    tgt_store: simplekv.KeyValueStore
        Target KV store.
    md_transformed: Dict[str, DatasetMetadata]
        Mapping containing {target dataset uuid: modified target metadata}
        values, which are written directly instead of being copied.
    """
    for src_key, tgt_key in key_mappings.items():
        if (md_transformed is not None) and (tgt_key in md_transformed):
            item = md_transformed.get(tgt_key).to_json()  # type: ignore
        else:
            item = src_store.get(src_key)
        tgt_store.put(tgt_key, item)
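# Usage sketch (illustrative keys and payloads; assumes the helper above is in scope):
# copy one key verbatim and rename another while copying between two in-memory stores.
from simplekv.memory import DictStore

src_store = DictStore()
tgt_store = DictStore()
src_store.put("uuid/table/part_1.parquet", b"payload-1")
src_store.put("uuid.by-dataset-metadata.json", b"{}")

_copy_naive(
    key_mappings={
        # unchanged key: source and target names are identical
        "uuid/table/part_1.parquet": "uuid/table/part_1.parquet",
        # renamed key: the payload ends up under a new dataset uuid
        "uuid.by-dataset-metadata.json": "new_uuid.by-dataset-metadata.json",
    },
    src_store=src_store,
    tgt_store=tgt_store,
)

assert tgt_store.get("new_uuid.by-dataset-metadata.json") == b"{}"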
def load(self, store: KeyValueStore):
    """
    Load an external index into memory.

    Returns a new index object that contains the index dictionary. Returns
    itself if the index is internal or already loaded.

    Parameters
    ----------
    store:
        Object that implements the `.get` method for file/object loading.

    Returns
    -------
    index: kartothek.core.index.ExplicitSecondaryIndex
    """
    if self.loaded:
        return self

    index_buffer = store.get(self.index_storage_key)
    index_dct, column_type = _parquet_bytes_to_dict(self.column, index_buffer)

    return ExplicitSecondaryIndex(
        column=self.column,
        index_dct=index_dct,
        dtype=column_type,
        index_storage_key=self.index_storage_key,
        normalize_dtype=False,
    )
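# Round-trip sketch complementing the `store` example above (illustrative data; it is
# assumed here that an index constructed with only `index_storage_key` reports
# loaded == False until `load` pulls the parquet payload back into memory).
from simplekv.memory import DictStore
from kartothek.core.index import ExplicitSecondaryIndex

store = DictStore()
original = ExplicitSecondaryIndex(
    column="country", index_dct={"DE": ["part_1"], "US": ["part_2"]}
)
key = original.store(store, dataset_uuid="my_dataset_uuid")

# Referenced only by its storage key, the index is not loaded yet ...
lazy = ExplicitSecondaryIndex(column="country", index_storage_key=key)
assert not lazy.loaded

# ... load() returns a new, fully populated index object.
loaded = lazy.load(store)
assert loaded.loaded
assert set(loaded.index_dct) == set(original.index_dct)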
def store_schema_metadata(
    schema: SchemaWrapper,
    dataset_uuid: str,
    store: KeyValueStore,
    table: str = SINGLE_TABLE,
) -> str:
    """
    Store schema and metadata to store.

    Parameters
    ----------
    schema
        Schema information for DataFrame/table.
    dataset_uuid
        Unique ID of the dataset in question.
    store
        Object that implements `.put(key, data)` to write data.
    table
        Table to write metadata for.

    Returns
    -------
    key: str
        Key to which the metadata was written.
    """
    key = _get_common_metadata_key(dataset_uuid=dataset_uuid, table=table)
    return store.put(key, _schema2bytes(schema.internal()))
def load_from_store(
    uuid: str,
    store: KeyValueStore,
    load_schema: bool = True,
    load_all_indices: bool = False,
) -> "DatasetMetadata":
    """
    Load a dataset from a store.

    Parameters
    ----------
    uuid: str or unicode
        UUID of the dataset.
    store: Object
        Object that implements the .get method for file/object loading.
    load_schema: bool
        Load table schema.
    load_all_indices: bool
        Load all registered indices into memory.

    Returns
    -------
    dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
        Parsed metadata.
    """
    key1 = naming.metadata_key_from_uuid(uuid)
    try:
        value = store.get(key1)
        metadata = load_json(value)
    except KeyError:
        key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
        try:
            value = store.get(key2)
            metadata = msgpack.unpackb(value)
        except KeyError:
            raise KeyError(
                "Dataset does not exist. Tried {} and {}".format(key1, key2)
            )

    ds = DatasetMetadata.load_from_dict(metadata, store, load_schema=load_schema)
    if load_all_indices:
        ds = ds.load_all_indices(store)
    return ds
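# Usage sketch (import path taken from the docstring above; the UUID is made up).
# Against a store that already holds a dataset, load_from_store returns the parsed
# metadata; against an empty store it demonstrates the JSON/msgpack key fallback.
from simplekv.memory import DictStore
from kartothek.core.dataset import DatasetMetadata

store = DictStore()  # in a real setup, a dataset written by kartothek lives here

# With an existing dataset:
#   dm = DatasetMetadata.load_from_store(uuid="my_uuid", store=store)
#   dm = DatasetMetadata.load_from_store(uuid="my_uuid", store=store, load_all_indices=True)

# Against an empty store, both metadata key variants are tried before a KeyError
# is raised, mirroring the nested try/except above:
try:
    DatasetMetadata.load_from_store(uuid="my_uuid", store=store)
except KeyError as exc:
    print(exc)  # message lists both key names that were tried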
def restore_dataframe(
    store: KeyValueStore,
    key: str,
    filter_query: Optional[str] = None,
    columns: Optional[Iterable[str]] = None,
    predicate_pushdown_to_io: Any = None,
    categories: Optional[Iterable[str]] = None,
    predicates: Optional[PredicatesType] = None,
    date_as_object: Any = None,
    **kwargs,
):
    """
    Restore a DataFrame from a semicolon-separated CSV file (optionally
    gzip-compressed) in the store, applying optional column projection,
    predicates or a filter query.
    """
    check_predicates(predicates)

    # Keys are expected to end in ".csv" or ".csv.gz".
    compression: Optional[str]
    if key.endswith(".csv.gz"):
        compression = "gzip"
    elif key.endswith(".csv"):
        compression = None

    if (not columns) and (columns is not None):
        # pd.read_csv does not support proper reads for an empty column
        # projection (it returns a DF w/o any row), so read everything and
        # project afterwards.
        columns = None
        project_to_no_cols = True
    else:
        project_to_no_cols = False

    dtype: Optional[Dict[str, str]]
    if categories:
        dtype = {cat: "category" for cat in categories}
    else:
        dtype = None

    try:
        df = pd.read_csv(
            BytesIO(store.get(key)),
            compression=compression,
            sep=";",
            encoding="utf-8",
            usecols=columns,
            dtype=dtype,
        )
        if project_to_no_cols:
            df = df[[]]
        if len(df) == 0:
            # Pandas may pick an unexpected index type for empty frames;
            # normalize to a RangeIndex.
            df.index = pd.RangeIndex(start=0, stop=0, step=1)
    except EmptyDataError:
        df = pd.DataFrame()

    if predicates:
        return filter_df_from_predicates(df, predicates)
    else:
        return filter_df(df, filter_query)
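# Self-contained sketch of the CSV path, calling the function above directly (in the
# library it is typically reached through the CSV serializer): a gzip-compressed,
# semicolon-separated payload is put into an in-memory store and restored with a
# predicate in the [[(column, op, value)]] form consumed by filter_df_from_predicates.
import gzip
from simplekv.memory import DictStore

store = DictStore()
csv_payload = "a;b\n1;x\n2;y\n3;z\n".encode("utf-8")
store.put("my_uuid/table/data.csv.gz", gzip.compress(csv_payload))

# Restore only rows with a > 1; the ".csv.gz" suffix selects gzip decompression.
df = restore_dataframe(
    store,
    "my_uuid/table/data.csv.gz",
    predicates=[[("a", ">", 1)]],
)
assert list(df["a"]) == [2, 3]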
def storage_keys(uuid: str, store: KeyValueStore) -> List[str]:
    """
    Retrieve all storage keys that belong to the given dataset.

    Parameters
    ----------
    uuid:
        UUID of the dataset.
    store:
        Object that implements the .iter_keys method for key retrieval.

    Returns
    -------
    keys:
        Sorted list of storage keys.
    """
    start_markers = ["{}.".format(uuid), "{}/".format(uuid)]
    return list(
        sorted(
            k
            for k in store.iter_keys(uuid)
            if any(k.startswith(marker) for marker in start_markers)
        )
    )
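# Sketch of the prefix filtering (illustrative key names; assumes the store supports
# prefix-filtered iter_keys, as the function above requires). Note that a key like
# "my_uuid_other/..." shares the plain string prefix but matches neither the
# "my_uuid." nor the "my_uuid/" marker and is therefore excluded.
from simplekv.memory import DictStore

store = DictStore()
store.put("my_uuid.by-dataset-metadata.json", b"{}")
store.put("my_uuid/table/part_1.parquet", b"...")
store.put("my_uuid_other/table/part_1.parquet", b"...")

assert storage_keys("my_uuid", store) == [
    "my_uuid.by-dataset-metadata.json",
    "my_uuid/table/part_1.parquet",
]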
def read_schema_metadata(dataset_uuid: str, store: KeyValueStore, table: str) -> SchemaWrapper: """ Read schema and metadata from store. Parameters ---------- dataset_uuid Unique ID of the dataset in question. store Object that implements `.get(key)` to read data. table Table to read metadata for. Returns ------- schema: Schema Schema information for DataFrame/table. """ key = _get_common_metadata_key(dataset_uuid=dataset_uuid, table=table) return SchemaWrapper(_bytes2schema(store.get(key)), key)
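# Round-trip sketch for store_schema_metadata / read_schema_metadata above. The
# make_meta helper (deriving a SchemaWrapper from a pandas DataFrame) and its import
# path are assumptions; the rest follows the signatures shown in this section.
import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.common_metadata import make_meta  # assumed import path

store = DictStore()
schema = make_meta(pd.DataFrame({"a": [1], "b": ["x"]}), origin="example")

key = store_schema_metadata(
    schema=schema, dataset_uuid="my_uuid", store=store, table="table"
)

# Read the schema back from the key produced by _get_common_metadata_key;
# `restored` is a SchemaWrapper equivalent to `schema`.
restored = read_schema_metadata(dataset_uuid="my_uuid", store=store, table="table")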
def _restore_dataframe(
    store: KeyValueStore,
    key: str,
    filter_query: Optional[str] = None,
    columns: Optional[Iterable[str]] = None,
    predicate_pushdown_to_io: bool = True,
    categories: Optional[Iterable[str]] = None,
    predicates: Optional[PredicatesType] = None,
    date_as_object: bool = False,
) -> pd.DataFrame:
    """
    Restore a pandas DataFrame from a parquet file in the store, optionally
    applying column projection, predicate pushdown to the IO layer, and
    DataFrame-level filtering.
    """
    check_predicates(predicates)
    # If we want to do columnar access we can benefit from partial reads;
    # otherwise a full read en bloc is the better option.
    if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
        with pa.BufferReader(store.get(key)) as reader:
            table = pq.read_pandas(reader, columns=columns)
    else:
        if HAVE_BOTO and isinstance(store, BotoStore):
            # Parquet and seeks on S3 currently leak connections thus
            # we omit column projection to the store.
            reader = pa.BufferReader(store.get(key))
        else:
            reader = store.open(key)
            # Buffer at least 4 MB in requests. This is chosen because the
            # default block size of the Azure storage client is 4 MB.
            reader = BlockBuffer(reader, 4 * 1024 * 1024)
        try:
            parquet_file = ParquetFile(reader)
            if predicates and parquet_file.metadata.num_rows > 0:
                # We need to calculate different predicates for predicate
                # pushdown and the later DataFrame filtering. This is required
                # e.g. in the case where we have an `in` predicate as this has
                # different normalized values.
                columns_to_io = _columns_for_pushdown(columns, predicates)
                predicates_for_pushdown = _normalize_predicates(
                    parquet_file, predicates, True
                )
                predicates = _normalize_predicates(parquet_file, predicates, False)
                tables = _read_row_groups_into_tables(
                    parquet_file, columns_to_io, predicates_for_pushdown
                )

                if len(tables) == 0:
                    table = _empty_table_from_schema(parquet_file)
                else:
                    table = pa.concat_tables(tables)
            else:
                # ARROW-5139 Column projection with empty columns returns a table w/out index
                if columns == []:
                    # Create an arrow table with expected index length.
                    df = (
                        parquet_file.schema.to_arrow_schema()
                        .empty_table()
                        .to_pandas(date_as_object=date_as_object)
                    )
                    index = pd.Int64Index(
                        pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows)
                    )
                    df = pd.DataFrame(df, index=index)
                    # convert back to table to keep downstream code untouched by this patch
                    table = pa.Table.from_pandas(df)
                else:
                    table = pq.read_pandas(reader, columns=columns)
        finally:
            reader.close()

    if columns is not None:
        missing_columns = set(columns) - set(table.schema.names)
        if missing_columns:
            raise ValueError(
                "Columns cannot be found in stored dataframe: {missing}".format(
                    missing=", ".join(sorted(missing_columns))
                )
            )

    table = _reset_dictionary_columns(table, exclude=categories)
    df = table.to_pandas(categories=categories, date_as_object=date_as_object)
    df.columns = df.columns.map(ensure_unicode_string_type)
    if predicates:
        df = filter_df_from_predicates(
            df, predicates, strict_date_types=date_as_object
        )
    else:
        df = filter_df(df, filter_query)
    if columns is not None:
        return df.reindex(columns=columns, copy=False)
    else:
        return df
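# Sketch exercising the predicate-pushdown branch above (illustrative data; this is a
# private helper that is normally reached via the serializer's public entry point, so
# calling it directly here is purely for demonstration). A small parquet payload is
# written into an in-memory store with pyarrow and restored with a column projection
# plus a predicate in the same [[(column, op, value)]] form as the CSV example.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from simplekv.memory import DictStore

store = DictStore()
buf = pa.BufferOutputStream()
pq.write_table(
    pa.Table.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})), buf
)
store.put("my_uuid/table/part_1.parquet", buf.getvalue().to_pybytes())

# Column projection plus predicates triggers the row-group pushdown path.
df = _restore_dataframe(
    store,
    "my_uuid/table/part_1.parquet",
    columns=["a", "b"],
    predicates=[[("a", ">", 1)]],
)
assert list(df["a"]) == [2, 3]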