Example #1
def test_ensure_store(store_input_types):
    store = ensure_store(store_input_types)
    assert isinstance(store, KeyValueStore)
    value = b"value"
    key = "key"
    store.put(key, value)
    assert value == store.get(key)

    assert store is ensure_store(store)
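
The tests above exercise the core contract of ensure_store: any accepted StoreInput is normalized to a concrete KeyValueStore, and an already-instantiated store is passed through unchanged. A minimal sketch of that contract, assuming ensure_store is importable from kartothek.core.utils and get_store_from_url comes from storefact (paths may differ between versions):

# Illustrative sketch; import paths are assumptions and may differ between versions.
from kartothek.core.utils import ensure_store  # assumed module path
from storefact import get_store_from_url

store = get_store_from_url("memory://")

# A zero-argument factory is resolved to the store it returns ...
assert ensure_store(lambda: store) is store
# ... and an existing store instance is passed through unchanged.
assert ensure_store(store) is store

# The result behaves like a regular simplekv store.
store.put("key", b"value")
assert store.get("key") == b"value"
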
Example #2
def _discover_dataset_meta_files(prefix: str, store: StoreInput) -> Set[str]:
    """
    Get meta file names for all datasets.

    Parameters
    ----------
    prefix
        The key prefix under which to search.
    store
        KV store.

    Returns
    -------
    names: Set[str]
        The meta file names
    """

    store = ensure_store(store)

    names = {
        name[: -len(METADATA_BASE_SUFFIX + suffix)]
        for name in store.iter_prefixes(delimiter="/", prefix=prefix)
        for suffix in [METADATA_FORMAT_JSON, METADATA_FORMAT_MSGPACK]
        if name.endswith(METADATA_BASE_SUFFIX + suffix)
    }
    return names
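
To make the suffix stripping above concrete, here is a small self-contained toy version. The constant values below are written out as literal assumptions for illustration; the real values come from kartothek's naming constants:

# Toy version of the suffix stripping above; the constant values are assumptions.
METADATA_BASE_SUFFIX = ".by-dataset-metadata"
METADATA_FORMAT_JSON = ".json"
METADATA_FORMAT_MSGPACK = ".msgpack.zstd"

prefix = "my_cube++"
keys = [
    "my_cube++enrich.by-dataset-metadata.json",
    "my_cube++seed.by-dataset-metadata.msgpack.zstd",
    "my_cube++seed/table/part-0.parquet",  # ignored: not a metadata file
]

names = {
    name[: -len(METADATA_BASE_SUFFIX + suffix)]
    for name in keys
    for suffix in [METADATA_FORMAT_JSON, METADATA_FORMAT_MSGPACK]
    if name.endswith(METADATA_BASE_SUFFIX + suffix)
}
assert names == {"my_cube++enrich", "my_cube++seed"}
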
Example #3
    def validate_schema_compatible(self, store: StoreInput,
                                   dataset_uuid: str) -> "MetaPartition":
        """
        Validates that the currently held DataFrames match the schema of the existing dataset.

        Parameters
        ----------
        store
            If it is a function, the result of calling it must be a KeyValueStore.
        dataset_uuid
            The dataset UUID the partition will be assigned to
        """

        # Load the reference meta of the existing dataset. Using the built-in
        # `load_all_table_meta` would not help here: it would be a no-op, since
        # the meta has already been loaded from the input DataFrame.
        store = ensure_store(store)
        reference_meta = read_schema_metadata(dataset_uuid=dataset_uuid,
                                              store=store,
                                              table=self.table_name)
        try:
            validate_compatible([self.schema, reference_meta])
        except ValueError as e:
            raise ValueError(
                f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
            )
        return self
Example #4
    def load(self, store: StoreInput):
        """
        Load an external index into memory. Returns a new index object that
        contains the index dictionary. Returns itself if the index is internal
        or an already loaded index.

        Parameters
        ----------
        store
            Object that implements the .get method for file/object loading.

        Returns
        -------
        index: kartothek.core.index.ExplicitSecondaryIndex
        """
        if self.loaded:
            return self

        store = ensure_store(store)

        index_buffer = store.get(self.index_storage_key)
        index_dct, column_type = _parquet_bytes_to_dict(
            self.column, index_buffer)

        return ExplicitSecondaryIndex(
            column=self.column,
            index_dct=index_dct,
            dtype=column_type,
            index_storage_key=self.index_storage_key,
            normalize_dtype=False,
        )
Example #5
def validate_partition_keys(
    dataset_uuid,
    store,
    ds_factory,
    default_metadata_version,
    partition_on,
    load_dataset_metadata=True,
):
    if ds_factory or DatasetMetadata.exists(dataset_uuid, ensure_store(store)):
        ds_factory = _ensure_factory(
            dataset_uuid=dataset_uuid,
            store=store,
            factory=ds_factory,
            load_dataset_metadata=load_dataset_metadata,
        )

        ds_metadata_version = ds_factory.metadata_version
        if partition_on:
            if not isinstance(partition_on, list):
                partition_on = [partition_on]
            if partition_on != ds_factory.partition_keys:
                raise ValueError(
                    "Incompatible set of partition keys encountered. "
                    "Input partitioning was `{}` while actual dataset was `{}`".format(
                        partition_on, ds_factory.partition_keys
                    )
                )
        else:
            partition_on = ds_factory.partition_keys
    else:
        ds_factory = None
        ds_metadata_version = default_metadata_version
    return ds_factory, ds_metadata_version, partition_on
Example #6
def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = ensure_store(store_factory)

    if ds_factory:
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset
Example #7
def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: List[str],
    sort_partitions_by: List[str],
    dataset_uuid: str,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: str = SINGLE_TABLE,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)

    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        partition_df,
        metadata_version=metadata_version,
        table_name=dataset_table_name,
    )
    if sort_partitions_by:
        mps = mps.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(store=store,
                                dataset_uuid=dataset_uuid,
                                df_serializer=df_serializer)
Example #8
def discover_datasets_unchecked(
    uuid_prefix: str,
    store: StoreInput,
    filter_ktk_cube_dataset_ids: Optional[Union[str, Iterable[str]]] = None,
) -> Dict[str, DatasetMetadata]:
    """
    Get all known datasets that may belong to a given cube without applying any checks.

    .. warning::
        The results are not checked for validity. Found datasets may be incompatible with the given cube. Use
        :meth:`check_datasets` to check the results, or use :meth:`discover_datasets` in the first place.

    Parameters
    ----------
    uuid_prefix
        Dataset UUID prefix.
    store
        KV store.
    filter_ktk_cube_dataset_ids
        Optional selection of datasets to include.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        All discovered datasets. Empty Dict if no dataset is found
    """
    store = ensure_store(store)

    filter_ktk_cube_dataset_ids = converter_str_set_optional(
        filter_ktk_cube_dataset_ids)
    prefix = uuid_prefix + KTK_CUBE_UUID_SEPERATOR

    names = _discover_dataset_meta_files(prefix, store)

    if filter_ktk_cube_dataset_ids is not None:
        names = {
            name
            for name in names
            if name[len(prefix):] in filter_ktk_cube_dataset_ids
        }

    result = {}
    # sorted iteration for deterministic error messages in case DatasetMetadata.load_from_store fails
    for name in sorted(names):
        try:
            result[name[len(prefix):]] = DatasetMetadata.load_from_store(
                uuid=name,
                store=store,
                load_schema=True,
                load_all_indices=False)
        except KeyError as e:
            _logger.warning(
                'Ignore dataset "{name}" due to KeyError: {e}'.format(
                    name=name, e=e))

    return result
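
A hedged usage sketch of the function above; the cube prefix and dataset ids are placeholders, and the store setup mirrors the memory store used in the tests elsewhere on this page:

# Hypothetical call; returns an empty dict when nothing under the prefix matches.
from storefact import get_store_from_url

store = get_store_from_url("memory://")
datasets = discover_datasets_unchecked(
    uuid_prefix="my_cube",
    store=store,
    filter_ktk_cube_dataset_ids=["seed", "enrich"],  # or None to keep everything
)
for ktk_cube_dataset_id, ds_metadata in datasets.items():
    print(ktk_cube_dataset_id, ds_metadata.uuid)
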
Example #9
    def store(self, store: StoreInput, dataset_uuid: str) -> str:
        """
        Store the index as a parquet file

        If compatible, the new keyname will be the name stored under the attribute `index_storage_key`.
        If this attribute is None, a new key will be generated of the format

            `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

        where the timestamp is in nanosecond accuracy and is created upon Index object initialization

        Parameters
        ----------
        store
            KV store (or anything accepted by ensure_store).
        dataset_uuid
            UUID of the dataset this index belongs to.
        """
        storage_key = None
        store = ensure_store(store)

        if (self.index_storage_key is not None and dataset_uuid
                and dataset_uuid in self.index_storage_key):
            storage_key = self.index_storage_key
        if storage_key is None:
            storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
                dataset_uuid=dataset_uuid,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
                column=quote(self.column),
                timestamp=quote(self.creation_time.isoformat()),
            )

        # The arrow representation of index_dct requires a large amount of memory because strings are duplicated and
        # flattened into the buffer. To avoid a high peak memory usage, split the index_dct into chunks and only convert
        # one chunk at a time to arrow.
        parts_iter = partition_all(10_000, self.index_dct.items())

        # Get first table explicit because its schema is required for ParquetWriter.
        try:
            table = _index_dct_to_table(dict(next(parts_iter)), self.column,
                                        self.dtype)
        except StopIteration:
            # index_dct was empty, just pass it entirely
            table = _index_dct_to_table(self.index_dct, self.column,
                                        self.dtype)

        buf = pa.BufferOutputStream()
        with pq.ParquetWriter(buf, schema=table.schema) as writer:
            writer.write_table(table)
            del table

            for part in parts_iter:
                writer.write_table(
                    _index_dct_to_table(dict(part), self.column, self.dtype))

        store.put(storage_key, buf.getvalue().to_pybytes())
        return storage_key
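
The memory-saving pattern described in the comments above can be reproduced in isolation: convert one slice of a large mapping to an Arrow table at a time and stream the tables into a single Parquet buffer through one ParquetWriter. The standalone sketch below uses plain pyarrow and toolz and is not kartothek-specific; to_table is a simplified stand-in for _index_dct_to_table:

# Standalone sketch of the chunked Arrow/Parquet writing pattern used above.
import pyarrow as pa
import pyarrow.parquet as pq
from toolz import partition_all

index_dct = {f"value_{i}": [f"part_{i % 7}"] for i in range(25_000)}

def to_table(chunk):
    # Simplified stand-in for _index_dct_to_table: two flat columns.
    values, partitions = zip(*((v, p) for v, parts in chunk for p in parts))
    return pa.table({"value": list(values), "partition": list(partitions)})

parts_iter = partition_all(10_000, index_dct.items())

buf = pa.BufferOutputStream()
first = to_table(next(parts_iter))
with pq.ParquetWriter(buf, schema=first.schema) as writer:
    writer.write_table(first)
    for part in parts_iter:
        writer.write_table(to_table(part))

parquet_bytes = buf.getvalue().to_pybytes()
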
Example #10
def raise_if_dataset_exists(dataset_uuid, store):
    try:
        store_instance = ensure_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                raise RuntimeError(
                    f"Dataset `{dataset_uuid}` already exists and "
                    "overwrite is not permitted!"
                )
    except KeyError:
        pass
Example #11
    def load_schema(self, store: StoreInput,
                    dataset_uuid: str) -> "MetaPartition":
        """
        Loads the table schema into memory and stores it under the `schema` attribute.

        """

        if self.schema is None:
            store = ensure_store(store)
            self.schema = read_schema_metadata(dataset_uuid=dataset_uuid,
                                               store=store,
                                               table=self.table_name)
        return self
Example #12
def _initialize_store_for_metapartition(method, method_args, method_kwargs):

    for store_variable in ["store", "storage"]:
        if store_variable in method_kwargs:
            method_kwargs[store_variable] = ensure_store(
                method_kwargs[store_variable])
        else:
            method = cast(object, method)
            args = inspect.getfullargspec(method).args

            if store_variable in args:
                ix = args.index(store_variable)
                # reduce index since the argspec and method_args start counting differently due to self
                ix -= 1
                instantiated_store = ensure_store(method_args[ix])
                new_args = []
                for ix_method, arg in enumerate(method_args):
                    if ix_method != ix:
                        new_args.append(arg)
                    else:
                        new_args.append(instantiated_store)
                method_args = tuple(new_args)

    return method_args, method_kwargs
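
The index arithmetic in the comment above ("reduce index since the argspec and method_args start counting differently due to self") can be illustrated with a tiny standalone example; the class and argument names are hypothetical:

# Toy illustration of the off-by-one between getfullargspec and the bound-call args.
import inspect

class Toy:
    def method(self, df, store):
        return store

argspec_args = inspect.getfullargspec(Toy.method).args  # ['self', 'df', 'store']
method_args = ("some_df", "some_store")                 # positional args as passed to the bound method

ix = argspec_args.index("store") - 1  # shift by one: 'self' is not part of method_args
assert method_args[ix] == "some_store"
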
Example #13
    def load_from_store(
        uuid: str,
        store: StoreInput,
        load_schema: bool = True,
        load_all_indices: bool = False,
    ) -> "DatasetMetadata":
        """
        Load a dataset from storage

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .get method for file/object loading.
        load_schema
            Load table schema
        load_all_indices
            Load all registered indices into memory.

        Returns
        -------
        dataset_metadata: :class:`~kartothek.core.dataset.DatasetMetadata`
            Parsed metadata.
        """
        key1 = naming.metadata_key_from_uuid(uuid)
        store = ensure_store(store)
        try:
            value = store.get(key1)
            metadata = load_json(value)
        except KeyError:
            key2 = naming.metadata_key_from_uuid(uuid, format="msgpack")
            try:
                value = store.get(key2)
                metadata = unpackb(value)
            except KeyError:
                raise KeyError(
                    "Dataset does not exist. Tried {} and {}".format(
                        key1, key2))

        ds = DatasetMetadata.load_from_dict(metadata,
                                            store,
                                            load_schema=load_schema)
        if load_all_indices:
            ds = ds.load_all_indices(store)
        return ds
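
A hedged usage sketch of the lookup-and-fallback behaviour documented above; "my_dataset" and the memory store are placeholders, and on an empty store the call ends in the KeyError shown in the code:

# Hypothetical usage; DatasetMetadata's module path is assumed and may differ by version.
from kartothek.core.dataset import DatasetMetadata
from storefact import get_store_from_url

store = get_store_from_url("memory://")
try:
    ds = DatasetMetadata.load_from_store(uuid="my_dataset", store=store)
except KeyError as exc:
    print(exc)  # "Dataset does not exist. Tried <json key> and <msgpack key>"
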
Example #14
    def storage_keys(uuid: str, store: StoreInput) -> List[str]:
        """
        Retrieve all keys that belong to the given dataset.

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .iter_keys method for key retrieval.

        """
        store = ensure_store(store)
        start_markers = ["{}.".format(uuid), "{}/".format(uuid)]
        return list(
            sorted(k for k in store.iter_keys(uuid) if any(
                k.startswith(marker) for marker in start_markers)))
Example #15
    def get_parquet_metadata(self, store: StoreInput) -> pd.DataFrame:
        """
        Retrieve the parquet metadata for the MetaPartition.
        Especially relevant for calculating dataset statistics.

        Parameters
        ----------
        store
          KV store (or a factory providing one); anything accepted by ensure_store

        Returns
        -------
        pd.DataFrame
          A DataFrame with relevant parquet metadata
        """
        store = ensure_store(store)

        data = {}
        with store.open(self.file) as fd:  # type: ignore
            pq_metadata = pa.parquet.ParquetFile(fd).metadata

        data = {
            "partition_label": self.label,
            "serialized_size": pq_metadata.serialized_size,
            "number_rows_total": pq_metadata.num_rows,
            "number_row_groups": pq_metadata.num_row_groups,
            "row_group_id": [],
            "number_rows_per_row_group": [],
            "row_group_compressed_size": [],
            "row_group_uncompressed_size": [],
        }
        for rg_ix in range(pq_metadata.num_row_groups):
            rg = pq_metadata.row_group(rg_ix)
            data["row_group_id"].append(rg_ix)
            data["number_rows_per_row_group"].append(rg.num_rows)
            data["row_group_compressed_size"].append(rg.total_byte_size)
            data["row_group_uncompressed_size"].append(
                sum(
                    rg.column(col_ix).total_uncompressed_size
                    for col_ix in range(rg.num_columns)))

        df = pd.DataFrame(data=data, columns=_METADATA_SCHEMA.keys())
        df = df.astype(_METADATA_SCHEMA)
        return df
Example #16
    def exists(uuid: str, store: StoreInput) -> bool:
        """
        Check if a dataset exists in storage

        Parameters
        ----------
        uuid
            UUID of the dataset.
        store
            Object that implements the .get method for file/object loading.

        """
        store = ensure_store(store)
        key = naming.metadata_key_from_uuid(uuid)

        if key in store:
            return True

        key = naming.metadata_key_from_uuid(uuid, format="msgpack")
        return key in store
Example #17
def persist_indices(store: StoreInput, dataset_uuid: str,
                    indices: Dict[str, IndexBase]) -> Dict[str, str]:
    store = ensure_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # backwards compat
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key)
        elif isinstance(index, PartitionIndex):
            continue
        index = cast(ExplicitSecondaryIndex, index)
        output_filenames[column] = index.store(store=store,
                                               dataset_uuid=dataset_uuid)
    return output_filenames
Example #18
def write_partition(
    partition_df: MetaPartitionInput,
    secondary_indices: Optional[InferredIndices],
    sort_partitions_by: Optional[Union[str, Sequence[str]]],
    dataset_uuid: str,
    partition_on: Optional[Union[str, Sequence[str]]],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    metadata_version: int,
    dataset_table_name: Optional[str] = None,
) -> MetaPartition:
    """
    Write a dataframe to store, performing all necessary preprocessing tasks
    like partitioning, bucketing (NotImplemented), indexing, etc. in the correct order.
    """
    store = ensure_store(store_factory)
    parse_input: MetaPartitionInput
    if isinstance(partition_df, pd.DataFrame) and dataset_table_name:
        parse_input = [{"data": {dataset_table_name: partition_df}}]
    else:
        parse_input = partition_df
    # delete reference to enable release after partition_on; before index build
    del partition_df
    # I don't have access to the group values
    mps = parse_input_to_metapartition(
        parse_input,
        metadata_version=metadata_version,
        expected_secondary_indices=secondary_indices,
    )
    if sort_partitions_by:
        mps = mps.apply(
            partial(sort_values_categorical, columns=sort_partitions_by))
    if partition_on:
        mps = mps.partition_on(partition_on)
    if secondary_indices:
        mps = mps.build_indices(secondary_indices)
    return mps.store_dataframes(store=store,
                                dataset_uuid=dataset_uuid,
                                df_serializer=df_serializer)
Example #19
def align_datasets(
    left_dataset_uuid: str,
    right_dataset_uuid: str,
    store: StoreInput,
    match_how: Union[Literal["exact", "prefix", "all"], Callable] = "exact",
) -> Generator[List[MetaPartition], None, None]:
    """
    Determine dataset partition alignment

    Parameters
    ----------
    left_dataset_uuid
        UUID of the left dataset.
    right_dataset_uuid
        UUID of the right dataset.
    store
        KV store (or anything accepted by ensure_store).
    match_how
        How partition labels are matched: "exact", "prefix", "all", or a
        callable taking both labels.

    Yields
    ------
    List[MetaPartition]
        One list per alignment: the partition driving the match, followed by
        the partitions matched to it.

    """
    store = ensure_store(store)
    left_dataset = DatasetMetadata.load_from_store(uuid=left_dataset_uuid, store=store)
    right_dataset = DatasetMetadata.load_from_store(
        uuid=right_dataset_uuid, store=store
    )

    metadata_version = left_dataset.metadata_version

    # Loop over the "driving" dataset: for prefix matching this is the dataset
    # whose partition labels are shorter, since its labels are treated as
    # prefixes of the other dataset's labels.
    if (
        callable(match_how)
        or match_how == "left"
        or (
            match_how == "prefix"
            and len(list(left_dataset.partitions.keys())[0])
            < len(list(right_dataset.partitions.keys())[0])
        )
    ):
        first_dataset = left_dataset
        second_dataset = right_dataset
    else:
        first_dataset = right_dataset
        second_dataset = left_dataset
    # The del statements are here to reduce confusion below
    del left_dataset
    del right_dataset

    # For every partition in the 'small' dataset, at least one partition match
    # needs to be found in the larger dataset.
    available_partitions = list(second_dataset.partitions.items())
    partition_stack = available_partitions[:]

    # TODO: write a test which protects against the following scenario!!
    # Sort the partition labels by length of the labels, starting with the
    # labels which are the longest. This way we prevent label matching for
    # similar partitions, e.g. cluster_100 and cluster_1. This, of course,
    # works only as long as the internal loop removes elements which were
    # matched already (here improperly called stack)
    for l_1 in sorted(first_dataset.partitions, key=len, reverse=True):
        p_1 = first_dataset.partitions[l_1]
        res = [
            MetaPartition.from_partition(
                partition=p_1, metadata_version=metadata_version
            )
        ]
        for parts in available_partitions:
            l_2, p_2 = parts
            if callable(match_how) and not match_how(l_1, l_2):
                continue
            if match_how == "exact" and l_1 != l_2:
                continue
            elif match_how == "prefix" and not l_2.startswith(l_1):
                LOGGER.debug("rejecting (%s, %s)", l_1, l_2)
                continue

            LOGGER.debug(
                "Found alignment between partitions " "(%s, %s) and" "(%s, %s)",
                first_dataset.uuid,
                p_1.label,
                second_dataset.uuid,
                p_2.label,
            )
            res.append(
                MetaPartition.from_partition(
                    partition=p_2, metadata_version=metadata_version
                )
            )

            # In exact or prefix matching schemes, only a single partition
            # alignment is expected; in that case, shrink the inner loop.
            if match_how in ["exact", "prefix"]:
                partition_stack.remove((l_2, p_2))
        # Need to copy, otherwise remove will alter the loop iterator
        available_partitions = partition_stack[:]
        if len(res) == 1:
            raise RuntimeError(
                "No matching partition for {} in dataset {} "
                "found".format(p_1, first_dataset)
            )
        yield res
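
A hedged sketch of how the generator above might be consumed, assuming two datasets with the placeholder UUIDs "dataset_a" and "dataset_b" already exist in the store:

# Hypothetical usage; each yielded list starts with the partition driving the
# alignment, followed by the partitions matched to it.
from storefact import get_store_from_url

store = get_store_from_url("memory://")  # placeholder; use the store that holds both datasets
for mp_list in align_datasets(
    left_dataset_uuid="dataset_a",
    right_dataset_uuid="dataset_b",
    store=store,
    match_how="prefix",
):
    base_mp, *matched_mps = mp_list
    # ... merge or compare base_mp against matched_mps here
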
Example #20
def test_ensure_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert ensure_store(lambda: store) is store
Example #21
    def delete_from_store(self, dataset_uuid: Any,
                          store: StoreInput) -> "MetaPartition":
        store = ensure_store(store)
        # Delete data first
        store.delete(self.file)
        return self.copy(file=None, data=None)
Example #22
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    schemas = set()
    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
        table_name = update_dataset.table_name
        schemas.add(update_dataset.schema)
    else:
        mp = next(iter(partition_list), None)

        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )
        table_name = mp.table_name
        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    for mp in partition_list:
        if mp.schema:
            schemas.add(mp.schema)

    dataset_builder.schema = persist_common_metadata(
        schemas=schemas,
        update_dataset=update_dataset,
        store=store,
        dataset_uuid=dataset_uuid,
        table_name=table_name,
    )

    # We can only check for non-unique partition labels here; if they occur, we
    # fail hard. The resulting dataset may be corrupted, or files may be left in
    # the store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)

    # This could be safely removed since we no longer allow the user to set this.
    # It has implications on tests if mocks are used.
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset
Example #23
def store_dataset_from_partitions(
    partition_list,
    store: StoreInput,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = ensure_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid)

    # We can only check for non-unique partition labels here; if they occur, we
    # fail hard. The resulting dataset may be corrupted, or files may be left in
    # the store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".
            format(", ".join(non_unique_labels)))

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(dataset_builder, metadata_merger,
                                      partition_list, dataset_metadata)
    dataset_builder = update_partitions(dataset_builder, partition_list,
                                        remove_partitions)
    dataset_builder = update_indices(dataset_builder, store, partition_list,
                                     remove_partitions)
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format))
    dataset = dataset_builder.to_dataset()
    return dataset