Example 1
def update_dataset_from_partitions(
    partition_list,
    store_factory,
    dataset_uuid,
    ds_factory,
    delete_scope,
    metadata,
    metadata_merger,
):
    store = _instantiate_store(store_factory)

    if ds_factory:
        # Load the existing indices so they can be updated together with the
        # new partitions.
        ds_factory = ds_factory.load_all_indices()
        remove_partitions = _get_partitions(ds_factory, delete_scope)

        # Drop PartitionIndex objects; only secondary indices are carried over
        # into the updated dataset.
        index_columns = list(ds_factory.indices.keys())
        for column in index_columns:
            index = ds_factory.indices[column]
            if isinstance(index, PartitionIndex):
                del ds_factory.indices[column]
    else:
        # Dataset does not exist yet.
        remove_partitions = []

    new_dataset = store_dataset_from_partitions(
        partition_list=partition_list,
        store=store,
        dataset_uuid=dataset_uuid,
        dataset_metadata=metadata,
        metadata_merger=metadata_merger,
        update_dataset=ds_factory,
        remove_partitions=remove_partitions,
    )

    return new_dataset
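Each of these examples begins with `_instantiate_store`, which is not shown on this page. A minimal sketch of the behaviour it is assumed to have, namely calling the argument when it is a store factory and passing an already constructed store through unchanged (all names below are hypothetical):

def _instantiate_store_sketch(store):
    # Assumed behaviour of the private helper: a zero-argument store factory
    # is invoked, an already constructed store is returned unchanged.
    if callable(store):
        return store()
    return store


class _DictStore(dict):
    """Toy in-memory key-value store, for illustration only."""


store = _instantiate_store_sketch(_DictStore)  # factory -> fresh instance
assert _instantiate_store_sketch(store) is store  # instance -> passed through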
Example 2
def raise_if_dataset_exists(dataset_uuid, store):
    try:
        store_instance = _instantiate_store(store)
        for form in ["msgpack", "json"]:
            key = naming.metadata_key_from_uuid(uuid=dataset_uuid, format=form)
            if key in store_instance:
                raise RuntimeError(
                    "Dataset `{}` already exists and overwrite is not permitted!".format(
                        dataset_uuid
                    )
                )
    except KeyError:
        pass
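The overwrite guard only needs key membership tests against the store. A self-contained sketch of the same pattern, using a plain dict as the store and a simplified metadata key layout (the real keys come from kartothek's `naming` module; the scheme below is assumed purely for illustration):

def raise_if_exists_sketch(dataset_uuid, store):
    # Simplified key scheme, for illustration only.
    for fmt in ("msgpack", "json"):
        key = "{}.by-dataset-metadata.{}".format(dataset_uuid, fmt)
        if key in store:
            raise RuntimeError(
                "Dataset `{}` already exists and overwrite is not permitted!".format(
                    dataset_uuid
                )
            )


store = {"existing_uuid.by-dataset-metadata.json": b"{}"}
raise_if_exists_sketch("new_uuid", store)  # passes silently
try:
    raise_if_exists_sketch("existing_uuid", store)  # raises
except RuntimeError as exc:
    print(exc)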
Example 3
def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # Backwards compatibility: plain dict indices are wrapped into an
        # ExplicitSecondaryIndex stored under the legacy storage key.
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key)
        elif isinstance(index, PartitionIndex):
            # PartitionIndex objects are not persisted as separate index files.
            continue
        output_filenames[column] = index.store(store=store,
                                               dataset_uuid=dataset_uuid)
    return output_filenames
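The loop above distinguishes three cases: legacy indices handed in as plain dicts are wrapped into an ExplicitSecondaryIndex under a legacy storage key, PartitionIndex objects are skipped, and everything else is persisted through its own store method. A toy sketch of that dispatch with stand-in classes instead of kartothek's index types (all names and the suffix value are hypothetical):

LEGACY_SUFFIX = ".index"  # stand-in for naming.EXTERNAL_INDEX_SUFFIX


class ToySecondaryIndex:
    def __init__(self, column, index_dct, index_storage_key=None):
        self.column = column
        self.index_dct = index_dct
        self.index_storage_key = index_storage_key

    def store(self, store, dataset_uuid):
        key = self.index_storage_key or "{}/indices/{}".format(dataset_uuid, self.column)
        store[key] = repr(self.index_dct).encode()  # toy serialization
        return key


class ToyPartitionIndex:
    pass


def persist_indices_sketch(store, dataset_uuid, indices):
    output_filenames = {}
    for column, index in indices.items():
        if isinstance(index, dict):  # legacy plain-dict index
            legacy_key = "{}.{}{}".format(dataset_uuid, column, LEGACY_SUFFIX)
            index = ToySecondaryIndex(column, index, index_storage_key=legacy_key)
        elif isinstance(index, ToyPartitionIndex):  # derived from partition keys
            continue
        output_filenames[column] = index.store(store=store, dataset_uuid=dataset_uuid)
    return output_filenames


store = {}
result = persist_indices_sketch(
    store, "uuid", {"color": {"red": ["p1"]}, "date": ToyPartitionIndex()}
)
print(result)  # {'color': 'uuid.color.index'}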
Example 4
def align_datasets(left_dataset_uuid, right_dataset_uuid, store, match_how="exact"):
    """
    Determine dataset partition alignment

    Parameters
    ----------
    left_dataset_uuid : basestring
    right_dataset_uuid : basestring
    store : KeyValueStore or callable
    match_how : basestring or callable, {exact, prefix, left, all}

    Yields
    ------
    list of MetaPartition
        The partition of the first (driving) dataset followed by all matching
        partitions of the second dataset.

    """
    store = _instantiate_store(store)
    left_dataset = DatasetMetadata.load_from_store(uuid=left_dataset_uuid, store=store)
    right_dataset = DatasetMetadata.load_from_store(
        uuid=right_dataset_uuid, store=store
    )

    metadata_version = left_dataset.metadata_version

    # Decide which dataset drives the outer loop: the left dataset for callable
    # or "left" matching; for prefix matching, the dataset with the shorter
    # partition labels (its labels act as the prefixes).
    if (
        callable(match_how)
        or match_how == "left"
        or (
            match_how == "prefix"
            and len(list(left_dataset.partitions.keys())[0])
            < len(list(right_dataset.partitions.keys())[0])
        )
    ):
        first_dataset = left_dataset
        second_dataset = right_dataset
    else:
        first_dataset = right_dataset
        second_dataset = left_dataset
    # The del statements are here to reduce confusion below
    del left_dataset
    del right_dataset

    # For every partition in the 'small' dataset, at least one partition match
    # needs to be found in the larger dataset.
    available_partitions = list(second_dataset.partitions.items())
    partition_stack = available_partitions[:]

    # TODO: write a test which protects against the following scenario!!
    # Sort the partition labels by length, starting with the longest labels.
    # This prevents a short label from matching a similar, longer one, e.g.
    # cluster_1 claiming partitions of cluster_100. This only works as long as
    # the inner loop removes elements that were already matched (the list is
    # somewhat misleadingly called a stack here).
    for l_1 in sorted(first_dataset.partitions, key=len, reverse=True):
        p_1 = first_dataset.partitions[l_1]
        res = [
            MetaPartition.from_partition(
                partition=p_1, metadata_version=metadata_version
            )
        ]
        for parts in available_partitions:
            l_2, p_2 = parts
            if callable(match_how) and not match_how(l_1, l_2):
                continue
            if match_how == "exact" and l_1 != l_2:
                continue
            elif match_how == "prefix" and not l_2.startswith(l_1):
                LOGGER.debug("rejecting (%s, %s)", l_1, l_2)
                continue

            LOGGER.debug(
                "Found alignment between partitions (%s, %s) and (%s, %s)",
                first_dataset.uuid,
                p_1.label,
                second_dataset.uuid,
                p_2.label,
            )
            res.append(
                MetaPartition.from_partition(
                    partition=p_2, metadata_version=metadata_version
                )
            )

            # In exact or prefix matching schemes only one partition alignment
            # is expected per label; in that case shrink the inner loop.
            if match_how in ["exact", "prefix"]:
                partition_stack.remove((l_2, p_2))
        # Need to copy, otherwise remove will alter the loop iterator
        available_partitions = partition_stack[:]
        if len(res) == 1:
            raise RuntimeError(
                "No matching partition for {} in dataset {} "
                "found".format(p_1, first_dataset)
            )
        yield res
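The TODO above hints at why the outer loop sorts labels by length, longest first: with prefix matching and no such ordering, `cluster_1` would claim partitions that actually belong to `cluster_100`, because `"cluster_100/...".startswith("cluster_1")` is true. A stand-alone illustration with plain strings (no kartothek objects involved):

def prefix_align(left_labels, right_labels):
    available = list(right_labels)
    matches = {}
    # Longest labels first, so cluster_100 is matched before cluster_1 gets a
    # chance to claim cluster_100/part_0 as a prefix match.
    for l_1 in sorted(left_labels, key=len, reverse=True):
        for l_2 in list(available):
            if l_2.startswith(l_1):
                matches.setdefault(l_1, []).append(l_2)
                available.remove(l_2)
    return matches


print(prefix_align(["cluster_1", "cluster_100"],
                   ["cluster_100/part_0", "cluster_1/part_0"]))
# {'cluster_100': ['cluster_100/part_0'], 'cluster_1': ['cluster_1/part_0']}

With the sort reversed (shortest first), cluster_1 would match both labels and cluster_100 would be left without a partner.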
Example 5
def store_dataset_from_partitions(
    partition_list,
    store,
    dataset_uuid,
    dataset_metadata=None,
    metadata_merger=None,
    update_dataset=None,
    remove_partitions=None,
    metadata_storage_format=naming.DEFAULT_METADATA_STORAGE_FORMAT,
):
    store = _instantiate_store(store)

    if update_dataset:
        dataset_builder = DatasetMetadataBuilder.from_dataset(update_dataset)
        metadata_version = dataset_builder.metadata_version
    else:
        mp = next(iter(partition_list), None)
        if mp is None:
            raise ValueError(
                "Cannot store empty datasets, partition_list must not be empty if in store mode."
            )

        metadata_version = mp.metadata_version
        dataset_builder = DatasetMetadataBuilder(
            uuid=dataset_uuid,
            metadata_version=metadata_version,
            partition_keys=mp.partition_keys,
        )

    dataset_builder.explicit_partitions = True

    dataset_builder.table_meta = persist_common_metadata(
        partition_list, update_dataset, store, dataset_uuid
    )

    # We can only check for non-unique partition labels at this point; if any
    # occur we fail hard. The resulting dataset may be corrupted, or files may
    # be left in the store without dataset metadata.
    partition_labels = partition_labels_from_mps(partition_list)
    non_unique_labels = extract_duplicates(partition_labels)

    if non_unique_labels:
        raise ValueError(
            "The labels {} are duplicated. Dataset metadata was not written.".format(
                ", ".join(non_unique_labels)
            )
        )

    if remove_partitions is None:
        remove_partitions = []

    if metadata_merger is None:
        metadata_merger = combine_metadata

    dataset_builder = update_metadata(
        dataset_builder, metadata_merger, partition_list, dataset_metadata
    )
    dataset_builder = update_partitions(
        dataset_builder, partition_list, remove_partitions
    )
    dataset_builder = update_indices(
        dataset_builder, store, partition_list, remove_partitions
    )
    if metadata_storage_format.lower() == "json":
        store.put(*dataset_builder.to_json())
    elif metadata_storage_format.lower() == "msgpack":
        store.put(*dataset_builder.to_msgpack())
    else:
        raise ValueError(
            "Unknown metadata storage format encountered: {}".format(
                metadata_storage_format
            )
        )
    dataset = dataset_builder.to_dataset()
    return dataset
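`extract_duplicates` is not shown on this page; the duplicate-label guard only needs the set of labels that occur more than once. A minimal sketch, assuming that is all the helper does (the name and implementation below are hypothetical):

from collections import Counter


def extract_duplicates_sketch(labels):
    # Every label that occurs more than once in the input.
    return {label for label, count in Counter(labels).items() if count > 1}


print(extract_duplicates_sketch(["part_0", "part_1", "part_0"]))  # {'part_0'}
print(extract_duplicates_sketch(["part_0", "part_1"]))            # set()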