Example #1
def update_indices_from_partitions(partition_list, dataset_metadata_factory):
    """
    This takes indices from a partition list and overwrites all indices in the dataset metadata
    provided by the dataset metadata factory. The same is done in the store dataset part. This is used
    in an additional build index step (by the build_dataset_indices__pipeline) which should be used after
    updating partitions of a dataset.
    """

    dataset_indices = MetaPartition.merge_indices(partition_list)

    indices = persist_indices(
        store=dataset_metadata_factory.store,
        dataset_uuid=dataset_metadata_factory.uuid,
        indices=dataset_indices,
    )

    for column, storage_key in indices.items():
        dataset_metadata_factory.indices[column] = ExplicitSecondaryIndex(
            column=column, index_storage_key=storage_key
        )

    dataset_metadata_factory.store.put(
        naming.metadata_key_from_uuid(dataset_metadata_factory.uuid),
        dataset_metadata_factory.to_json(),
    )
    return dataset_metadata_factory
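
A hedged sketch of a call site for this helper; get_dataset_factory is a hypothetical stand-in, and the factory only needs to expose the attributes the function actually touches (store, uuid, indices, to_json):

factory = get_dataset_factory("my_dataset_uuid")  # hypothetical helper, not shown above
# partition_list: MetaPartition objects whose indices are already loaded
factory = update_indices_from_partitions(partition_list, factory)
print(sorted(factory.indices))  # columns whose indices were just persisted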
Example #2
def update_indices(dataset_builder, store, add_partitions, remove_partitions):
    dataset_indices = dataset_builder.indices
    partition_indices = MetaPartition.merge_indices(add_partitions)

    if dataset_indices:  # dataset already exists and will be updated
        if remove_partitions:
            for column, dataset_index in dataset_indices.items():
                dataset_indices[column] = dataset_index.remove_partitions(
                    remove_partitions, inplace=True)

        for column, index in partition_indices.items():
            dataset_indices[column] = dataset_indices[column].update(
                index, inplace=True)

    else:  # dataset index will be created first time from partitions
        dataset_indices = partition_indices

    # Store indices
    index_filenames = persist_indices(store=store,
                                      dataset_uuid=dataset_builder.uuid,
                                      indices=dataset_indices)
    for column, filename in index_filenames.items():
        dataset_builder.add_external_index(column, filename)

    return dataset_builder
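
For reference, a minimal sketch of the two ExplicitSecondaryIndex operations this update path relies on, using the constructor shape from Example #3 below; the dict payloads, and the assumption that values emptied by remove_partitions disappear, are illustrative only:

idx = ExplicitSecondaryIndex("location", {"Loc1": ["p1", "p2"], "Loc2": ["p2"]})
idx = idx.remove_partitions(["p2"])  # purge label "p2"; assumed -> {"Loc1": ["p1"]}
idx = idx.update(ExplicitSecondaryIndex("location", {"Loc1": ["p3"]}))
# merged result; assumed -> {"Loc1": ["p1", "p3"]}

With inplace=True the calls in update_indices presumably mutate the index and return the same object, so the reassignment there looks defensive rather than required.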
Example #3
def test_merge_indices():
    indices = [
        MetaPartition(
            label="label1",
            indices={"location": {"Loc1": ["label1"], "Loc2": ["label1"]}},
        ),
        MetaPartition(
            label="label2",
            indices={
                "location": {"Loc3": ["label2"], "Loc2": ["label2"]},
                "product": {"Product1": ["label2"], "Product2": ["label2"]},
            },
        ),
    ]
    result = MetaPartition.merge_indices(indices)
    expected = {
        "location": ExplicitSecondaryIndex(
            "location",
            {"Loc1": ["label1"], "Loc2": ["label1", "label2"], "Loc3": ["label2"]},
        ),
        "product": ExplicitSecondaryIndex(
            "product", {"Product1": ["label2"], "Product2": ["label2"]}
        ),
    }
    assert result == expected
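
The assertion pins down the merge semantics: a value indexed by only one partition passes through unchanged, while a value observed in several partitions ("Loc2") ends up mapping to the concatenated list of their labels.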
Example #4
def time_merge_indices(self, cardinality, num_values, partitions_to_merge):
    MetaPartition.merge_indices(self.merge_indices)
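
Example #4 reads like an airspeed velocity (asv) timing method and presumes a setup hook that populates self.merge_indices. A minimal sketch of such a harness follows; the class name, the parameter grid, and the interpretation of cardinality and num_values are all assumptions, and the timed method above would live on this class:

class MergeIndicesSuite:
    # Hypothetical asv parameter grid matching the method signature above.
    params = ([10, 100], [1, 4], [2, 8])
    param_names = ["cardinality", "num_values", "partitions_to_merge"]

    def setup(self, cardinality, num_values, partitions_to_merge):
        # Following the shape in Example #3, each partition's index maps a
        # value to that partition's own label. How cardinality and num_values
        # are meant to differ is not visible here, so the sketch simply
        # generates cardinality * num_values distinct values per partition.
        self.merge_indices = [
            MetaPartition(  # same MetaPartition as in the examples above
                label="part_{}".format(i),
                indices={
                    "col": {
                        "val_{}".format(v): ["part_{}".format(i)]
                        for v in range(cardinality * num_values)
                    }
                },
            )
            for i in range(partitions_to_merge)
        ]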