Example #1
    def modify_uuid(self, target_uuid: str):
        """
        Modify the dataset uuid and depending metadata:
        - paths to partitioning files
        - path to index files
        Parameters
        ----------
        target_uuid: str
            Modified dataset UUID.
        Returns
        -------
        DatasetMetadataBuilder
            modified builder object
        """

        # modify file names in partition metadata
        modified_partitions = {}
        for p_key, p in self.partitions.items():
            pdict = p.to_dict()
            for table_key, table_file in pdict["files"].items():
                if table_file.startswith(f"{self.uuid}/"):
                    pdict["files"][table_key] = table_file.replace(
                        self.uuid, target_uuid, 1)
            modified_partitions[p_key] = Partition.from_dict(p_key, pdict)

        self.partitions = modified_partitions

        # rewrite index storage keys that embed the old dataset UUID
        for index in self.indices.values():
            if (isinstance(index, ExplicitSecondaryIndex)
                    and index.index_storage_key is not None):
                index.index_storage_key = index.index_storage_key.replace(
                    self.uuid, target_uuid, 1)
        self.uuid = target_uuid
        return self
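
A minimal usage sketch, assuming modify_uuid is available on DatasetMetadataBuilder as its docstring indicates and that metadata version 4 is valid: renaming the UUID rewrites every partition file path that starts with the old UUID prefix.

builder = DatasetMetadataBuilder("old_uuid", metadata_version=4)  # version 4 is an assumption
builder.add_partition(
    "part_1",
    Partition(label="part_1", files={"core": "old_uuid/core/part_1.parquet"}),
)
builder.modify_uuid("new_uuid")

# The stored file path now carries the new UUID prefix.
assert (builder.partitions["part_1"].to_dict()["files"]["core"]
        == "new_uuid/core/part_1.parquet")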
Example #2
    def from_dict(dct: Dict, explicit_partitions: bool = True):
        """
        Load dataset metadata from a dictionary.

        This must have no external references. Otherwise use ``load_from_dict``
        to have them resolved automatically.
        """

        # Use the builder class for reconstruction to have a single point for metadata version changes
        builder = DatasetMetadataBuilder(
            uuid=dct[naming.UUID_KEY],
            metadata_version=dct[naming.METADATA_VERSION_KEY],
            explicit_partitions=explicit_partitions,
            partition_keys=dct.get("partition_keys", None),
            table_meta=dct.get("table_meta", None),
        )

        for key, value in dct.get("metadata", {}).items():
            builder.add_metadata(key, value)
        for partition_label, part_dct in dct.get("partitions", {}).items():
            builder.add_partition(
                partition_label, Partition.from_dict(partition_label, part_dct)
            )
        for column, index_dct in dct.get("indices", {}).items():
            if isinstance(index_dct, IndexBase):
                builder.add_embedded_index(column, index_dct)
            else:
                builder.add_embedded_index(
                    column, ExplicitSecondaryIndex.from_v2(column, index_dct))
        return builder.to_dataset()
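
A minimal sketch of the dictionary shape from_dict expects, modeled on the expected dicts in the tests below; the key names (dataset_uuid, dataset_metadata_version, partitions, metadata) come from those tests, while metadata version 4 is an assumption.

dct = {
    "dataset_uuid": "uuid",
    "dataset_metadata_version": 4,  # assumed metadata version
    "partitions": {"part_1": {"files": {"core": "uuid/core/part_1.parquet"}}},
    "metadata": {"key": "value"},
}
dataset = DatasetMetadata.from_dict(dct)  # every reference must already be embedded in dct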
Example #3
def update_partitions(dataset_builder, add_partitions, remove_partitions):
    """Add partitions from the given metapartitions and drop the removed labels."""

    for mp in add_partitions:
        for sub_mp_dct in mp.metapartitions:
            # label is None in case of an empty partition
            if sub_mp_dct["label"] is not None:
                partition = Partition(label=sub_mp_dct["label"],
                                      files=sub_mp_dct["files"])
                dataset_builder.add_partition(sub_mp_dct["label"], partition)

    for partition_name in remove_partitions:
        del dataset_builder.partitions[partition_name]

    return dataset_builder
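
A minimal sketch of the contract this helper relies on: each element of add_partitions only needs a metapartitions attribute holding dicts with "label" and "files" keys (in the real code base these are metapartition objects); the SimpleNamespace stand-in below is purely hypothetical, and dataset_builder is assumed to be an existing DatasetMetadataBuilder.

from types import SimpleNamespace

# Hypothetical stand-in for a metapartition object.
mp = SimpleNamespace(
    metapartitions=[{"label": "part_1", "files": {"core": "uuid/core/part_1.parquet"}}]
)

# Labels in remove_partitions must already exist in the builder, otherwise a KeyError is raised.
dataset_builder = update_partitions(dataset_builder, add_partitions=[mp], remove_partitions=[])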
Example #4
def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }

    builder = DatasetMetadataBuilder("uuid",
                                     metadata_version=metadata_version,
                                     partition_keys=["L", "P"])
    part_2 = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", part_2)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex("col1", {
            "a": ["run_id=1/L=1/P=1/part_1"],
            "b": ["run_id=2/L=1/P=1/part_1"]
        }),
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
Example #5
def test_eq():
    assert not (Partition("label") == Partition("other_label"))
    assert not (
        Partition("label") == Partition("label", files={"some": "file"})
    )

    assert Partition(label="label", files={"some": "file"}) == Partition(
        label="label", files={"some": "file"}
    )
Example #6
def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "part_2": {
                "files": {
                    "core": "uuid/core/part_2.parquet"
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["part1"],
                "b": ["part2"]
            }
        },
    }

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {
            "a": ["part1"],
            "b": ["part2"]
        }))

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict
Example #7
    def partition(self) -> Partition:
        return Partition(label=self.label, files={self.table_name: self.file})
Example #8
def test_roundtrip():
    expected = {"files": {"Queejeb3": "file.parquet"}}
    result = Partition.from_dict("partition_label", expected).to_dict()
    assert expected == result
Example #9
def test_raise_on_erroneous_input():
    with pytest.raises(ValueError):
        Partition.from_dict(label="label",
                            dct="some_not_supported_external_ref")
Example #10
def test_roundtrip_empty_metadata():
    _input = {"files": {"Queejeb3": "file.parquet"}}
    expected = {"files": {"Queejeb3": "file.parquet"}}
    result = Partition.from_dict("partition_label", _input).to_dict()
    assert expected == result