Example #1
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    file_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    locations = ("L-0", "L-1")
    # One "core" file per location, all sharing product P-0.
    core_keys = {
        loc: create_partition_key(
            dataset_uuid,
            "core",
            [("location", loc), ("product", "P-0")],
            f"{file_suffix}.parquet",
        )
        for loc in locations
    }
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}

    def label_for(loc):
        # Partition label as reconstructed from the storage key.
        return f"location={loc}/product=P-0/{file_suffix}"

    expected_partitions = {
        label_for(loc): {"files": {"core": core_keys[loc]}} for loc in locations
    }
    expected_indices = {
        "location": {loc: [label_for(loc)] for loc in locations},
        "product": {"P-0": [label_for(loc) for loc in locations]},
    }

    for storage_key in core_keys.values():
        store.put(storage_key, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {
        column: {label: sorted(labels) for label, labels in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
Example #2
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    # Location values containing non-ASCII and reserved characters; their
    # storage keys are percent-encoded.
    munich_key = create_partition_key(
        dataset_uuid, "core", [("location", "München")], "data.parquet"
    )
    special_key = create_partition_key(
        dataset_uuid, "core", [("location", "å\\ øß")], "data.parquet"
    )
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {"files": {"core": munich_key}},
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {"files": {"core": special_key}},
    }
    # The index maps the *decoded* value back to the quoted partition label.
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }

    store.put(munich_key, b"test")
    store.put(special_key, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #3
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store.

    Two tables ("core" and "extension") share the same partition labels; the
    loaded dataset must group both files under a single partition entry and
    build a secondary index over the "location" key.
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    partition0_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/{}".format(partition_suffix): {
            "files": {"core": partition0_core, "extension": partition0_ext}
        },
        "location=L-1/{}".format(partition_suffix): {
            "files": {"core": partition1_core, "extension": partition1_ext}
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/{}".format(partition_suffix)],
        }
    }

    # put two partitions for two tables each to store
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store.put(partition0_ext, b"test")
    store.put(partition1_ext, b"test")
    # NOTE(review): this initial "core" schema is overwritten by `core_schema`
    # below; kept to mirror the original write sequence.
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
            origin="stored",
        ),
        dataset_uuid,
        store,
        "core",
    )

    # write the table schemas the dataset loader will pick up
    core_schema = make_meta(
        pd.DataFrame(
            {
                "column_0": pd.Series([1], dtype=int),
                "column_1": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="core",
    )
    extension_schema = make_meta(
        pd.DataFrame(
            {
                "column_77": pd.Series([1], dtype=int),
                "column_78": pd.Series([1], dtype=int),
                "location": pd.Series(["str"]),
            }
        ),
        origin="extension",
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")
    # Fix: the dataset was previously loaded twice in a row with the first
    # result discarded immediately; a single load after all writes suffices.
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #4
def test_create_partition_key():
    """The key is uuid/table/index pairs (as key=value) ending in 'data'."""
    index_pairs = [("index1", "value1"), ("index2", "value2")]
    key = create_partition_key("my-uuid", "testtable", index_pairs)
    expected = "/".join(
        ["my-uuid", "testtable", "index1=value1", "index2=value2", "data"]
    )
    assert key == expected
Example #5
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    locations = ("L-0", "L-1")
    # One "core" file per location, all sharing product P-0.
    core_keys = {
        loc: create_partition_key(
            dataset_uuid,
            "core",
            [("location", loc), ("product", "P-0")],
            f"{partition_suffix}.parquet",
        )
        for loc in locations
    }
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}

    def label_for(loc):
        # Partition label as reconstructed from the storage key.
        return f"location={loc}/product=P-0/{partition_suffix}"

    expected_partitions = {
        label_for(loc): {"files": {"core": core_keys[loc]}} for loc in locations
    }
    expected_indices = {
        "location": {loc: [label_for(loc)] for loc in locations},
        "product": {"P-0": [label_for(loc) for loc in locations]},
    }

    for storage_key in core_keys.values():
        store.put(storage_key, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0"], "product": ["P-0"]}), origin="1"
        ),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and should not interfere with the
    # indices and/or partitions
    for garbage_suffix in ("", ".json", ".msgpack", ".my_own_file_format"):
        garbage_keys = (
            f"this_should_not_exist{garbage_suffix}",
            f"{dataset_uuid}/this_should_not_exist{garbage_suffix}",
            f"{dataset_uuid}/core/this_should_not_exist{garbage_suffix}",
            f"{dataset_uuid}/core/location=L-0/this_should_not_exist{garbage_suffix}",
        )
        for garbage_key in garbage_keys:
            store.put(garbage_key, b"ignore me")

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is ok for runtime
    # but does produce flaky tests thus sort them.
    sorted_result = {
        column: {label: sorted(labels) for label, labels in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices