Example #1
def test_overlap_keyspace(store, metadata_version):
    dataset_uuid1 = "uuid+namespace-attribute12_underscored"
    dataset_uuid2 = "uuid+namespace-attribute12_underscored_ext"
    table = "core"

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0 = "location=L-0".format(dataset_uuid)
        partition0_key = "{}/{}/{}/data.parquet".format(
            dataset_uuid, table, partition0)
        metadata = {
            "dataset_metadata_version": metadata_version,
            "dataset_uuid": dataset_uuid,
        }

        # store the metadata, one partition and the table schema for each dataset
        store.put(
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            simplejson.dumps(metadata).encode("utf-8"),
        )
        store.put(partition0_key, b"test")
        store_schema_metadata(
            make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
            dataset_uuid,
            store,
            "core",
        )

    for dataset_uuid in (dataset_uuid1, dataset_uuid2):
        partition0_label = "location=L-0/data"
        partition0_key = "{}/{}/{}.parquet".format(dataset_uuid, table,
                                                   partition0_label)
        expected_partitions = {
            "location=L-0/data": {
                "files": {
                    "core": partition0_key
                }
            }
        }
        expected_indices = {"location": {"L-0": ["location=L-0/data"]}}
        assert DatasetMetadata.storage_keys(dataset_uuid, store) == [
            "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
            _get_common_metadata_key(dataset_uuid, "core"),
            partition0_key,
        ]
        dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
        dmd = dmd.load_partition_indices()
        dmd_dict = dmd.to_dict()
        assert dmd_dict["partitions"] == expected_partitions
        assert dmd_dict["indices"] == expected_indices
Example #2
def test_dynamic_partitions_quote(store, metadata_version):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    dataset_uuid = "uuid-namespace-attribute12_underscored"
    partition0_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "München")],
                                           "data.parquet")
    partition1_core = create_partition_key(dataset_uuid, "core",
                                           [("location", "å\\ øß")],
                                           "data.parquet")
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "location=M%C3%BCnchen/data": {
            "files": {
                "core": partition0_core
            }
        },
        "location=%C3%A5%5C%20%C3%B8%C3%9F/data": {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "München": ["location=M%C3%BCnchen/data"],
            "å\\ øß": ["location=%C3%A5%5C%20%C3%B8%C3%9F/data"],
        }
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({"location": ["L-0"]}), origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported in older versions anyway
    df = df_all_types[[
        c for c in df_all_types.columns
        if (not c.startswith("array_")  # array types (always null)
            and c != "unicode"  # unicode type (alway null)
            and "8" not in c  # 8 bit types are casted to 64 bit
            and "16" not in c  # 16 bit types are casted to 64 bit
            and "32" not in c  # 32 bit types are casted to 64 bit
            )
    ]]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(dataset_uuid="dataset_uuid_old",
                                       store=store,
                                       table="table")
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
def test_empty_dataframe_from_schema(df_all_types):
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema)

    expected_df = df_all_types.loc[[]]
    expected_df["date"] = pd.Series([], dtype="datetime64[ns]")
    for c in expected_df.columns:
        if c.startswith("float"):
            expected_df[c] = pd.Series([], dtype=float)
        if c.startswith("int"):
            expected_df[c] = pd.Series([], dtype=int)
        if c.startswith("uint"):
            expected_df[c] = pd.Series([], dtype=np.uint64)

    pdt.assert_frame_equal(actual_df, expected_df)
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    if not ARROW_LARGER_EQ_0130:
        fields.append(pa.field("__index_level_0__", pa.int64()))
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
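The expected field list above shows that make_meta widens 8/16/32-bit integers, unsigned integers and float32 to their 64-bit counterparts. A minimal sketch of that normalization, relying only on behaviour asserted elsewhere in this listing (schema equality ignores the origin, see test_reorder):

# Sketch: two frames differing only in integer width normalize to the same
# schema, because make_meta widens int8 to int64.
df_narrow = pd.DataFrame({"x": pd.Series([1], dtype="int8")})
df_wide = pd.DataFrame({"x": pd.Series([1], dtype="int64")})
assert make_meta(df_narrow, origin="narrow") == make_meta(df_wide, origin="wide")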
Example #6
def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data":
                    "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(meta_dct).encode("utf-8"))
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={
            1: ["part_1", "part_2"],
            3: ["part_3"]
        },
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(store=store,
                                          uuid="uuid",
                                          load_all_indices=True)
    assert "location_id" in dmd.indices
Example #7
def test_load_partition_indices_types(store):
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    table = "table"
    index_name = "location_id"
    index_value = 1
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": dataset_uuid,
        "partitions": {
            "{index_name}={index_value}/part_1".format(index_name=index_name,
                                                       index_value=index_value):
            {
                "files": {
                    table:
                    "{dataset_uuid}/{table}/location_id=1/part_1.parquet".
                    format(dataset_uuid=dataset_uuid, table=table)
                }
            }
        },
    }
    store.put(
        "{dataset_uuid}.by-dataset-metadata.json".format(
            dataset_uuid=dataset_uuid),
        simplejson.dumps(meta_dct).encode(),
    )
    store_schema_metadata(
        make_meta(
            pd.DataFrame({index_name: pd.Series([index_value], dtype=int)}),
            origin="core",
        ),
        dataset_uuid,
        store,
        table,
    )
    dmd = DatasetMetadata.load_from_store(store=store, uuid=dataset_uuid)

    dmd = dmd.load_partition_indices()
    assert len(dmd.indices) == 1

    assert "location_id" in dmd.indices
    assert isinstance(dmd.indices["location_id"], PartitionIndex)

    idx = dmd.indices["location_id"]
    assert idx.dtype == pa.int64()
    assert idx.query(1) == ["location_id=1/part_1"]
def test_parse_nested_input_schema_compatible_but_different():
    # Ensure that input can be parsed even though the schemas are not identical but compatible
    df_input = [[
        {
            "data": {
                "table": pd.DataFrame({"A": [None]})
            }
        },
        {
            "data": {
                "table": pd.DataFrame({"A": ["str"]})
            }
        },
    ]]
    mp = parse_input_to_metapartition(df_input, metadata_version=4)
    expected_schema = make_meta(pd.DataFrame({"A": ["str"]}),
                                origin="expected")
    assert mp.table_meta["table"] == expected_schema
Example #9
def test_load_partition_keys(store):
    expected = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "uuid/table/index=1/index2=2/file.parquet"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "uuid/table/index=1/index2=2/file2.parquet"
                }
            },
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "100": ["part_1", "part_2"],
                "34": ["part_1"],
            },
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(expected).encode("utf-8"))
    df = pd.DataFrame({
        "index": [1],
        "index2": [1],
        "product_id": [1],
        "location_id": [1]
    })
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")
    dmd = DatasetMetadata.load_from_store("uuid", store)
    assert dmd.partition_keys == ["index", "index2"]
def test_reconstruct_index_duplicates(store):
    ser = ParquetSerializer()
    df = pd.DataFrame({"index_col": [1, 1], "column": list("ab")})

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=4,
        schema=schema,
        partition_keys=["index_col"],
    )
    mp = mp.load_dataframes(store)
    df_actual = mp.data
    df_expected = pd.DataFrame(
        OrderedDict([("index_col", [2, 2]), ("column", list("ab"))]))
    pdt.assert_frame_equal(df_actual, df_expected)
Example #11
def test_read_table_meta(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "dataset_uuid",
        "partitions": {
            "location_id=1/part_1": {
                "files": {
                    "table1":
                    "dataset_uuid/table1/location_id=1/part_1.parquet"
                }
            }
        },
    }
    df1 = pd.DataFrame({
        "location_id": pd.Series([1], dtype=int),
        "x": pd.Series([True], dtype=bool)
    })
    schema1 = make_meta(df1, origin="1")
    store_schema_metadata(schema1, "dataset_uuid", store, "table1")

    dmd = DatasetMetadata.load_from_dict(meta_dct, store)

    assert dmd.schema == schema1
Example #12
def test_to_dict(metadata_version):
    df = pd.DataFrame({"A": [1]})
    schema = make_meta(df, origin="test")
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=df,
        indices={"test": [1, 2, 3]},
        metadata_version=metadata_version,
        schema=schema,
    )
    mp_dct = mp.to_dict()
    assert mp_dct == {
        "label": "label_1",
        "data": df,
        "file": "file",
        "indices": {"test": [1, 2, 3]},
        "metadata_version": metadata_version,
        "schema": schema,
        "partition_keys": [],
        "logical_conjunction": None,
        "table_name": SINGLE_TABLE,
    }
Example #13
def test_reconstruct_date_index(store, metadata_version, dates_as_object):
    ser = ParquetSerializer()
    # Even if the parquet file includes the primary index col, use the reconstructed index and ignore the file content
    df = pd.DataFrame(
        {"index_col": [date(2018, 6, 1), date(2018, 6, 1)], "column": list("ab")}
    )

    label = "dontcare"
    key_prefix = "uuid/table/index_col=2018-06-02/{}".format(label)
    key = ser.store(store, key_prefix, df)

    schema = make_meta(df, origin="1", partition_keys="index_col")
    store_schema_metadata(schema, "uuid", store)

    mp = MetaPartition(
        label="dontcare",
        file=key,
        metadata_version=metadata_version,
        schema=schema,
        partition_keys=["index_col"],
    )

    mp = mp.load_dataframes(store, dates_as_object=dates_as_object)
    df_actual = mp.data
    if dates_as_object:
        dt_constructor = date
    else:
        dt_constructor = datetime
    df_expected = pd.DataFrame(
        OrderedDict(
            [
                ("index_col", [dt_constructor(2018, 6, 2), dt_constructor(2018, 6, 2)]),
                ("column", list("ab")),
            ]
        )
    )
    pdt.assert_frame_equal(df_actual, df_expected)
Example #14
def test_validate_compatible_other_pandas(df_all_types, remove_metadata, ignore_pandas):
    def _with_pandas(version):
        schema = make_meta(df_all_types, origin=version)
        metadata = schema.metadata
        pandas_metadata = simplejson.loads(metadata[b"pandas"].decode("utf8"))
        pandas_metadata["pandas_version"] = version
        metadata[b"pandas"] = simplejson.dumps(pandas_metadata).encode("utf8")
        schema = SchemaWrapper(pa.schema(schema, metadata), version)
        if remove_metadata:
            return schema.remove_metadata()
        else:
            return schema

    schema1 = make_meta(df_all_types, origin="all")
    schema2 = _with_pandas("0.19.0")
    schema3 = _with_pandas("0.99.0")
    if remove_metadata and not ignore_pandas:
        # This should fail as long as we have the metadata attached
        with pytest.raises(ValueError):
            validate_compatible(
                [schema1, schema2, schema3], ignore_pandas=ignore_pandas
            )
        schema1 = schema1.remove_metadata()
    validate_compatible([schema1, schema2, schema3], ignore_pandas=ignore_pandas)
Example #15
def test_dynamic_partitions_multiple_indices(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(suffix): {
            "files": {
                "core": partition0_core
            }
        },
        "location=L-1/product=P-0/{}".format(suffix): {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(suffix),
                "location=L-1/product=P-0/{}".format(suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({
            "location": ["L-0"],
            "product": ["P-0"]
        }),
                  origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is fine at runtime
    # but makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x)
                 for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
Example #16
def df_all_types_schema(df_all_types):
    return make_meta(df_all_types, origin="df_all_types")
Example #17
def df_all_types_empty_schema(df_all_types):
    df_empty = df_all_types.drop(0)
    assert df_empty.empty
    return make_meta(df_empty, origin="df_empty")
def test_diff_schemas(df_all_types):
    # Prepare a schema with one missing, one additional and one changed column
    df2 = df_all_types.drop(columns=df_all_types.columns[0])
    df2["new_col"] = pd.Series(df_all_types["bool"])
    df2["int16"] = df2["int16"].astype(float)
    df2 = df2.reset_index(drop=True)

    schema2 = make_meta(df2, origin="2")
    schema1 = make_meta(df_all_types, origin="1")
    diff = _diff_schemas(schema1, schema2)
    expected_arrow_diff = """Arrow schema:
@@ -1,5 +1,3 @@

-array_float32: list<item: double>
-  child 0, item: double
 array_float64: list<item: double>
   child 0, item: double
 array_int16: list<item: int64>
@@ -26,10 +24,11 @@

 datetime64: timestamp[ns]
 float32: double
 float64: double
-int16: int64
+int16: double
 int32: int64
 int64: int64
 int8: int64
+new_col: bool
 null: null
 uint16: uint64
 uint32: uint64

"""
    expected_pandas_diff = """Pandas_metadata:
@@ -3,12 +3,7 @@

                      'name': None,
                      'numpy_type': 'object',
                      'pandas_type': 'unicode'}],
- 'columns': [{'field_name': 'array_float32',
-              'metadata': None,
-              'name': 'array_float32',
-              'numpy_type': 'object',
-              'pandas_type': 'list[float64]'},
-             {'field_name': 'array_float64',
+ 'columns': [{'field_name': 'array_float64',
               'metadata': None,
               'name': 'array_float64',
               'numpy_type': 'object',
@@ -91,8 +86,8 @@

              {'field_name': 'int16',
               'metadata': None,
               'name': 'int16',
-              'numpy_type': 'int64',
-              'pandas_type': 'int64'},
+              'numpy_type': 'float64',
+              'pandas_type': 'float64'},
              {'field_name': 'int32',
               'metadata': None,
               'name': 'int32',
@@ -108,6 +103,11 @@

               'name': 'int8',
               'numpy_type': 'int64',
               'pandas_type': 'int64'},
+             {'field_name': 'new_col',
+              'metadata': None,
+              'name': 'new_col',
+              'numpy_type': 'bool',
+              'pandas_type': 'bool'},
              {'field_name': 'null',
               'metadata': None,
               'name': 'null',"""

    assert diff == expected_arrow_diff + expected_pandas_diff
def test_pickle(df_all_types):
    obj1 = make_meta(df_all_types, origin="df_all_types")
    s = pickle.dumps(obj1)
    obj2 = pickle.loads(s)
    assert obj1 == obj2
def generate_mp():
    return MetaPartition(
        label=uuid.uuid4().hex,
        schema=make_meta(get_dataframe_alltypes(), origin="alltypes"),
        file="fakefile",
    )
def test_empty_dataframe_from_schema_columns(df_all_types):
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema, ["uint64", "int64"])

    expected_df = df_all_types.loc[[], ["uint64", "int64"]]
    pdt.assert_frame_equal(actual_df, expected_df)
Example #22
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {
                "files": {
                    "core_data": "file.parquest"
                }
            },
            "part_2": {
                "files": {
                    "core_data": "file2.parquest"
                }
            },
        },
        "indices": {
            "product_id":
            "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
def test_schema_dataframe_roundtrip(index, df_all_types):
    df = pd.DataFrame(df_all_types, index=index)

    schema = make_meta(df, origin="1")
    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    validate_compatible([schema, make_meta(actual_df, origin="2")])
Example #24
    def time_make_meta(self):
        make_meta(self.df, origin="df")
def test_reorder(df_all_types):
    df2 = df_all_types.copy()
    df2 = df2.reindex(reversed(df_all_types.columns), axis=1)
    expected = make_meta(df_all_types, origin="df_all_types")
    actual = make_meta(df2, origin="df2")
    assert expected == actual
def test_unicode_col():
    df = pd.DataFrame({"fö": [1]})
    make_meta(df, origin="df")
Example #27
    def setup(self, num_schemas):
        self.df = get_dataframe_alltypes()
        schema = make_meta(self.df, origin="df")
        self.schemas = [deepcopy(schema) for _ in range(num_schemas)]
Example #28
def test_dynamic_partitions_with_garbage(store):
    """
    In case there are unknown files, dataset and indices still load correctly
    """
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition_suffix = "suffix"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1"), ("product", "P-0")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition0_core
            }
        },
        "location=L-1/product=P-0/{}".format(partition_suffix): {
            "files": {
                "core": partition1_core
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/product=P-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/product=P-0/{}".format(partition_suffix)],
        },
        "product": {
            "P-0": [
                "location=L-0/product=P-0/{}".format(partition_suffix),
                "location=L-1/product=P-0/{}".format(partition_suffix),
            ]
        },
    }

    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store_schema_metadata(
        make_meta(pd.DataFrame({
            "location": ["L-0"],
            "product": ["P-0"]
        }),
                  origin="1"),
        dataset_uuid,
        store,
        "core",
    )

    # the following files are garbage and should not interfere with the indices and/or partitions
    for suffix in ["", ".json", ".msgpack", ".my_own_file_format"]:
        store.put("this_should_not_exist{}".format(suffix), b"ignore me")
        store.put("{}/this_should_not_exist{}".format(dataset_uuid, suffix),
                  b"ignore me")
        store.put(
            "{}/{}/this_should_not_exist{}".format(dataset_uuid, "core",
                                                   suffix),
            b"ignore me",
        )
        store.put(
            "{}/{}/location=L-0/this_should_not_exist{}".format(
                dataset_uuid, "core", suffix),
            b"ignore me",
        )

    dmd = DatasetMetadata.load_from_dict(metadata, store)
    dmd = dmd.load_partition_indices()
    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    # Sorting may differ in the index list. This is fine at runtime
    # but makes the test flaky, so sort before comparing.
    sorted_result = {
        column: {label: sorted(x)
                 for label, x in index.items()}
        for column, index in dmd_dict["indices"].items()
    }
    assert sorted_result == expected_indices
Example #29
def test_dynamic_partitions(store):
    """
    Do not specify partitions in metadata, but read them dynamically from store
    """
    partition_suffix = "suffix"
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    partition0_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_core = create_partition_key(
        dataset_uuid,
        "core",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    partition0_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-0")],
        "{}.parquet".format(partition_suffix),
    )
    partition1_ext = create_partition_key(
        dataset_uuid,
        "extension",
        [("location", "L-1")],
        "{}.parquet".format(partition_suffix),
    )
    metadata = {"dataset_metadata_version": 4, "dataset_uuid": dataset_uuid}
    expected_partitions = {
        "location=L-0/{}".format(partition_suffix): {
            "files": {
                "core": partition0_core,
                "extension": partition0_ext
            }
        },
        "location=L-1/{}".format(partition_suffix): {
            "files": {
                "core": partition1_core,
                "extension": partition1_ext
            }
        },
    }
    expected_indices = {
        "location": {
            "L-0": ["location=L-0/{}".format(partition_suffix)],
            "L-1": ["location=L-1/{}".format(partition_suffix)],
        }
    }

    # put two partitions for two tables each to store
    store.put(
        "{}{}.json".format(dataset_uuid, naming.METADATA_BASE_SUFFIX),
        simplejson.dumps(metadata).encode("utf-8"),
    )
    store.put(partition0_core, b"test")
    store.put(partition1_core, b"test")
    store.put(partition0_ext, b"test")
    store.put(partition1_ext, b"test")
    store_schema_metadata(
        make_meta(
            pd.DataFrame({"location": ["L-0/{}".format(partition_suffix)]}),
            origin="stored",
        ),
        dataset_uuid,
        store,
        "core",
    )

    # instantiate metadata to write table metadata
    core_schema = make_meta(
        pd.DataFrame({
            "column_0": pd.Series([1], dtype=int),
            "column_1": pd.Series([1], dtype=int),
            "location": pd.Series(["str"]),
        }),
        origin="core",
    )
    extension_schema = make_meta(
        pd.DataFrame({
            "column_77": pd.Series([1], dtype=int),
            "column_78": pd.Series([1], dtype=int),
            "location": pd.Series(["str"]),
        }),
        origin="extension",
    )
    store_schema_metadata(core_schema, dataset_uuid, store, "core")
    store_schema_metadata(extension_schema, dataset_uuid, store, "extension")
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    # reload metadata to use table metadata
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    dmd = dmd.load_partition_indices()

    dmd_dict = dmd.to_dict()
    assert dmd_dict["partitions"] == expected_partitions
    assert dmd_dict["indices"] == expected_indices
Example #30
    def __init__(
        self,
        label: Optional[str],
        file: Optional[str] = None,
        table_name: str = SINGLE_TABLE,
        data: Optional[pd.DataFrame] = None,
        indices: Optional[Dict[Any, Any]] = None,
        metadata_version: Optional[int] = None,
        schema: Optional[SchemaWrapper] = None,
        partition_keys: Optional[Sequence[str]] = None,
        logical_conjunction: Optional[List[Tuple[Any, str, Any]]] = None,
    ):
        """
        Initialize the :mod:`kartothek.io` base class MetaPartition.

        The `MetaPartition` is used as a wrapper around the kartothek
        `Partition` and primarily deals with dataframe manipulations and
        with input to and output from the store.

        The :class:`kartothek.io_components.metapartition` is immutable, i.e. all
        member functions return a new MetaPartition object with the changed
        attribute instead of mutating the instance.

        Parameters
        ----------
        label
            Partition label.
        file
            Reference (store key) to the file that backs this partition.
        table_name
            Name of the table this partition belongs to.
        data
            The materialized in-memory DataFrame corresponding to `file`.
        indices
            Kartothek index dictionary mapping column names to index objects;
            plain ``{value: [labels]}`` dicts are converted to
            ``ExplicitSecondaryIndex`` instances.
        metadata_version
            The dataset metadata version; defaults to
            ``naming.DEFAULT_METADATA_VERSION`` if not given.
        schema
            The table schema of the dataset.
        partition_keys
            The dataset partition keys.
        logical_conjunction
            A logical conjunction to assign to the MetaPartition. By assigning
            this, the MetaPartition will only be able to load data respecting
            this conjunction.
        """

        if metadata_version is None:
            self.metadata_version = naming.DEFAULT_METADATA_VERSION
        else:
            self.metadata_version = metadata_version
        verify_metadata_version(self.metadata_version)
        self.schema = schema
        self.table_name = table_name
        if data is not None and schema is None:
            self.schema = make_meta(data,
                                    origin=f"{table_name}/{label}",
                                    partition_keys=partition_keys)

        indices = indices or {}
        for column, index_dct in indices.items():
            if isinstance(index_dct, dict):
                indices[column] = ExplicitSecondaryIndex(column=column,
                                                         index_dct=index_dct)
        self.logical_conjunction = logical_conjunction
        self.metapartitions = [{
            "label": label,
            "data": data,
            "file": file or None,
            "indices": indices,
            "logical_conjunction": logical_conjunction,
        }]
        self.partition_keys = partition_keys or []
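For orientation, a minimal construction sketch mirroring generate_mp and test_reconstruct_index_duplicates from this listing; the label and store key below are hypothetical placeholders, not values used by kartothek itself:

# Sketch: derive the schema with make_meta and wrap it in a MetaPartition.
df = pd.DataFrame({"index_col": [1], "column": ["a"]})
schema = make_meta(df, origin="sketch", partition_keys="index_col")
mp = MetaPartition(
    label="index_col=1/part_0",                    # hypothetical label
    file="uuid/table/index_col=1/part_0.parquet",  # hypothetical store key
    metadata_version=4,
    schema=schema,
    partition_keys=["index_col"],
)
assert mp.schema == schema
assert mp.partition_keys == ["index_col"]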