Example #1
def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store,
                                             uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If the read succeeds, the schema is written
    read_schema_metadata(dataset_uuid=new_dataset.uuid,
                         store=store,
                         table="table")
Example #2
    def validate_schema_compatible(self, store: StoreInput,
                                   dataset_uuid: str) -> "MetaPartition":
        """
        Validates that the currently held DataFrames match the schema of the existing dataset.

        Parameters
        ----------
        store
            If it is a function, the result of calling it must be a KeyValueStore.
        dataset_uuid
            The dataset UUID the partition will be assigned to
        """

        # Load the reference meta of the existing dataset. The built-in
        # `load_all_table_meta` would not help here: it would be a no-op,
        # since we have already loaded the meta from the input DataFrame.
        store = ensure_store(store)
        reference_meta = read_schema_metadata(dataset_uuid=dataset_uuid,
                                              store=store,
                                              table=self.table_name)
        try:
            validate_compatible([self.schema, reference_meta])
        except ValueError as e:
            raise ValueError(
                f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
            )
        return self
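
A note on Example #2: the method above is essentially a thin wrapper around the module-level helpers. The following is a minimal sketch of the same check, assuming an in-memory simplekv DictStore in place of the store argument and purely illustrative names ("dataset_uuid", "table"): the reference schema is persisted with store_schema_metadata, read back with read_schema_metadata, and validated together with a new partition's schema via validate_compatible.

import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.common_metadata import (
    make_meta,
    read_schema_metadata,
    store_schema_metadata,
    validate_compatible,
)

# Illustrative in-memory store standing in for the `store` argument above.
store = DictStore()

# Persist a reference schema for the (hypothetical) existing dataset.
reference_meta = make_meta(pd.DataFrame({"col": [1]}), origin="reference")
store_schema_metadata(
    schema=reference_meta, dataset_uuid="dataset_uuid", store=store, table="table"
)

# Schema of a new partition: read the stored reference and validate both together.
new_schema = make_meta(pd.DataFrame({"col": [2]}), origin="new")
stored_meta = read_schema_metadata(
    dataset_uuid="dataset_uuid", store=store, table="table"
)
validate_compatible([new_schema, stored_meta])  # raises ValueError if incompatible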
Example #3
def persist_common_metadata(
    schemas: Iterable[SchemaWrapper],
    update_dataset: Optional[DatasetFactory],
    store: KeyValueStore,
    dataset_uuid: str,
    table_name: str,
):

    if not schemas:
        return None
    schemas_set = set(schemas)
    del schemas

    if update_dataset:
        schemas_set.add(
            read_schema_metadata(dataset_uuid=dataset_uuid,
                                 store=store,
                                 table=table_name))

    schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin))

    try:
        result = validate_compatible(schemas_sorted)
    except ValueError as e:
        raise ValueError(
            f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
        )
    if result:
        store_schema_metadata(schema=result,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table_name)
    return result
Example #4
def test_schema_roundtrip(df_all_types, store):
    expected_meta = make_meta(df_all_types, origin="df_all_types")
    store_schema_metadata(
        expected_meta, dataset_uuid="dataset_uuid", store=store, table="table"
    )
    result = read_schema_metadata(
        dataset_uuid="dataset_uuid", store=store, table="table"
    )
    assert result == expected_meta
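
The roundtrip above relies on a store fixture. Here is a hedged, self-contained variant, assuming an in-memory simplekv DictStore (any key-value store with put/get should behave the same); it also shows that one key per table is written, following the dataset_uuid/table/_common_metadata layout that Examples #7 and #8 rely on.

import pandas as pd
from simplekv.memory import DictStore
from kartothek.core.common_metadata import (
    make_meta,
    read_schema_metadata,
    store_schema_metadata,
)

store = DictStore()
expected_meta = make_meta(pd.DataFrame({"x": [1], "y": ["a"]}), origin="roundtrip")

store_schema_metadata(
    expected_meta, dataset_uuid="dataset_uuid", store=store, table="table"
)
# A single key per table is written, e.g. "dataset_uuid/table/_common_metadata"
# (cf. _get_common_metadata_key in Example #7 and the key parsing in Example #8).
print(sorted(store.keys()))

result = read_schema_metadata(dataset_uuid="dataset_uuid", store=store, table="table")
assert result == expected_meta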
Example #5
    def load_schema(self, store: StoreInput,
                    dataset_uuid: str) -> "MetaPartition":
        """
        Loads the table schema into memory and caches it under the `schema`
        attribute, reading from the store only if it has not been loaded yet.
        """

        if self.schema is None:
            store = ensure_store(store)
            self.schema = read_schema_metadata(dataset_uuid=dataset_uuid,
                                               store=store,
                                               table=self.table_name)
        return self
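
The object cached by load_schema under the schema attribute is kartothek's SchemaWrapper, the same type produced by make_meta and returned by read_schema_metadata (cf. the type hints in Example #3). A small sketch of one property the other examples rely on, the origin set used as a sort key in Examples #3 and #6 (names are illustrative):

import pandas as pd
from kartothek.core.common_metadata import make_meta

schema = make_meta(pd.DataFrame({"col": [1]}), origin="part-0")
# Origins are tracked as a set of strings, which is why Examples #3 and #6
# can sort schemas with `key=lambda s: sorted(s.origin)`.
assert sorted(schema.origin) == ["part-0"]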
Example #6
def persist_common_metadata(partition_list, update_dataset, store,
                            dataset_uuid):
    # Collect the schemas per table in a set; hashing them gives a quick
    # equality check with possible false negatives (e.g. other pandas versions
    # or null schemas).
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(
                tm_dct.keys()):
            raise ValueError((
                "Input partitions for update have different tables than dataset:\n"
                "Input partition tables: {}\n"
                "Tables of existing dataset: {}").format(
                    set(tm_dct.keys()), update_dataset.tables))
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(dataset_uuid=dataset_uuid,
                                     store=store,
                                     table=table))

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}"
                .format(table=table, dataset_uuid=dataset_uuid, e=e))

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(schema=schema,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table)
    return result
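
Besides per-table compatibility, Example #6 also calls validate_shared_columns on the merged result, which guards columns that appear in more than one table's schema. Below is a hedged sketch of that check in isolation, with illustrative two-table schemas sharing a column "part" of identical dtype; per its use at the end of Example #6, a mismatching dtype for a shared column would raise a ValueError.

import pandas as pd
from kartothek.core.common_metadata import make_meta, validate_shared_columns

# Two table schemas sharing the column "part" with identical integer dtype;
# validate_shared_columns (cf. the end of Example #6) checks such shared
# columns for consistency across tables.
core = make_meta(pd.DataFrame({"part": [1], "value": [1.0]}), origin="core")
aux = make_meta(pd.DataFrame({"part": [1], "label": ["a"]}), origin="aux")
validate_shared_columns([core, aux])  # passes; a type mismatch would raise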
Example #7
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF, since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (alway null)
                and "8" not in c  # 8 bit types are casted to 64 bit
                and "16" not in c  # 16 bit types are casted to 64 bit
                and "32" not in c  # 32 bit types are casted to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
Example #8
    def load_from_dict(dct: Dict,
                       store: StoreInput,
                       load_schema: bool = True) -> "DatasetMetadata":
        """
        Load dataset metadata from a dictionary and resolve any external includes.

        Parameters
        ----------
        dct
            Dictionary containing the dataset metadata.
        store
            Object that implements the .get method for file/object loading.
        load_schema
            If True, load the table schemas from the store.

        """
        # Use copy here to get an OrderedDict
        metadata = copy.copy(dct)

        if "metadata" not in metadata:
            metadata["metadata"] = OrderedDict()

        metadata_version = dct[naming.METADATA_VERSION_KEY]
        dataset_uuid = dct[naming.UUID_KEY]
        explicit_partitions = "partitions" in metadata
        storage_keys = None
        if not explicit_partitions:
            storage_keys = DatasetMetadata.storage_keys(dataset_uuid, store)
            partitions = _load_partitions_from_filenames(
                store=store,
                storage_keys=storage_keys,
                metadata_version=metadata_version,
            )
            metadata["partitions"] = partitions

        if metadata["partitions"]:
            tables = [
                tab
                for tab in list(metadata["partitions"].values())[0]["files"]
            ]
        else:
            table_set = set()
            if storage_keys is None:
                storage_keys = DatasetMetadata.storage_keys(
                    dataset_uuid, store)
            for key in storage_keys:
                if key.endswith(naming.TABLE_METADATA_FILE):
                    table_set.add(key.split("/")[1])
            tables = list(table_set)

        table_meta = {}
        if load_schema:
            for table in tables:
                table_meta[table] = read_schema_metadata(
                    dataset_uuid=dataset_uuid, store=store, table=table)

        metadata["table_meta"] = table_meta

        if "partition_keys" not in metadata:
            metadata["partition_keys"] = _get_partition_keys_from_partitions(
                metadata["partitions"])

        return DatasetMetadata.from_dict(
            metadata, explicit_partitions=explicit_partitions)
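
A closing note on Example #8: when the partition list does not name the tables, load_from_dict falls back to scanning the storage keys for schema files and takes the path segment after the dataset UUID as the table name. A small, store-free sketch of that derivation, with purely illustrative keys following the uuid/table/... layout from Example #7:

# Illustrative storage keys; the second path segment is the table name and
# schema files end with naming.TABLE_METADATA_FILE, assumed here to be
# "_common_metadata".
storage_keys = [
    "dataset_uuid/core/_common_metadata",
    "dataset_uuid/aux/_common_metadata",
    "dataset_uuid/core/part_0.parquet",
]
tables = sorted(
    {key.split("/")[1] for key in storage_keys if key.endswith("_common_metadata")}
)
assert tables == ["aux", "core"]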