Example 1
    def validate_schema_compatible(self, store: StoreInput,
                                   dataset_uuid: str) -> "MetaPartition":
        """
        Validates that the currently held DataFrames match the schema of the existing dataset.

        Parameters
        ----------
        store
            If it is a function, the result of calling it must be a KeyValueStore.
        dataset_uuid
            The dataset UUID the partition will be assigned to
        """

        # Load the reference meta of the existing dataset. The built-in
        # `load_all_table_meta` would not help here: it would be a no-op since
        # the meta has already been loaded from the input DataFrame.
        store = ensure_store(store)
        reference_meta = read_schema_metadata(dataset_uuid=dataset_uuid,
                                              store=store,
                                              table=self.table_name)
        try:
            validate_compatible([self.schema, reference_meta])
        except ValueError as e:
            raise ValueError(
                f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
            )
        return self
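
The ValueError raised above wraps the error produced by validate_compatible itself. Below is a minimal, self-contained sketch of that underlying check; the DataFrames and origin labels are invented for illustration, and the point is simply that two schemas whose "x" column carries different dtypes are rejected.

import pandas as pd
from kartothek.core.common_metadata import make_meta, validate_compatible

# Build schemas from two frames whose "x" columns disagree on dtype
schema_int = make_meta(pd.DataFrame({"x": [1, 2]}), origin="left")
schema_str = make_meta(pd.DataFrame({"x": ["a", "b"]}), origin="right")

try:
    validate_compatible([schema_int, schema_str])
except ValueError as exc:
    print(exc)  # message starts with "Schema violation"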
Example 2
def test_validate_compatible_other_pandas(df_all_types, remove_metadata,
                                          ignore_pandas):
    def _with_pandas(version):
        schema = make_meta(df_all_types, origin=version)
        metadata = schema.metadata
        pandas_metadata = simplejson.loads(metadata[b"pandas"].decode("utf8"))
        pandas_metadata["pandas_version"] = version
        metadata[b"pandas"] = simplejson.dumps(pandas_metadata).encode("utf8")
        schema = SchemaWrapper(pa.schema(schema, metadata), version)
        if remove_metadata:
            return schema.remove_metadata()
        else:
            return schema

    schema1 = make_meta(df_all_types, origin="all")
    schema2 = _with_pandas("0.19.0")
    schema3 = _with_pandas("0.99.0")
    if remove_metadata and not ignore_pandas:
        # This fails as long as schema1 still has its pandas metadata attached
        with pytest.raises(ValueError):
            validate_compatible([schema1, schema2, schema3],
                                ignore_pandas=ignore_pandas)
        schema1 = schema1.remove_metadata()
    validate_compatible([schema1, schema2, schema3],
                        ignore_pandas=ignore_pandas)
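
The pandas_version entry manipulated above lives in the Arrow schema metadata that make_meta attaches under the b"pandas" key. A short sketch of how to inspect it (the one-column DataFrame is a made-up placeholder):

import pandas as pd
import simplejson
from kartothek.core.common_metadata import make_meta

schema = make_meta(pd.DataFrame({"x": [1]}), origin="demo")
pandas_meta = simplejson.loads(schema.metadata[b"pandas"].decode("utf8"))
print(pandas_meta["pandas_version"])  # pandas version recorded when the schema was built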
Example 3
def test_validate_empty_dataframe_corrupt_raises(
    df_all_types,
    df_all_types_schema,
    df_all_types_empty_schema,
    corrupt_column,
    corrupt_value,
    corrupt_dtype,
):
    # If something is wrong with the schema, validation must raise!

    # The parametrization makes one column carry an incompatible value, e.g.
    # an integer column holding a float (-1.1, np.float64) or an object
    # ("a", np.object).
    df_corrupt = df_all_types.copy()
    df_corrupt[corrupt_column] = pd.Series([corrupt_value],
                                           dtype=corrupt_dtype)
    df_corrupt_meta = make_meta(df_corrupt, origin="1")
    # Raise when comparing the proper to the corrupt schema
    for schemas in permutations([df_all_types_schema, df_corrupt_meta]):
        with pytest.raises(ValueError):
            validate_compatible(schemas)
    # Also raise if a schema originating from an empty DF is in the mix, to
    # make sure the emptiness does not mask the incompatibility.
    for schemas in permutations(
        [df_all_types_schema, df_corrupt_meta, df_all_types_empty_schema]):
        with pytest.raises(ValueError):
            validate_compatible(schemas)
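
A concrete, fixture-free sketch of the corruption exercised here, assuming an int64 column that suddenly carries a float value (column name and values are invented):

import numpy as np
import pandas as pd
import pytest
from kartothek.core.common_metadata import make_meta, validate_compatible

proper = make_meta(pd.DataFrame({"i": pd.Series([1], dtype=np.int64)}), origin="proper")
corrupt = make_meta(pd.DataFrame({"i": pd.Series([-1.1], dtype=np.float64)}), origin="corrupt")

with pytest.raises(ValueError):
    validate_compatible([proper, corrupt])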
Example 4
def test_validate_compatible_different(df_all_types):
    df2 = df_all_types.loc[:, df_all_types.columns[:2]].copy()
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df2, origin="2")
    with pytest.raises(ValueError) as exc:
        validate_compatible([schema1, schema2])
    assert str(exc.value).startswith("Schema violation")
Example 5
def test_validate_empty_dataframe(df_all_types, df_all_types_schema,
                                  df_all_types_empty_schema):
    # Do not raise if one of the schemas stems from an empty dataframe.
    # Test all permutations to ensure the implementation is not sensitive to
    # whether the first or the second schema is the empty one.
    for schemas in permutations(
        [df_all_types_schema, df_all_types_empty_schema]):
        validate_compatible(schemas)
    validate_compatible([df_all_types_empty_schema, df_all_types_empty_schema])
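
The empty-dataframe case is interesting because an empty object column gives pyarrow no values from which to infer a concrete type, so such columns typically end up with null type in the schema; validate_compatible still accepts them against the fully typed schema. A minimal sketch with an invented frame:

import pandas as pd
from kartothek.core.common_metadata import make_meta, validate_compatible

df = pd.DataFrame({"i": [1, 2], "s": ["a", "b"]})
full_schema = make_meta(df, origin="full")
empty_schema = make_meta(df.iloc[0:0], origin="empty")  # "s" has no values left to type

validate_compatible([full_schema, empty_schema])  # does not raise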
Example 6
def test_validate_different_cats_different_type():
    input_df = pd.DataFrame(
        {"categories": pd.Series([u"a", u"b", u"c", u"a"], dtype="category")})
    input_df_2 = pd.DataFrame(
        {"categories": pd.Series([b"f", b"e", b"e", b"f"], dtype="category")})

    meta = make_meta(input_df, origin="1")
    meta_2 = make_meta(input_df_2, origin="2")
    with pytest.raises(ValueError):
        validate_compatible([meta, meta_2])
Example 7
def test_validate_different_cats_same_type():
    input_df = pd.DataFrame(
        {"categories": pd.Series(["a", "b", "c", "a"], dtype="category")})
    input_df_2 = pd.DataFrame(
        {"categories": pd.Series(["f", "e", "e", "f"], dtype="category")})
    input_df_3 = pd.DataFrame({"categories": pd.Series(["f", "e", "e", "f"])})

    meta = make_meta(input_df, origin="1")
    meta_2 = make_meta(input_df_2, origin="2")
    meta_3 = make_meta(input_df_3, origin="3")
    validate_compatible([meta, meta_2, meta_3])
Example 8
def test_validate_compatible_same(df_all_types):
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df_all_types, origin="2")
    schema3 = make_meta(df_all_types, origin="3")
    validate_compatible([])
    validate_compatible([schema1])
    validate_compatible([schema1, schema2])
    validate_compatible([schema1, schema2, schema3])
Example 9
def persist_common_metadata(
    schemas: Iterable[SchemaWrapper],
    update_dataset: Optional[DatasetFactory],
    store: KeyValueStore,
    dataset_uuid: str,
    table_name: str,
):

    if not schemas:
        return None
    schemas_set = set(schemas)
    del schemas

    if update_dataset:
        schemas_set.add(
            read_schema_metadata(dataset_uuid=dataset_uuid,
                                 store=store,
                                 table=table_name))

    schemas_sorted = sorted(schemas_set, key=lambda s: sorted(s.origin))

    try:
        result = validate_compatible(schemas_sorted)
    except ValueError as e:
        raise ValueError(
            f"Schemas for dataset '{dataset_uuid}' are not compatible!\n\n{e}"
        )
    if result:
        store_schema_metadata(schema=result,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table_name)
    return result
Example 10
def test_compat_old_rw_path(df_all_types, store):
    # strip down the DF since some column types weren't supported before anyway
    df = df_all_types[
        [
            c
            for c in df_all_types.columns
            if (
                not c.startswith("array_")  # array types (always null)
                and c != "unicode"  # unicode type (always null)
                and "8" not in c  # 8 bit types are cast to 64 bit
                and "16" not in c  # 16 bit types are cast to 64 bit
                and "32" not in c  # 32 bit types are cast to 64 bit
            )
        ]
    ]
    expected_meta = make_meta(df, origin="df")

    # old schema write path
    old_meta = dask_make_meta(df)
    pa_table = pa.Table.from_pandas(old_meta)
    buf = pa.BufferOutputStream()
    pq.write_table(pa_table, buf, version="2.0")
    key_old = _get_common_metadata_key("dataset_uuid_old", "table")
    store.put(key_old, buf.getvalue().to_pybytes())

    actual_meta = read_schema_metadata(
        dataset_uuid="dataset_uuid_old", store=store, table="table"
    )
    validate_compatible([actual_meta, expected_meta])

    store_schema_metadata(
        schema=make_meta(df, origin="df"),
        dataset_uuid="dataset_uuid_new",
        store=store,
        table="table",
    )
    key_new = _get_common_metadata_key("dataset_uuid_new", "table")
    actual_df = ParquetSerializer.restore_dataframe(key=key_new, store=store)
    actual_df["date"] = actual_df["date"].dt.date
    pdt.assert_frame_equal(actual_df, old_meta)
Example 11
    def add_metapartition(
        self,
        metapartition: "MetaPartition",
        schema_validation: bool = True,
    ):
        """
        Adds a metapartition to the internal list structure to enable batch processing.

        Parameters
        ----------
        metapartition
            The MetaPartition to be added.
        schema_validation
            If True (default), ensure that the `schema` of both `MetaPartition` objects is compatible
        """
        if self.is_sentinel:
            return metapartition

        existing_labels = [mp_["label"] for mp_ in self.metapartitions]

        if any(mp_["label"] in existing_labels
               for mp_ in metapartition.metapartitions):
            raise RuntimeError(
                "Duplicate labels for nested metapartitions are not allowed!")
        schema = metapartition.schema

        if schema_validation and schema:
            # This ensures that only schema-compatible metapartitions can be nested
            # The returned schema by validate_compatible is the reference schema with the most
            # information, i.e. the fewest null columns
            schema = validate_compatible([self.schema, metapartition.schema])

        new_object = MetaPartition(
            label="NestedMetaPartition",
            metadata_version=metapartition.metadata_version,
            schema=schema,
            partition_keys=metapartition.partition_keys or None,
            logical_conjunction=metapartition.logical_conjunction or None,
            table_name=metapartition.table_name,
        )

        # Add metapartition information to the new object
        new_metapartitions = self.metapartitions.copy()
        new_metapartitions.extend(metapartition.metapartitions.copy())
        new_object.metapartitions = new_metapartitions

        return new_object
Example 12
def persist_common_metadata(partition_list, update_dataset, store,
                            dataset_uuid):
    # hash the schemas for quick equality check with possible false negatives
    # (e.g. other pandas version or null schemas)
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(
                tm_dct.keys()):
            raise ValueError((
                "Input partitions for update have different tables than dataset:\n"
                "Input partition tables: {}\n"
                "Tables of existing dataset: {}").format(
                    set(tm_dct.keys()), update_dataset.tables))
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(dataset_uuid=dataset_uuid,
                                     store=store,
                                     table=table))

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}"
                .format(table=table, dataset_uuid=dataset_uuid, e=e))

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(schema=schema,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table)
    return result
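
Besides per-table compatibility, this multi-table variant also runs validate_shared_columns, which requires a column that appears in several tables to have the same type everywhere. A rough sketch of that check with invented table schemas (the exact error wording may differ):

import pandas as pd
import pytest
from kartothek.core.common_metadata import make_meta, validate_shared_columns

core = make_meta(pd.DataFrame({"id": [1], "payload": ["x"]}), origin="core")
aux = make_meta(pd.DataFrame({"id": ["1"], "extra": [0.5]}), origin="aux")

validate_shared_columns([core])  # a single schema is trivially consistent
with pytest.raises(ValueError):
    validate_shared_columns([core, aux])  # "id" is int64 in one table, string in the other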
Example 13
def test_validate_schema_non_overlapping_nulls(df_all_types_schema):
    """
    Test that two schemas with non-overlapping null columns are valid
    """
    first_ix = np.random.randint(len(df_all_types_schema))
    second_ix = first_ix
    while second_ix == first_ix:
        second_ix = np.random.randint(len(df_all_types_schema))

    first_null = pa.field(name=df_all_types_schema.names[first_ix], type=pa.null())
    first_schema = df_all_types_schema.set(first_ix, first_null)

    second_null = pa.field(name=df_all_types_schema.names[second_ix], type=pa.null())
    second_schema = df_all_types_schema.set(second_ix, second_null)

    for schemas in permutations([first_schema, second_schema]):
        reference_schema = validate_compatible(schemas)

        # The reference schema should be the original schema
        # with the columns reconstructed
        assert df_all_types_schema == reference_schema
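
As the comment in Example 11 puts it, validate_compatible returns the reference schema with the most information, i.e. the fewest null columns. A small sketch of that behavior with invented frames, where one frame only ever sees None for column "b":

import pandas as pd
import pyarrow as pa
from kartothek.core.common_metadata import make_meta, validate_compatible

full = make_meta(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}), origin="full")
sparse = make_meta(pd.DataFrame({"a": [1, 2], "b": [None, None]}), origin="sparse")

reference = validate_compatible([sparse, full])
# The all-None column is resolved to the concrete type from the other schema
assert reference.field("b").type == pa.string()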
Example 14
    def concat_metapartitions(metapartitions, label_merger=None):
        LOGGER.debug("Concatenating metapartitions")

        new_metadata_version = -1
        data = []
        schema = []
        for mp in metapartitions:
            new_metadata_version = max(new_metadata_version,
                                       mp.metadata_version)
            data.append(mp.data)
            schema.append(mp.schema)
            # It does not matter which MetaPartition the partition_keys are
            # taken from: if the MetaPartitions were not aligned, their
            # schemas would not match and the validation below would fail.
            partition_keys = mp.partition_keys

        categoricals = [
            col for col, ser in data[0].items()
            if pd.api.types.is_categorical_dtype(ser)
        ]
        if categoricals:
            data = align_categories(data, categoricals)
        new_df = pd.concat(data)

        new_schema = validate_compatible(schema)

        new_label = MetaPartition._merge_labels(metapartitions, label_merger)

        new_mp = MetaPartition(
            label=new_label,
            data=new_df,
            metadata_version=new_metadata_version,
            schema=new_schema,
            partition_keys=partition_keys,
        )

        return new_mp
Example 15
def test_schema_dataframe_roundtrip(index, df_all_types):
    df = pd.DataFrame(df_all_types, index=index)

    schema = make_meta(df, origin="1")
    actual_df = empty_dataframe_from_schema(schema, date_as_object=True)
    validate_compatible([schema, make_meta(actual_df, origin="2")])
Example 16
    def peakmem_validate_compatible(self, num_schemas, has_na):
        validate_compatible(self.schemas)
Example 17
    def time_validate_compatible(self, num_schemas, has_na):
        validate_compatible(self.schemas)