Example #1
def test_validate_shared_columns_fail(df_all_types, remove_metadata):
    df2 = df_all_types.copy()
    df2["uint16"] = df2["uint16"].astype(float)
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df2, origin="2")
    if remove_metadata:
        schema1 = schema1.remove_metadata()
        schema2 = schema2.remove_metadata()
    with pytest.raises(ValueError) as exc:
        validate_shared_columns([schema1, schema2])
    assert str(exc.value).startswith('Found incompatible entries for column "uint16"')
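Example #1 relies on the df_all_types fixture and the remove_metadata parametrization from kartothek's test suite, which are not shown here. A minimal self-contained sketch of the same failure mode, using a hypothetical one-column stand-in for the fixture, could look like this:

import pandas as pd
import pytest

from kartothek.core.common_metadata import make_meta, validate_shared_columns

# Hypothetical stand-in for the df_all_types fixture: a single shared column "uint16".
df_a = pd.DataFrame({"uint16": pd.Series([1, 2], dtype="uint16")})
df_b = df_a.copy()
df_b["uint16"] = df_b["uint16"].astype(float)  # same column name, different dtype

schema_a = make_meta(df_a, origin="a")
schema_b = make_meta(df_b, origin="b")

# Both schemas define "uint16" but with incompatible types, so validation fails.
with pytest.raises(ValueError):
    validate_shared_columns([schema_a, schema_b])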
Example #2
def test_validate_shared_columns_no_share(df_all_types):
    schema1 = make_meta(df_all_types.loc[:, df_all_types.columns[0:2]], origin="1")
    schema2 = make_meta(df_all_types.loc[:, df_all_types.columns[2:4]], origin="2")
    schema3 = make_meta(df_all_types.loc[:, df_all_types.columns[4:6]], origin="3")
    validate_shared_columns([])
    validate_shared_columns([schema1])
    validate_shared_columns([schema1, schema2])
    validate_shared_columns([schema1, schema2, schema3])
Example #3
def persist_common_metadata(partition_list, update_dataset, store,
                            dataset_uuid):
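    """
    Collect the table schemas of all input partitions (plus, for updates, the schemas
    already stored for the dataset), validate that they are compatible per table and
    across shared columns, persist them, and return the validated schema per table.
    """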
    # deduplicate the schemas per table via hashing for a quick equality check;
    # this can yield false negatives (e.g. different pandas versions or null schemas)
    tm_dct = defaultdict(set)
    for mp in partition_list:
        for tab, tm in mp.table_meta.items():
            tm_dct[tab].add(tm)

    if update_dataset:
        if set(tm_dct.keys()) and set(update_dataset.tables) != set(
                tm_dct.keys()):
            raise ValueError((
                "Input partitions for update have different tables than dataset:\n"
                "Input partition tables: {}\n"
                "Tables of existing dataset: {}").format(
                    set(tm_dct.keys()), update_dataset.tables))
        for table in update_dataset.tables:
            tm_dct[table].add(
                read_schema_metadata(dataset_uuid=dataset_uuid,
                                     store=store,
                                     table=table))

    result = {}

    # sort tables and schemas to have reproducible error messages
    for table in sorted(tm_dct.keys()):
        schemas = sorted(tm_dct[table], key=lambda s: sorted(s.origin))
        try:
            result[table] = validate_compatible(schemas)
        except ValueError as e:
            raise ValueError(
                "Schemas for table '{table}' of dataset '{dataset_uuid}' are not compatible!\n\n{e}"
                .format(table=table, dataset_uuid=dataset_uuid, e=e))

    validate_shared_columns(list(result.values()))

    for table, schema in result.items():
        store_schema_metadata(schema=schema,
                              dataset_uuid=dataset_uuid,
                              store=store,
                              table=table)
    return result
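Example #3 combines two checks: validate_compatible merges the schemas collected for each table (or raises), and validate_shared_columns then cross-checks columns that appear in several tables. A minimal sketch of the per-table step, assuming two hypothetical partitions that produced identical schemas, might look like this:

import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_compatible

# Two schemas for the same table, originating from two hypothetical partitions.
schema_p1 = make_meta(pd.DataFrame({"i": [1], "f": [1.0]}), origin="p1")
schema_p2 = make_meta(pd.DataFrame({"i": [2], "f": [2.0]}), origin="p2")

# Identical column types are compatible; a single reference schema is returned.
common_schema = validate_compatible([schema_p1, schema_p2])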
Example #4
def check_datasets(datasets: Dict[str, DatasetMetadata],
                   cube: Cube) -> Dict[str, DatasetMetadata]:
    """
    Apply sanity checks to persisted Kartothek datasets.

    The following checks will be applied:

    - seed dataset present
    - metadata version correct
    - only the cube-specific table is present
    - partition keys are correct
    - no overlapping payload columns exist
    - datatypes are consistent
    - dimension columns are present everywhere
    - required index structures are present (more are allowed)

      - ``PartitionIndex`` for every partition key
      - for seed dataset, ``ExplicitSecondaryIndex`` for every dimension column
      - for all datasets, ``ExplicitSecondaryIndex`` for every index column

    Parameters
    ----------
    datasets
        Datasets.
    cube
        Cube specification.

    Returns
    -------
    datasets: Dict[str, DatasetMetadata]
        Same as input, but with partition indices loaded.

    Raises
    ------
    ValueError
        If sanity check failed.
    """
    if cube.seed_dataset not in datasets:
        raise ValueError('Seed data ("{}") is missing.'.format(
            cube.seed_dataset))

    _check_datasets(
        datasets=datasets,
        f=lambda ds: ds.metadata_version,
        expected=KTK_CUBE_METADATA_VERSION,
        what="metadata version",
    )
    datasets = {
        name: ds.load_partition_indices()
        for name, ds in datasets.items()
    }
    _check_datasets(
        datasets=datasets,
        f=lambda ds: set(ds.table_meta.keys()),
        expected={SINGLE_TABLE},
        what="table",
    )
    _check_overlap(datasets, cube)

    # check column types
    validate_shared_columns(
        [ds.table_meta[SINGLE_TABLE] for ds in datasets.values()])

    _check_partition_columns(datasets, cube)
    _check_dimension_columns(datasets, cube)
    _check_indices(datasets, cube)

    return datasets
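The earliest check in check_datasets is that the seed dataset is present at all. A short sketch of that failure path follows; it assumes the check_datasets definition from Example #4 is in scope and that the Cube import path below is correct (both are assumptions):

import pytest

from kartothek.core.cube.cube import Cube

# Hypothetical cube spec, using the default seed dataset name.
cube = Cube(dimension_columns=["x"], partition_columns=["p"], uuid_prefix="my_cube")

# An empty dataset mapping fails the very first sanity check.
with pytest.raises(ValueError, match="is missing"):
    check_datasets({}, cube)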
Example #5
def test_validate_shared_columns_same(df_all_types):
    schema1 = make_meta(df_all_types, origin="1")
    schema2 = make_meta(df_all_types, origin="2")
    schema3 = make_meta(df_all_types, origin="3").remove_metadata()
    validate_shared_columns([])
    validate_shared_columns([schema1])
    validate_shared_columns([schema1, schema2])
    with pytest.raises(ValueError):
        validate_shared_columns([schema1, schema2, schema3])
    validate_shared_columns([schema1, schema2, schema3], ignore_pandas=True)
    validate_shared_columns(
        [schema1.remove_metadata(),
         schema2.remove_metadata(), schema3])
Example #6
def time_validate_shared_columns(self, num_schemas):
    validate_shared_columns(self.schemas)
Example #7
def peakmem_validate_shared_columns(self, num_schemas):
    validate_shared_columns(self.schemas)
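Examples #6 and #7 are methods of an asv benchmark class whose setup is not shown. A sketch of what the surrounding class might look like, with a hypothetical class name, parametrization, and setup body, follows:

import pandas as pd

from kartothek.core.common_metadata import make_meta, validate_shared_columns


class ValidateSharedColumnsSuite:
    # asv-style parametrization: how many identical schemas to validate
    params = [10, 100]
    param_names = ["num_schemas"]

    def setup(self, num_schemas):
        df = pd.DataFrame({"i": [1], "f": [1.0]})
        self.schemas = [make_meta(df, origin=str(n)) for n in range(num_schemas)]

    def time_validate_shared_columns(self, num_schemas):
        validate_shared_columns(self.schemas)

    def peakmem_validate_shared_columns(self, num_schemas):
        validate_shared_columns(self.schemas)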