Exemple #1
0
def test_align_datasets_exact(dataset, evaluation_dataset, store_session):
    """Exercise ``align_datasets`` with ``match_how="exact"``.

    Aligning ``dataset`` with ``evaluation_dataset`` is expected to raise a
    ``RuntimeError`` when the result is consumed. Aligning ``dataset`` with
    itself is expected to yield two groups, each holding two metapartitions
    that carry the same label.
    """
    exact_kwargs = {"store": store_session, "match_how": "exact"}

    # The call itself stays inside the context manager so that an error
    # raised either eagerly or during iteration is covered.
    with pytest.raises(RuntimeError):
        list(
            align_datasets(
                left_dataset_uuid=dataset.uuid,
                right_dataset_uuid=evaluation_dataset.uuid,
                **exact_kwargs,
            )
        )

    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=dataset.uuid,
        **exact_kwargs,
    )
    assert isinstance(aligned, types.GeneratorType)
    groups = list(aligned)

    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2

    for position, cluster in enumerate(("cluster_1", "cluster_2")):
        group = groups[position]
        assert len(group) == 2, [mp.label for mp in group]
        assert [mp.label for mp in group] == [cluster, cluster]
Exemple #2
0
def test_align_datasets_prefix(dataset, evaluation_dataset, store_session):
    """Exercise ``align_datasets`` with ``match_how="prefix"``.

    With ``dataset`` on the left and ``evaluation_dataset`` on the right, two
    groups of three metapartitions each are expected. The alignment is then
    repeated with the two datasets swapped.
    """
    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=evaluation_dataset.uuid,
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(aligned, types.GeneratorType)
    groups = list(aligned)

    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2

    for group in groups:
        assert len(group) == 3, [mp.label for mp in group]

    # Test sorting of datasets by length, i.e. order of dataframes is different
    swapped_groups = list(
        align_datasets(
            left_dataset_uuid=evaluation_dataset.uuid,
            right_dataset_uuid=dataset.uuid,
            store=store_session,
            match_how="prefix",
        )
    )
    # No explicit assertion follows; the indexing only fails if the swapped
    # alignment produced no group at all.
    first_swapped_group = swapped_groups[0]
Exemple #3
0
def test_align_datasets_prefix__equal_number_of_partitions(
        dataset, evaluation_dataset, store_session):
    """
    Regression scenario for prefix matching: the simple prefix match
    algorithm found no matches when both datasets had the same number of
    partitions.
    """

    # Build a reference dataset that reproduces the problem: as many
    # partitions as ``dataset`` (per the scenario above) and labels suitable
    # for prefix matching.
    reference_partitions = [
        MetaPartition(label=label, metadata_version=dataset.metadata_version)
        for label in ("cluster_1_1", "cluster_2_1")
    ]
    store_dataset_from_partitions(
        partition_list=reference_partitions,
        dataset_uuid="reference_dataset_uuid",
        store=store_session,
    )

    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid="reference_dataset_uuid",
        store=store_session,
        match_how="prefix",
    )
    assert isinstance(aligned, types.GeneratorType)
    groups = list(aligned)

    # Two separate cluster_groups (e.g. cluster_1*)
    assert len(groups) == 2

    for group in groups:
        assert len(group) == 2

    # Test sorting of datasets by length, i.e. order of dataframes is different
    # NOTE(review): this second alignment uses ``evaluation_dataset`` rather
    # than the reference dataset created above -- confirm that is intended.
    swapped_groups = list(
        align_datasets(
            left_dataset_uuid=evaluation_dataset.uuid,
            right_dataset_uuid=dataset.uuid,
            store=store_session,
            match_how="prefix",
        )
    )
    # No explicit assertion follows; the indexing only fails if the swapped
    # alignment produced no group at all.
    first_swapped_group = swapped_groups[0]
Exemple #4
0
def test_align_datasets_right(dataset, evaluation_dataset, store_session):
    """Exercise ``align_datasets`` with ``match_how="right"``.

    One group is expected per partition of ``evaluation_dataset``. Each of the
    first four groups must hold three metapartitions: one labelled like an
    evaluation partition, followed by ``cluster_1`` and ``cluster_2``.
    """
    aligned = align_datasets(
        left_dataset_uuid=dataset.uuid,
        right_dataset_uuid=evaluation_dataset.uuid,
        store=store_session,
        match_how="right",
    )
    assert isinstance(aligned, types.GeneratorType)
    groups = list(aligned)

    assert len(groups) == len(evaluation_dataset.partitions)

    # Leading label of each group, in the order the groups must be produced.
    leading_labels = ("cluster_1_1", "cluster_1_2", "cluster_2_1", "cluster_2_2")
    for position, leading_label in enumerate(leading_labels):
        group = groups[position]
        assert len(group) == 3, [mp.label for mp in group]
        expected = [leading_label, "cluster_1", "cluster_2"]
        assert [mp.label for mp in group] == expected
Exemple #5
0
def merge_datasets_as_delayed(
    left_dataset_uuid,
    right_dataset_uuid,
    store,
    merge_tasks,
    match_how="exact",
    label_merger=None,
    metadata_merger=None,
):
    """
    Build a dask.delayed graph that merges two full kartothek datasets.

    The partitions of both datasets are first aligned with ``align_datasets``
    according to ``match_how``; ``_load_and_merge_mps`` is then mapped over
    the aligned partition groups via ``map_delayed``.

    Parameters
    ----------
    left_dataset_uuid : str
        UUID for left dataset (order does not matter in all merge schemas)
    right_dataset_uuid : str
        UUID for right dataset (order does not matter in all merge schemas)
    store
        Checked with ``_check_callable`` and then passed on to
        ``align_datasets`` and ``_load_and_merge_mps``. Presumably a callable
        that returns the store -- TODO confirm against ``_check_callable``.
    merge_tasks : List[Dict]
        A list of merge tasks. Each item in this list is a dictionary giving
        explicit instructions for a specific merge.
        Each dict should contain key/values:

        * `left`: The table for the left dataframe
        * `right`: The table for the right dataframe
        * `output_label`: The table for the merged dataframe
        * `merge_func`: A callable with signature
                        `merge_func(left_df, right_df, merge_kwargs)` to
                        handle the data preprocessing and merging.
                        Default pandas.merge
        * `merge_kwargs`: The kwargs to be passed to the `merge_func`

        Example:

        .. code::

            >>> merge_tasks = [
            ...     {
            ...         "left": "left_dict",
            ...         "right": "right_dict",
            ...         "merge_kwargs": {"kwargs of merge_func": ''},
            ...         "output_label": 'merged_core_data'
            ...     },
            ... ]

    match_how : Union[str, Callable]
        Define the partition label matching scheme.
        Available implementations are:

        * left (right) : The left (right) partitions are considered to be
                            the base partitions and **all** partitions of the
                            right (left) dataset are joined to the left
                            partition. This should only be used if one of the
                            datasets contain very few partitions.
        * prefix : The labels of the partitions of the dataset with fewer
                    partitions are considered to be the prefixes to the
                    right dataset
        * exact : All partition labels of the left dataset need to have
                    an exact match in the right dataset
        * callable : A callable with signature func(left, right) which
                        returns a boolean to determine if the partitions match
    label_merger
        Forwarded unchanged to ``_load_and_merge_mps``; its semantics are
        defined there.
    metadata_merger
        Forwarded unchanged to ``_load_and_merge_mps``; its semantics are
        defined there.

    Returns
    -------
    The value returned by ``map_delayed`` for the aligned partition groups
    (presumably a collection of dask.delayed objects -- confirm against
    ``map_delayed``).
    """
    _check_callable(store)

    aligned_partitions = align_datasets(
        left_dataset_uuid=left_dataset_uuid,
        right_dataset_uuid=right_dataset_uuid,
        store=store,
        match_how=match_how,
    )

    return map_delayed(
        aligned_partitions,
        _load_and_merge_mps,
        store=store,
        label_merger=label_merger,
        metadata_merger=metadata_merger,
        merge_tasks=merge_tasks,
    )