Example #1
def test_index_raises_nested_dtype():
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
        )
    assert str(exc.value) == "Indices w/ nested types are not supported"
Example #2
def test_serialization(key):
    """Check index remains consistent after serializing and de-serializing"""
    index = ExplicitSecondaryIndex(
        column="col", index_dct={key: ["part_2", "part_4", "part_1"]}
    )
    index2 = pickle.loads(pickle.dumps(index))

    assert index == index2
Example #3
def test_index_normalize_during_init():
    index = ExplicitSecondaryIndex(
        column="col",
        dtype=pa.int8(),
        index_dct={"1": ["a", "b"], 1: ["a", "c"], 2.0: ["d"]},
    )
    expected = {1: ["a", "b", "c"], 2: ["d"]}
    assert index.index_dct == expected
Example #4
def test_update_dataset_with_partitions_no_index_input_info(
        store, metadata_version, bound_update_dataset):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})
            },
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    # The input information doesn't explicitly provide index information
    # Since the dataset has an index, it must be updated either way
    part3 = {
        "label": "cluster_3",
        "data": [("core", pd.DataFrame({"p": [3]}))]
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=lambda: store,
        dataset_uuid=dataset.uuid,
        delete_scope=[{
            "p": 1
        }],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_all_indices(store)
    assert 3 in dataset_updated.indices["p"].to_dict()
Example #5
def test_index_uint():
    index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            14671423800646041619: ["part_1", "part_2"],
            np.iinfo(np.uint64).max: ["part_1"],
        },
    )
    assert index.dtype == "uint64"
Example #6
def test_index_as_flat_series_highly_degenerated_sym():
    dim = 4
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            k: ["part_{}".format(i) for i in range(0, dim)]
            for k in range(0, dim)
        },
        dtype=pa.int64(),
    )
    ser = index1.as_flat_series()
    expected = pd.Series(
        ["part_{}".format(i) for i in range(0, dim)] * dim,
        index=pd.Index(np.array([[i] * dim for i in range(0, dim)]).ravel(),
                       name="col"),
        name="partition",
    )
    assert_series_equal(ser, expected)
Example #7
def test_index_ts_inference(store):
    index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            pd.Timestamp("2017-01-01"): ["part_1", "part_2"],
            pd.Timestamp("2017-01-02"): ["part_3"],
        },
    )
    assert index.dtype == pa.timestamp("ns")
Example #8
class Index(AsvBenchmarkConfig):
    params = (
        [10 * 1, 10**3],  # values
        [10 * 1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(0, number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(column=self.column_name,
                                                index_dct=index_dct,
                                                dtype=arrow_type)
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(
            self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key)

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)

    def time_load_index(self, number_values, number_partitions, arrow_type):
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, arrow_type):
        self.ktk_index.query(number_values / 2)

    def time_as_series(self, number_values, number_partitions, arrow_type):
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(self, number_values,
                                           number_partitions, arrow_type):
        self.ktk_index.as_flat_series(partitions_as_index=True)
Example #9
def test_add_column_to_existing_index(
    store_factory, metadata_version, bound_build_dataset_indices
):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2], "x": [100, 4500]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={1: ["cluster_1"], 2: ["cluster_1"]}
                )
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [4, 3], "x": [500, 10]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={4: ["cluster_2"], 3: ["cluster_2"]}
                )
            },
        },
    ]

    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    for column_name in ["p", "x"]:
        assert all([mp.indices[column_name] for mp in mps])

    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
Example #10
def test_build_indices():
    columns = ["location", "product"]
    df = pd.DataFrame(
        OrderedDict(
            [("location", ["Loc1", "Loc2"]), ("product", ["Product1", "Product2"])]
        )
    )
    mp = MetaPartition(label="partition_label", data=df)
    result_mp = mp.build_indices(columns)
    result = result_mp.indices
    loc_index = ExplicitSecondaryIndex(
        "location", {"Loc1": ["partition_label"], "Loc2": ["partition_label"]}
    )
    prod_index = ExplicitSecondaryIndex(
        "product", {"Product1": ["partition_label"], "Product2": ["partition_label"]}
    )

    assert result["location"] == loc_index
    assert result["product"] == prod_index
Example #11
def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # backwards compat
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key)
        elif isinstance(index, PartitionIndex):
            continue
        output_filenames[column] = index.store(store=store,
                                               dataset_uuid=dataset_uuid)
    return output_filenames
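
A minimal usage sketch for persist_indices, assuming a writable store created via storefact as in the benchmark example above and that _instantiate_store accepts a concrete store object as well as a factory; the dataset UUID, column name and index contents are illustrative:

import tempfile

import storefact

tmp_dir = tempfile.mkdtemp()
store = storefact.get_store_from_url("hfs://{}".format(tmp_dir))
indices = {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})}
# Returns a mapping from column name to the storage key of the persisted index file.
output_keys = persist_indices(store, "dataset_uuid", indices)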
Example #12
    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(0, number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(column=self.column_name,
                                                index_dct=index_dct,
                                                dtype=arrow_type)
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(
            self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key)

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)
Example #13
def test_update_dataset_with_partitions__reducer_nonexistent(
        store_factory, metadata_version, frozen_time_em, bound_update_dataset,
        store):

    part3 = {
        "label": "cluster_3",
        "data": [("core", pd.DataFrame({"p": [3]}))],
        "indices": {
            "p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})
        },
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{
            "p": 1
        }],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    ind_updated = dataset_updated.indices["p"]
    cluster_3_label = ind_updated.eval_operator(op="==", value=3).pop()

    expected_metadata = {"extra": "metadata"}

    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO

    assert dataset_updated.metadata == expected_metadata
    assert list(dataset_updated.partitions) == [cluster_3_label]

    updated_part_c3 = dataset_updated.partitions[cluster_3_label]

    assert updated_part_c3.label == cluster_3_label
    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file, 1 index file and 1 partition file
    # note: the update writes a new index file but due to frozen_time this gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 3

    # common metadata for v4 datasets (1 table)
    expected_number_files += 1
    assert len(store_files) == expected_number_files
    exp_updated_idx = {3: [cluster_3_label]}
    assert dataset_updated.indices["p"].index_dct == exp_updated_idx

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset
Example #14
def test_merge_indices():
    indices = [
        MetaPartition(
            label="label1",
            indices={"location": {
                "Loc1": ["label1"],
                "Loc2": ["label1"]
            }},
        ),
        MetaPartition(
            label="label2",
            indices={
                "location": {
                    "Loc3": ["label2"],
                    "Loc2": ["label2"]
                },
                "product": {
                    "Product1": ["label2"],
                    "Product2": ["label2"]
                },
            },
        ),
    ]
    result = MetaPartition.merge_indices(indices)
    expected = {
        "location":
        ExplicitSecondaryIndex(
            "location",
            {
                "Loc1": ["label1"],
                "Loc2": ["label1", "label2"],
                "Loc3": ["label2"]
            },
        ),
        "product":
        ExplicitSecondaryIndex("product", {
            "Product1": ["label2"],
            "Product2": ["label2"]
        }),
    }
    assert result == expected
Example #15
def test_index_as_flat_series():
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 2: ["part_1"]},
        dtype=pa.int64(),
    )
    ser = index1.as_flat_series()
    expected = pd.Series(
        ["part_1", "part_2", "part_1"],
        index=pd.Index([1, 1, 2], name="col"),
        name="partition",
    )
    assert_series_equal(ser, expected)

    ser_comp = index1.as_flat_series(compact=True)
    expected = pd.Series(
        [["part_1", "part_2"], ["part_1"]],
        index=pd.Index([1, 2], name="col"),
        name="partition",
    )
    assert_series_equal(ser_comp, expected)
Example #16
def test_index_store_roundtrip_implicit_key(store, col):
    index1 = ExplicitSecondaryIndex(
        column=col, index_dct={1: ["part_1", "part_2"], 3: ["part_3"]}, dtype=pa.int64()
    )
    key1 = index1.store(store, "dataset_uuid")
    index1.index_storage_key = key1

    index2 = ExplicitSecondaryIndex(column=col, index_storage_key=key1).load(store)
    assert index1 == index2
    key2 = index2.store(store, "dataset_uuid")

    index3 = ExplicitSecondaryIndex(column=col, index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3
Example #17
def test_index_as_flat_series_highly_degenerated_asym():
    """
    Ensure that the generation of the series is not limited by the number of columns or by NaNs in the matrix
    """
    dim = 4
    ind_dct = {
        k: ["part_{}".format(i) for i in range(0, dim)]
        for k in range(0, dim)
    }
    ind_dct[0] = ["part_1"]
    ind_dct[2] = ["part_2", "part_5"]
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct=ind_dct,
                                    dtype=pa.int64())
    ser = index1.as_flat_series()
    partition = [
        "part_1",
        "part_0",
        "part_1",
        "part_2",
        "part_3",
        "part_2",
        "part_5",
        "part_0",
        "part_1",
        "part_2",
        "part_3",
    ]
    index_values = [0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3]
    expected = pd.Series(partition,
                         index=pd.Index(index_values, name="col", dtype=int),
                         name="partition")
    assert_series_equal(ser, expected)

    ser_inv = index1.as_flat_series(partitions_as_index=True)
    expected_inv = pd.Series(index_values,
                             index=pd.Index(partition, name="partition"),
                             name="col")
    assert_series_equal(ser_inv, expected_inv)
Example #18
def test_commit_dataset_only_delete(store, metadata_version):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {
                "p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})
            },
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)
    assert len(dataset.partitions) == 2

    delete_scope = [{"p": 1}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=None,
        delete_scope=delete_scope,
        partition_on=None,
    )
    assert len(updated_dataset.partitions) == 1
    assert list(updated_dataset.partitions.keys()) == ["cluster_2"]
    assert updated_dataset.explicit_partitions is True
Example #19
def test_dataset_get_indices_as_dataframe_duplicates():
    ds = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code": ExplicitSecondaryIndex(
                "l_external_code", {"1": ["part1", "part2"], "2": ["part1", "part2"]}
            ),
            "p_external_code": ExplicitSecondaryIndex(
                "p_external_code", {"1": ["part1"], "2": ["part2"]}
            ),
        },
    )
    expected = pd.DataFrame(
        OrderedDict(
            [
                ("p_external_code", ["1", "1", "2", "2"]),
                ("l_external_code", ["1", "2", "1", "2"]),
            ]
        ),
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )
    result = ds.get_indices_as_dataframe()
    pdt.assert_frame_equal(result, expected)
Example #20
def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }

    builder = DatasetMetadataBuilder("uuid",
                                     metadata_version=metadata_version,
                                     partition_keys=["L", "P"])
    part_2 = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", part_2)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex("col1", {
            "a": ["run_id=1/L=1/P=1/part_1"],
            "b": ["run_id=2/L=1/P=1/part_1"]
        }),
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected
Example #21
def test_raises_on_new_index_creation(backend_identifier, store_factory,
                                      bound_update_dataset,
                                      define_indices_on_partition):
    # This test can be removed once the variable index input is removed in
    # favour of the test `test_update_secondary_indices_subset`
    if backend_identifier == "dask.dataframe" and define_indices_on_partition:
        # Constructs a dataframe which ignores index information passed as dict
        pytest.skip()

    dataset_uuid = "dataset_uuid"
    index_column = "p"
    partitions = [{
        "label": "cluster_1",
        "data": [("core", pd.DataFrame({index_column: [1, 2]}))]
    }]

    new_partition = {
        "label": "cluster_2",
        "data": [("core", pd.DataFrame({index_column: [2, 3]}))],
    }

    dataset_update_secondary_indices = [index_column]
    if define_indices_on_partition:
        dataset_update_secondary_indices = None
        new_partition["indices"] = {
            index_column:
            ExplicitSecondaryIndex(
                index_column,
                {
                    k: [new_partition["label"]]
                    for k in new_partition["data"][0][1]
                    [index_column].unique()
                },
            )
        }

    # Create dataset without secondary indices
    store_dataframes_as_dataset(dfs=partitions,
                                store=store_factory,
                                dataset_uuid=dataset_uuid)

    with pytest.raises(Exception,
                       match="Incorrect indices provided for dataset"):
        bound_update_dataset(
            [new_partition],
            store=store_factory,
            dataset_uuid=dataset_uuid,
            secondary_indices=dataset_update_secondary_indices,
        )
Example #22
def test_index_update(inplace):
    original_index = ExplicitSecondaryIndex(column="col",
                                            index_dct={
                                                1: ["part_1", "part_2"],
                                                3: ["part_3"]
                                            })

    new_index = ExplicitSecondaryIndex(column="col",
                                       index_dct={
                                           1: ["part_4"],
                                           4: ["part_4"]
                                       })

    updated_index = original_index.update(new_index, inplace=inplace)

    expected_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            1: ["part_2", "part_4", "part_1"],
            3: ["part_3"],
            4: ["part_4"]
        },
    )
    assert updated_index == expected_index
Example #23
def test_index_as_flat_series_date():
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            datetime.date(2017, 1, 2): ["part_1", "part_2"],
            datetime.date(2018, 2, 3): ["part_1"],
        },
        dtype=pa.date32(),
    )
    ser = index1.as_flat_series()
    ser = ser.sort_index()
    expected = pd.Series(
        ["part_1", "part_2", "part_1"],
        index=pd.Index(
            [
                datetime.date(2017, 1, 2),
                datetime.date(2017, 1, 2),
                datetime.date(2018, 2, 3),
            ],
            name="col",
        ),
        name="partition",
    )
    assert_series_equal(ser, expected)
Example #24
def test_index_store_roundtrip_explicit_key(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2
    key2 = index2.store(store, "dataset_uuid")

    index3 = ExplicitSecondaryIndex(column="col", index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3
Example #25
    def build_indices(self, columns: Iterable[str]):
        """
        This builds the indices for this metapartition for the given columns. The indices for the passed columns
        are rebuilt, so existing index entries in the metapartition are overwritten.

        :param columns: A list of columns from which the indices over all dataframes in the metapartition
            are overwritten
        :return: self
        """
        if self.label is None:
            return self

        new_indices = {}
        for col in columns:

            possible_values: Set[str] = set()

            df = self.data
            if not self.is_sentinel and col not in df:
                raise RuntimeError(
                    "Column `{corrupt_col}` could not be found in the partition `{partition_label}`. Please check for any typos and validate your dataset."
                    .format(corrupt_col=col, partition_label=self.label))

            possible_values = possible_values | set(df[col].dropna().unique())

            if self.schema is not None:
                dtype = self.schema.field(col).type
            else:
                dtype = None

            new_index = ExplicitSecondaryIndex(
                column=col,
                index_dct={value: [self.label]
                           for value in possible_values},
                dtype=dtype,
            )
            if (col in self.indices) and self.indices[col].loaded:
                new_indices[col] = self.indices[col].update(new_index)
            else:
                new_indices[col] = new_index

        return self.copy(indices=new_indices)
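
A minimal sketch of the rebuild/overwrite behaviour described in the docstring, assuming pandas and the MetaPartition/ExplicitSecondaryIndex classes used in the test examples above; the label, column and values are illustrative:

import pandas as pd

mp = MetaPartition(label="label1", data=pd.DataFrame({"p": [1, 2]}))
mp = mp.build_indices(["p"])  # rebuilds the "p" index, overwriting any loaded entries
# Each observed value of "p" now points to this partition's label,
# roughly {1: ["label1"], 2: ["label1"]}.
print(mp.indices["p"].index_dct)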
Example #26
def test_index_normalize_remove_values(inplace):
    original_index = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={1: ["a", "b", "c"], 2: ["d"]}
    )

    new_index1 = original_index.copy().remove_values([1, 3], inplace=inplace)
    expected_index1 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index1 == expected_index1

    new_index2 = original_index.copy().remove_values([1.0, 3.0], inplace=inplace)
    expected_index2 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index2 == expected_index2

    new_index3 = original_index.copy().remove_values(["1", "3"], inplace=inplace)
    expected_index3 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index3 == expected_index3
Example #27
    def add_external_index(self, column, filename=None):
        """
        Add a reference to an external index.

        Parameters
        ----------
        column: str
            Name of the indexed column
        filename: str, optional
            Storage key for the external index; if not given, a default key is
            derived from the dataset uuid, the column name and
            naming.EXTERNAL_INDEX_SUFFIX.

        Returns
        -------
        storage_key: str
            The location where the external index should be stored.
        """
        if filename is None:
            filename = "{uuid}.{column_name}".format(uuid=self.uuid,
                                                     column_name=column)
            filename += naming.EXTERNAL_INDEX_SUFFIX
        self.indices[column] = ExplicitSecondaryIndex(
            column, index_storage_key=filename)
        return filename
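
A small sketch of how add_external_index is typically used together with DatasetMetadataBuilder (cf. test_builder_full above); the UUID and metadata version are illustrative:

builder = DatasetMetadataBuilder("uuid", metadata_version=4)
storage_key = builder.add_external_index("col2")
# storage_key is where the serialized index is expected to be stored,
# i.e. "uuid.col2" plus naming.EXTERNAL_INDEX_SUFFIX
# ("uuid.col2.by-dataset-index.parquet" in test_builder_full above).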
Example #28
def test_store_dataframes_as_dataset_does_not_allow_invalid_indices(
        store_factory):
    partitions = [{
        "label": "part1",
        "data": [("core", pd.DataFrame({"p": [1, 2]}))],
        "indices": {
            "x": ExplicitSecondaryIndex("x", {
                1: ["part1"],
                2: ["part2"]
            })
        },
    }]

    with pytest.raises(
            ValueError,
            match="In table core, no column corresponding to index x"):
        store_dataframes_as_dataset(
            dfs=partitions,
            store=store_factory,
            metadata={"dataset": "metadata"},
            dataset_uuid="dataset_uuid",
        )
Example #29
def test_index_normalize_during_init_warn_collision(collision, caplog):
    index_dct = {1: ["a", "c"], 2.0: ["d"]}
    if collision:
        index_dct["1"] = ["a", "b"]

    caplog.set_level(logging.DEBUG)
    ExplicitSecondaryIndex(column="col", dtype=pa.int8(), index_dct=index_dct)

    warn = [
        t[2] for t in caplog.record_tuples
        if t[0] == "kartothek.core.index" and t[1] == logging.WARN
    ]

    if collision:
        assert any(
            msg.startswith(
                "Value normalization for index column col resulted in 1 collision(s)."
            ) for msg in warn)
    else:
        assert not any(
            msg.startswith("Value normalization for index column")
            for msg in warn)
Example #30
def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "part_2": {
                "files": {
                    "core": "uuid/core/part_2.parquet"
                }
            }
        },
        "metadata": {
            "key": "value",
            "creation_time": TIME_TO_FREEZE_ISO
        },
        "indices": {
            "col1": {
                "a": ["part1"],
                "b": ["part2"]
            }
        },
    }

    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {
            "a": ["part1"],
            "b": ["part2"]
        }))

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict