def test_index_raises_nested_dtype():
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
        )
    assert str(exc.value) == "Indices w/ nested types are not supported"

def test_serialization(key):
    """Check that the index remains consistent after serializing and de-serializing."""
    index = ExplicitSecondaryIndex(
        column="col", index_dct={key: ["part_2", "part_4", "part_1"]}
    )
    index2 = pickle.loads(pickle.dumps(index))
    assert index == index2

def test_index_normalize_during_init():
    index = ExplicitSecondaryIndex(
        column="col",
        dtype=pa.int8(),
        index_dct={"1": ["a", "b"], 1: ["a", "c"], 2.0: ["d"]},
    )
    expected = {1: ["a", "b", "c"], 2: ["d"]}
    assert index.index_dct == expected

def test_update_dataset_with_partitions_no_index_input_info(
    store, metadata_version, bound_update_dataset
):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )

    # The input information doesn't explicitly provide index information.
    # Since the dataset has an index, it must be updated either way.
    part3 = {"label": "cluster_3", "data": [("core", pd.DataFrame({"p": [3]}))]}
    dataset_updated = bound_update_dataset(
        [part3],
        store=lambda: store,
        dataset_uuid=dataset.uuid,
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_all_indices(store)
    assert 3 in dataset_updated.indices["p"].to_dict()

def test_index_uint():
    index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            14671423800646041619: ["part_1", "part_2"],
            np.iinfo(np.uint64).max: ["part_1"],
        },
    )
    assert index.dtype == "uint64"

def test_index_as_flat_series_highly_degenerated_sym():
    dim = 4
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            k: ["part_{}".format(i) for i in range(0, dim)] for k in range(0, dim)
        },
        dtype=pa.int64(),
    )
    ser = index1.as_flat_series()
    expected = pd.Series(
        ["part_{}".format(i) for i in range(0, dim)] * dim,
        index=pd.Index(
            np.array([[i] * dim for i in range(0, dim)]).ravel(), name="col"
        ),
        name="partition",
    )
    assert_series_equal(ser, expected)

def test_index_ts_inference(store):
    index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            pd.Timestamp("2017-01-01"): ["part_1", "part_2"],
            pd.Timestamp("2017-01-02"): ["part_3"],
        },
    )
    assert index.dtype == pa.timestamp("ns")

class Index(AsvBenchmarkConfig):
    params = (
        [10 * 1, 10**3],  # values
        [10 * 1, 10**3],  # partitions
        [(int, pa.int64()), (str, pa.string())],  # types
    )
    param_names = ["number_values", "number_partitions", "dtype"]

    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(0, number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(
            column=self.column_name, index_dct=index_dct, dtype=arrow_type
        )
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)
        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key
        )
        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)

    def time_load_index(self, number_values, number_partitions, dtype):
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, dtype):
        self.ktk_index.query(number_values / 2)

    def time_as_series(self, number_values, number_partitions, dtype):
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(
        self, number_values, number_partitions, dtype
    ):
        self.ktk_index.as_flat_series(partitions_as_index=True)

def test_add_column_to_existing_index(
    store_factory, metadata_version, bound_build_dataset_indices
):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2], "x": [100, 4500]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={1: ["cluster_1"], 2: ["cluster_1"]}
                )
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [4, 3], "x": [500, 10]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={4: ["cluster_2"], 3: ["cluster_2"]}
                )
            },
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(
        store=store_factory, dataset_uuid=dataset_uuid
    )
    for column_name in ["p", "x"]:
        assert all(mp.indices[column_name] for mp in mps)
    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}

def test_build_indices():
    columns = ["location", "product"]
    df = pd.DataFrame(
        OrderedDict(
            [("location", ["Loc1", "Loc2"]), ("product", ["Product1", "Product2"])]
        )
    )
    mp = MetaPartition(label="partition_label", data=df)
    result_mp = mp.build_indices(columns)
    result = result_mp.indices

    loc_index = ExplicitSecondaryIndex(
        "location", {"Loc1": ["partition_label"], "Loc2": ["partition_label"]}
    )
    prod_index = ExplicitSecondaryIndex(
        "product", {"Product1": ["partition_label"], "Product2": ["partition_label"]}
    )

    assert result["location"] == loc_index
    assert result["product"] == prod_index

def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # Backwards compatibility: wrap plain dicts into an
        # ExplicitSecondaryIndex using the legacy storage key.
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key,
            )
        elif isinstance(index, PartitionIndex):
            # Partition indices are implicit in the partition labels and are
            # not persisted as separate files.
            continue
        output_filenames[column] = index.store(store=store, dataset_uuid=dataset_uuid)
    return output_filenames

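# Illustrative sketch (an assumption, not library code): exercising the
# backwards-compat branch above by passing a plain dict, which persist_indices
# wraps into an ExplicitSecondaryIndex under the legacy storage key. The
# dataset uuid and column names here are hypothetical.
def _example_persist_indices_legacy_dict(store):
    keys = persist_indices(
        store=store,
        dataset_uuid="dataset_uuid",
        indices={"col": {1: ["part_1"], 2: ["part_2"]}},  # plain dict, legacy form
    )
    # Since the wrapped index carries an explicit storage key, the returned
    # name should be the legacy one:
    #     "dataset_uuid.col" + naming.EXTERNAL_INDEX_SUFFIX
    return keys
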
def test_update_dataset_with_partitions__reducer_nonexistent(
    store_factory, metadata_version, frozen_time_em, bound_update_dataset, store
):
    part3 = {
        "label": "cluster_3",
        "data": [("core", pd.DataFrame({"p": [3]}))],
        "indices": {"p": ExplicitSecondaryIndex("p", index_dct={3: ["cluster_3"]})},
    }
    dataset_updated = bound_update_dataset(
        [part3],
        store=store_factory,
        dataset_uuid="dataset_uuid",
        delete_scope=[{"p": 1}],
        metadata={"extra": "metadata"},
        default_metadata_version=metadata_version,
        secondary_indices=["p"],
    )
    dataset_updated = dataset_updated.load_index("p", store)
    ind_updated = dataset_updated.indices["p"]
    cluster_3_label = ind_updated.eval_operator(op="==", value=3).pop()

    expected_metadata = {"extra": "metadata"}
    expected_metadata["creation_time"] = TIME_TO_FREEZE_ISO
    assert dataset_updated.metadata == expected_metadata
    assert list(dataset_updated.partitions) == [cluster_3_label]

    updated_part_c3 = dataset_updated.partitions[cluster_3_label]
    assert updated_part_c3.label == cluster_3_label
    assert dataset_updated.uuid == "dataset_uuid"

    store_files = list(store.keys())
    # 1 dataset metadata file, 1 index file, and 1 partition file.
    # Note: the update writes a new index file, but due to frozen_time it gets
    # the same name as the previous one and overwrites it.
    expected_number_files = 3
    # Common metadata for v4 datasets (1 table).
    expected_number_files += 1
    assert len(store_files) == expected_number_files

    exp_updated_idx = {3: [cluster_3_label]}
    assert dataset_updated.indices["p"].index_dct == exp_updated_idx

    # Ensure the dataset can be loaded properly
    stored_dataset = DatasetMetadata.load_from_store("dataset_uuid", store)
    stored_dataset = stored_dataset.load_index("p", store)
    assert dataset_updated == stored_dataset

def test_merge_indices():
    indices = [
        MetaPartition(
            label="label1",
            indices={"location": {"Loc1": ["label1"], "Loc2": ["label1"]}},
        ),
        MetaPartition(
            label="label2",
            indices={
                "location": {"Loc3": ["label2"], "Loc2": ["label2"]},
                "product": {"Product1": ["label2"], "Product2": ["label2"]},
            },
        ),
    ]
    result = MetaPartition.merge_indices(indices)
    expected = {
        "location": ExplicitSecondaryIndex(
            "location",
            {"Loc1": ["label1"], "Loc2": ["label1", "label2"], "Loc3": ["label2"]},
        ),
        "product": ExplicitSecondaryIndex(
            "product", {"Product1": ["label2"], "Product2": ["label2"]}
        ),
    }
    assert result == expected

def test_index_as_flat_series():
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 2: ["part_1"]},
        dtype=pa.int64(),
    )
    ser = index1.as_flat_series()
    expected = pd.Series(
        ["part_1", "part_2", "part_1"],
        index=pd.Index([1, 1, 2], name="col"),
        name="partition",
    )
    assert_series_equal(ser, expected)

    ser_comp = index1.as_flat_series(compact=True)
    expected = pd.Series(
        [["part_1", "part_2"], ["part_1"]],
        index=pd.Index([1, 2], name="col"),
        name="partition",
    )
    assert_series_equal(ser_comp, expected)

def test_index_store_roundtrip_implicit_key(store, col):
    index1 = ExplicitSecondaryIndex(
        column=col, index_dct={1: ["part_1", "part_2"], 3: ["part_3"]}, dtype=pa.int64()
    )
    key1 = index1.store(store, "dataset_uuid")
    index1.index_storage_key = key1

    index2 = ExplicitSecondaryIndex(column=col, index_storage_key=key1).load(store)
    assert index1 == index2

    key2 = index2.store(store, "dataset_uuid")
    index3 = ExplicitSecondaryIndex(column=col, index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3

def test_index_as_flat_series_highly_degenerated_asym():
    """
    Ensure that the generation of the series is not limited by the number of
    columns or by NaNs in the matrix.
    """
    dim = 4
    ind_dct = {k: ["part_{}".format(i) for i in range(0, dim)] for k in range(0, dim)}
    ind_dct[0] = ["part_1"]
    ind_dct[2] = ["part_2", "part_5"]
    index1 = ExplicitSecondaryIndex(column="col", index_dct=ind_dct, dtype=pa.int64())
    ser = index1.as_flat_series()
    partition = [
        "part_1",
        "part_0",
        "part_1",
        "part_2",
        "part_3",
        "part_2",
        "part_5",
        "part_0",
        "part_1",
        "part_2",
        "part_3",
    ]
    index_values = [0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3]
    expected = pd.Series(
        partition, index=pd.Index(index_values, name="col", dtype=int), name="partition"
    )
    assert_series_equal(ser, expected)

    ser_inv = index1.as_flat_series(partitions_as_index=True)
    expected_inv = pd.Series(
        index_values, index=pd.Index(partition, name="partition"), name="col"
    )
    assert_series_equal(ser_inv, expected_inv)

def test_commit_dataset_only_delete(store, metadata_version):
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={1: ["cluster_1"]})},
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [2]}))],
            "indices": {"p": ExplicitSecondaryIndex("p", index_dct={2: ["cluster_2"]})},
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=lambda: store,
        metadata={"dataset": "metadata"},
        dataset_uuid="dataset_uuid",
        metadata_version=metadata_version,
    )
    dataset = dataset.load_index("p", store)
    assert len(dataset.partitions) == 2

    delete_scope = [{"p": 1}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=None,
        delete_scope=delete_scope,
        partition_on=None,
    )
    assert len(updated_dataset.partitions) == 1
    assert list(updated_dataset.partitions.keys()) == ["cluster_2"]
    assert updated_dataset.explicit_partitions is True

def test_dataset_get_indices_as_dataframe_duplicates():
    ds = DatasetMetadata(
        "some_uuid",
        indices={
            "l_external_code": ExplicitSecondaryIndex(
                "l_external_code", {"1": ["part1", "part2"], "2": ["part1", "part2"]}
            ),
            "p_external_code": ExplicitSecondaryIndex(
                "p_external_code", {"1": ["part1"], "2": ["part2"]}
            ),
        },
    )
    expected = pd.DataFrame(
        OrderedDict(
            [
                ("p_external_code", ["1", "1", "2", "2"]),
                ("l_external_code", ["1", "2", "1", "2"]),
            ]
        ),
        index=pd.Index(["part1", "part1", "part2", "part2"], name="partition"),
    )
    result = ds.get_indices_as_dataframe()
    pdt.assert_frame_equal(result, expected)

def test_builder_full(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {
            "run_id=1/L=1/P=1/part_1": {
                "files": {
                    "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
                    "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
                }
            }
        },
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {
            "col1": {
                "a": ["run_id=1/L=1/P=1/part_1"],
                "b": ["run_id=2/L=1/P=1/part_1"],
            },
            "col2": "uuid.col2.by-dataset-index.parquet",
        },
        "partition_keys": ["L", "P"],
    }
    builder = DatasetMetadataBuilder(
        "uuid", metadata_version=metadata_version, partition_keys=["L", "P"]
    )
    part_2 = Partition(
        label="run_id=1/L=1/P=1/part_1",
        files={
            "core": "uuid/core/run_id=1/L=1/P=1/part_1.parquet",
            "helper": "uuid/helper/run_id=1/L=1/P=1/part_1.parquet",
        },
    )
    builder.add_partition("run_id=1/L=1/P=1/part_1", part_2)
    builder.add_metadata("key", "value")
    builder.add_external_index("col2")
    builder.add_embedded_index(
        "col1",
        ExplicitSecondaryIndex(
            "col1",
            {"a": ["run_id=1/L=1/P=1/part_1"], "b": ["run_id=2/L=1/P=1/part_1"]},
        ),
    )
    key, result = builder.to_json()
    result = simplejson.loads(result)
    assert key == "uuid.by-dataset-metadata.json"
    assert result == expected

def test_raises_on_new_index_creation(
    backend_identifier, store_factory, bound_update_dataset, define_indices_on_partition
):
    # This test can be removed once the variable index input is removed in
    # favour of the test `test_update_secondary_indices_subset`
    if backend_identifier == "dask.dataframe" and define_indices_on_partition:
        # Constructs a dataframe which ignores index information passed as dict
        pytest.skip()

    dataset_uuid = "dataset_uuid"
    index_column = "p"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({index_column: [1, 2]}))]}
    ]
    new_partition = {
        "label": "cluster_2",
        "data": [("core", pd.DataFrame({index_column: [2, 3]}))],
    }
    dataset_update_secondary_indices = [index_column]
    if define_indices_on_partition:
        dataset_update_secondary_indices = None
        new_partition["indices"] = {
            index_column: ExplicitSecondaryIndex(
                index_column,
                {
                    k: [new_partition["label"]]
                    for k in new_partition["data"][0][1][index_column].unique()
                },
            )
        }

    # Create dataset without secondary indices
    store_dataframes_as_dataset(
        dfs=partitions, store=store_factory, dataset_uuid=dataset_uuid
    )

    with pytest.raises(Exception, match="Incorrect indices provided for dataset"):
        bound_update_dataset(
            [new_partition],
            store=store_factory,
            dataset_uuid=dataset_uuid,
            secondary_indices=dataset_update_secondary_indices,
        )

def test_index_update(inplace):
    original_index = ExplicitSecondaryIndex(
        column="col", index_dct={1: ["part_1", "part_2"], 3: ["part_3"]}
    )
    new_index = ExplicitSecondaryIndex(
        column="col", index_dct={1: ["part_4"], 4: ["part_4"]}
    )
    updated_index = original_index.update(new_index, inplace=inplace)
    expected_index = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_2", "part_4", "part_1"], 3: ["part_3"], 4: ["part_4"]},
    )
    assert updated_index == expected_index

def test_index_as_flat_series_date():
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            datetime.date(2017, 1, 2): ["part_1", "part_2"],
            datetime.date(2018, 2, 3): ["part_1"],
        },
        dtype=pa.date32(),
    )
    ser = index1.as_flat_series()
    ser = ser.sort_index()
    expected = pd.Series(
        ["part_1", "part_2", "part_1"],
        index=pd.Index(
            [
                datetime.date(2017, 1, 2),
                datetime.date(2017, 1, 2),
                datetime.date(2018, 2, 3),
            ],
            name="col",
        ),
        name="partition",
    )
    assert_series_equal(ser, expected)

def test_index_store_roundtrip_explicit_key(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

    key2 = index2.store(store, "dataset_uuid")
    index3 = ExplicitSecondaryIndex(column="col", index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3

def build_indices(self, columns: Iterable[str]):
    """
    Build the indices of this metapartition for the given columns.

    The indices for the passed columns are rebuilt, so existing index
    entries in the metapartition are overwritten.

    :param columns: A list of columns from which the indices over all
        dataframes in the metapartition are overwritten
    :return: self
    """
    if self.label is None:
        return self

    new_indices = {}
    for col in columns:
        possible_values: Set[str] = set()
        df = self.data
        if not self.is_sentinel and col not in df:
            raise RuntimeError(
                "Column `{corrupt_col}` could not be found in the partition "
                "`{partition_label}`. Please check for any typos and validate "
                "your dataset.".format(corrupt_col=col, partition_label=self.label)
            )

        possible_values = possible_values | set(df[col].dropna().unique())

        if self.schema is not None:
            dtype = self.schema.field(col).type
        else:
            dtype = None

        new_index = ExplicitSecondaryIndex(
            column=col,
            index_dct={value: [self.label] for value in possible_values},
            dtype=dtype,
        )
        if (col in self.indices) and self.indices[col].loaded:
            new_indices[col] = self.indices[col].update(new_index)
        else:
            new_indices[col] = new_index

    return self.copy(indices=new_indices)

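# Illustrative sketch (an assumption, not library code): build_indices derives
# one ExplicitSecondaryIndex entry per distinct column value, each pointing at
# this partition's label, mirroring test_build_indices above. The dataframe,
# column, and label below are hypothetical.
def _example_build_indices():
    df = pd.DataFrame({"city": ["Berlin", "Hamburg", "Berlin"]})
    mp = MetaPartition(label="part_0", data=df)
    mp = mp.build_indices(["city"])
    # Expected: mp.indices["city"].index_dct ==
    #     {"Berlin": ["part_0"], "Hamburg": ["part_0"]}
    return mp.indices["city"]
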
def test_index_normalize_remove_values(inplace):
    original_index = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={1: ["a", "b", "c"], 2: ["d"]}
    )

    new_index1 = original_index.copy().remove_values([1, 3], inplace=inplace)
    expected_index1 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index1 == expected_index1

    new_index2 = original_index.copy().remove_values([1.0, 3.0], inplace=inplace)
    expected_index2 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index2 == expected_index2

    new_index3 = original_index.copy().remove_values(["1", "3"], inplace=inplace)
    expected_index3 = ExplicitSecondaryIndex(
        column="col", dtype=pa.int64(), index_dct={2: ["d"]}
    )
    assert new_index3 == expected_index3

def add_external_index(self, column, filename=None):
    """
    Add a reference to an external index.

    Parameters
    ----------
    column: str
        Name of the indexed column
    filename: str, optional
        Storage key to record for the index. If not given, a name of the
        form ``{uuid}.{column}`` plus the external index suffix is generated.

    Returns
    -------
    storage_key: str
        The location where the external index should be stored.
    """
    if filename is None:
        filename = "{uuid}.{column_name}".format(uuid=self.uuid, column_name=column)
        filename += naming.EXTERNAL_INDEX_SUFFIX
    self.indices[column] = ExplicitSecondaryIndex(column, index_storage_key=filename)
    return filename

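# Illustrative sketch (an assumption, not library code): add_external_index
# only records where the index will live; the index data itself must be
# written separately (e.g. via persist_indices above). The builder uuid is
# hypothetical; the key shape matches the "uuid.col2.by-dataset-index.parquet"
# value asserted in test_builder_full above.
def _example_add_external_index():
    builder = DatasetMetadataBuilder("uuid", metadata_version=4)
    storage_key = builder.add_external_index("col2")
    # Expected: storage_key == "uuid.col2" + naming.EXTERNAL_INDEX_SUFFIX
    return storage_key
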
def test_store_dataframes_as_dataset_does_not_allow_invalid_indices(store_factory):
    partitions = [
        {
            "label": "part1",
            "data": [("core", pd.DataFrame({"p": [1, 2]}))],
            "indices": {"x": ExplicitSecondaryIndex("x", {1: ["part1"], 2: ["part2"]})},
        }
    ]
    with pytest.raises(
        ValueError, match="In table core, no column corresponding to index x"
    ):
        store_dataframes_as_dataset(
            dfs=partitions,
            store=store_factory,
            metadata={"dataset": "metadata"},
            dataset_uuid="dataset_uuid",
        )

def test_index_normalize_during_init_warn_collision(collision, caplog):
    index_dct = {1: ["a", "c"], 2.0: ["d"]}
    if collision:
        index_dct["1"] = ["a", "b"]

    caplog.set_level(logging.DEBUG)
    ExplicitSecondaryIndex(column="col", dtype=pa.int8(), index_dct=index_dct)

    warn = [
        t[2]
        for t in caplog.record_tuples
        if t[0] == "kartothek.core.index" and t[1] == logging.WARN
    ]
    if collision:
        assert any(
            msg.startswith(
                "Value normalization for index column col resulted in 1 collision(s)."
            )
            for msg in warn
        )
    else:
        assert not any(
            msg.startswith("Value normalization for index column") for msg in warn
        )

def test_builder_to_dataset(metadata_version, frozen_time):
    expected = {
        "dataset_uuid": "uuid",
        "dataset_metadata_version": metadata_version,
        "partitions": {"part_2": {"files": {"core": "uuid/core/part_2.parquet"}}},
        "metadata": {"key": "value", "creation_time": TIME_TO_FREEZE_ISO},
        "indices": {"col1": {"a": ["part1"], "b": ["part2"]}},
    }
    builder = DatasetMetadataBuilder("uuid", metadata_version=metadata_version)
    part_2 = Partition("part_2", {"core": "uuid/core/part_2.parquet"})
    builder.add_partition("part_2", part_2)
    builder.add_metadata("key", "value")
    builder.add_embedded_index(
        "col1", ExplicitSecondaryIndex("col1", {"a": ["part1"], "b": ["part2"]})
    )

    result = builder.to_dataset()
    expected_from_dict = DatasetMetadata.from_dict(expected)
    assert result == expected_from_dict