Example #1
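Note: the snippets below omit their imports. Judging by the identifiers they use, a header along the following lines is assumed (a reconstruction, not taken from the original files; helpers such as AsvBenchmarkConfig, generate_partition_values, and _instantiate_store are defined in the surrounding modules and are not shown):

import pickle
import shutil
import tempfile

import numpy as np
import pandas as pd
import pyarrow as pa
import simplejson
import storefact

from kartothek.core import naming
from kartothek.core.common_metadata import make_meta, store_schema_metadata
from kartothek.core.dataset import DatasetMetadata
from kartothek.core.index import ExplicitSecondaryIndex, PartitionIndex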
def test_index_store_roundtrip_implicit_key(store, col):
    index1 = ExplicitSecondaryIndex(
        column=col, index_dct={1: ["part_1", "part_2"], 3: ["part_3"]}, dtype=pa.int64()
    )
    key1 = index1.store(store, "dataset_uuid")
    index1.index_storage_key = key1

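    # Reloading from the returned storage key must reproduce an equal index.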
    index2 = ExplicitSecondaryIndex(column=col, index_storage_key=key1).load(store)
    assert index1 == index2
    key2 = index2.store(store, "dataset_uuid")

    index3 = ExplicitSecondaryIndex(column=col, index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3
Example #2
class IndexBase(AsvBenchmarkConfig):
    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
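        # generate_partition_values is presumably a helper defined elsewhere
        # in the benchmark module.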
        self.partition_values = generate_partition_values(number_partitions)
        index_dct = {
            py_type(val): list(
                np.random.choice(self.partition_values,
                                 number_partitions // 2))
            for val in range(number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(column=self.column_name,
                                                index_dct=index_dct,
                                                dtype=arrow_type)
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(
            self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key)

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)
Example #3
def test_index_store_roundtrip_explicit_key(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1", "part_2"], 3: ["part_3"]},
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2
    key2 = index2.store(store, "dataset_uuid")

    index3 = ExplicitSecondaryIndex(column="col", index_storage_key=key2).load(store)
    assert index1 == index3
    assert index2 == index3
Example #4
def test_load_from_store_with_indices(store):
    meta_dct = {
        "dataset_metadata_version": 4,
        "dataset_uuid": "uuid",
        "partitions": {
            "product_id=1/part_1": {
                "files": {
                    "core_data":
                    "dataset_uuid/table/location_id=1/part_1.parquet"
                }
            }
        },
        "indices": {
            "product_id": {
                "1": ["part_1"],
                "2": ["part_1"],
                "100": ["part_1"],
                "34": ["part_1"],
            }
        },
    }
    store.put("uuid.by-dataset-metadata.json",
              simplejson.dumps(meta_dct).encode("utf-8"))
    df = pd.DataFrame({"index": [1], "location_id": [1], "product_id": [1]})
    store_schema_metadata(make_meta(df, origin="core"), "uuid", store,
                          "core_data")

    storage_key = "uuid/some_index.parquet"
    index2 = ExplicitSecondaryIndex(
        column="location_id",
        index_dct={
            1: ["part_1", "part_2"],
            3: ["part_3"]
        },
        index_storage_key=storage_key,
        dtype=pa.int64(),
    )
    index2.store(store, "dataset_uuid")

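    # The separately stored location_id index is only picked up when
    # load_all_indices=True is passed.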
    dmd = DatasetMetadata.load_from_store(store=store, uuid="uuid")
    assert "location_id" not in dmd.indices

    dmd = DatasetMetadata.load_from_store(store=store,
                                          uuid="uuid",
                                          load_all_indices=True)
    assert "location_id" in dmd.indices
Example #5
def test_serialization_no_indices(store):
    index = ExplicitSecondaryIndex(column="col", index_dct={1: ["part_1"]})
    storage_key = index.store(store=store, dataset_uuid="uuid")

    # Create index without `index_dct`
    index = ExplicitSecondaryIndex(column="col", index_storage_key=storage_key)

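    # Pickling must work even though the index contents were never loaded.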
    index2 = pickle.loads(pickle.dumps(index))

    assert index == index2
Example #6
def test_index_store_roundtrip_ts(store, dtype, timestamps):
    storage_key = "dataset_uuid/some_index.parquet"
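    # dtype and timestamps presumably come from pytest parametrization; zip
    # pairs each timestamp with a list of partition labels.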
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct=dict(zip(timestamps, [["part_1", "part_2"], ["part_3"]])),
        index_storage_key=storage_key,
        dtype=dtype,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2
Example #7
def test_index_empty(store, dtype):
    storage_key = "dataset_uuid/some_index.parquet"
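    # Even an empty index_dct must survive the store/load and pickle roundtrips.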
    index1 = ExplicitSecondaryIndex(
        column="col", index_dct={}, dtype=dtype, index_storage_key=storage_key
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3
Example #8
def test_index_large(store):
    storage_key = "dataset_uuid/some_index.parquet"
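    # A large index (100k values) exercises serialization at scale.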
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={i: ["part_1"] for i in range(100_000)},
        index_storage_key=storage_key,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
    assert index1 == index2

    index3 = pickle.loads(pickle.dumps(index1))
    assert index1 == index3
Example #9
def test_pickle_without_load(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(column="col",
                                    index_dct={1: ["part_1"]},
                                    index_storage_key=storage_key)
    key1 = index1.store(store, "dataset_uuid")

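    # index2 only knows the storage key; until load() it is not equal to index1.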
    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1)
    assert index2 != index1

    index3 = pickle.loads(pickle.dumps(index2))
    assert index3 == index2

    index4 = index3.load(store)
    assert index4 == index1
    assert index4 != index2
Example #10
def test_index_store_roundtrip_ts(store):
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            pd.Timestamp("2017-01-01"): ["part_1", "part_2"],
            pd.Timestamp("2017-01-02"): ["part_3"],
        },
        index_storage_key=storage_key,
        dtype=pa.timestamp("ns"),
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
    assert index1 == index2
Example #11
class Index(AsvBenchmarkConfig):
    params = (
        [10**1, 10**3],  # number of index values
        [10**1, 10**3],  # number of partitions
        [(int, pa.int64()), (str, pa.string())],  # (python type, arrow type)
    )
    param_names = ["number_values", "number_partitions", "dtype"]
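    # ASV runs each benchmark method once per combination of these parameters.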

    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(column=self.column_name,
                                                index_dct=index_dct,
                                                dtype=arrow_type)
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(
            self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key)

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)

    def teardown(self, number_values, number_partitions, dtype):
        shutil.rmtree(self.tmp_dir)

    def time_load_index(self, number_values, number_partitions, dtype):
        self.ktk_index_not_loaded.load(self.store)

    def time_query_value(self, number_values, number_partitions, dtype):
        self.ktk_index.query(number_values / 2)

    def time_as_series(self, number_values, number_partitions, dtype):
        self.ktk_index.as_flat_series()

    def time_as_series_partitions_as_index(self, number_values,
                                           number_partitions, dtype):
        self.ktk_index.as_flat_series(partitions_as_index=True)
Example #12
def persist_indices(store, dataset_uuid, indices):
    store = _instantiate_store(store)
    output_filenames = {}
    for column, index in indices.items():
        # Backwards compatibility: wrap plain index dicts into an
        # ExplicitSecondaryIndex using the legacy storage-key naming scheme.
        if isinstance(index, dict):
            legacy_storage_key = "{dataset_uuid}.{column}{suffix}".format(
                dataset_uuid=dataset_uuid,
                column=column,
                suffix=naming.EXTERNAL_INDEX_SUFFIX,
            )
            index = ExplicitSecondaryIndex(
                column=column,
                index_dct=index,
                index_storage_key=legacy_storage_key)
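        # Primary index information is carried by the partition labels
        # themselves, so PartitionIndex objects are not written out here.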
        elif isinstance(index, PartitionIndex):
            continue
        output_filenames[column] = index.store(store=store,
                                               dataset_uuid=dataset_uuid)
    return output_filenames