Example no. 1
def _get_store(path):
    url = "hfs://{}".format(path)
    store = storefact.get_store_from_url(url)
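    # replace store.delete with a wrapper: _check_and_delete receives the store and
    # the original delete method bound via functools.partial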
    store.delete = partial(_check_and_delete,
                           store=store,
                           delete_orig=store.delete)
    return store
Example no. 2
def test_complete():
    url, expected = ACTUAL_URL
    store = storefact.get_store_from_url(url)
    assert store.bucket_name == expected["bucket_name"]
    assert store._client.project == 'central-splice-296415'
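    # presumably no usable GCS credentials are available here, so reading any key
    # is expected to fail with a RefreshError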
    with pytest.raises(RefreshError):
        store.get("somekey")
Example no. 3
    def setup(self, num_partitions, max_depth, num_leafs):
        self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))
        dataset_metadata = generate_metadata(max_depth, num_leafs)
        self.partitions = [
            generate_mp(dataset_metadata) for _ in range(num_partitions)
        ]
        self.dataset_uuid = "dataset_uuid"
        self.user_dataset_metadata = {}
Example no. 4
def reference_store():
    path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
        "..",
        "reference-data",
        "arrow-compat",
    )
    return get_store_from_url("hfs://{}".format(path))
Example no. 5
def test_dask_partitions(metadata_version):
    """
    Create partitions for one table with dask
    and check that it can be read with kartothek
    """
    import dask.dataframe

    bucket_dir = tempfile.mkdtemp()
    dataset_uuid = "uuid+namespace-attribute12_underscored"
    os.mkdir("{}/{}".format(bucket_dir, dataset_uuid))
    table_dir = "{}/{}/core".format(bucket_dir, dataset_uuid)
    os.mkdir(table_dir)
    store = storefact.get_store_from_url("hfs://{}".format(bucket_dir))

    locations = ["L-{}".format(i) for i in range(2)]
    df = pd.DataFrame()
    for location in locations:
        core = pd.DataFrame(
            data={
                "date": np.array(
                    ["2017-11-23", "2017-11-23", "2017-11-24", "2017-11-24"]
                ),
                "product": np.array(["P-0", "P-1", "P-0", "P-1"]),
                "location": location,
                "value": np.array(random.sample(range(1, 100), 4)),
            }
        )
        df = pd.concat([df, core])

    ddf = dask.dataframe.from_pandas(df, npartitions=1)
    dask.dataframe.to_parquet(ddf, table_dir, partition_on=["location"])
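    # partition_on=["location"] makes dask write hive-style directories, one parquet
    # file per distinct location value (see the expected paths below)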

    partition0 = "{}/core/location=L-0/part.0.parquet".format(dataset_uuid)
    partition1 = "{}/core/location=L-1/part.0.parquet".format(dataset_uuid)
    metadata = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": dataset_uuid,
    }
    expected_partitions = {
        "partitions": {
            "location=L-0": {"files": {"core": partition0}},
            "location=L-1": {"files": {"core": partition1}},
        }
    }
    expected_tables = {"tables": {"core": ["date", "product", "value"]}}

    store.put(
        "{}.by-dataset-metadata.json".format(dataset_uuid),
        simplejson.dumps(metadata).encode(),
    )

    metadata.update(expected_partitions)
    metadata.update(expected_tables)
    dmd = DatasetMetadata.load_from_store(dataset_uuid, store)
    actual_partitions = dmd.to_dict()["partitions"]
    # we partition on location ID which has two values
    assert len(actual_partitions) == 2
    assert dmd.partition_keys == ["location"]
Example no. 6
def store_input_types(request, tmpdir):
    url = f"hfs://{tmpdir}"

    if request.param == "URL":
        return url
    elif request.param == "KeyValue":
        return get_store_from_url(url)
    elif request.param == "Callable":
        return no_pickle_factory(url)
    else:
        raise RuntimeError(f"Encountered unknown store type {type(request.param)}")
Example no. 7
    def setup(self, number_values, number_partitions, dtype):
        py_type, arrow_type = dtype
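        # map every index value to all partition labels, i.e. a fully dense secondary index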
        index_dct = {
            py_type(val): [str(part) for part in range(number_partitions)]
            for val in range(0, number_values)
        }
        self.column_name = "column"
        self.ktk_index = ExplicitSecondaryIndex(column=self.column_name,
                                                index_dct=index_dct,
                                                dtype=arrow_type)
        self.tmp_dir = tempfile.mkdtemp()
        self.store = storefact.get_store_from_url("hfs://{}".format(
            self.tmp_dir))
        self.dataset_uuid = "some_uuid"
        self.storage_key = self.ktk_index.store(self.store, self.dataset_uuid)

        self.ktk_index_not_loaded = ExplicitSecondaryIndex(
            column=self.column_name, index_storage_key=self.storage_key)

        self.ktk_index_loaded = self.ktk_index_not_loaded.load(self.store)
Example no. 8
def test_normalize_store(tmpdir, _type):

    store_url = f"hfs://{tmpdir}"
    store = get_store_from_url(store_url)
    store.put("test", b"")

    @normalize_args
    def func(store):
        assert isinstance(store, Callable)
        return store().keys()

    if _type == "callable":
        store_test = partial(get_store_from_url, store_url)
    elif _type == "url":
        store_test = store_url
    elif _type == "simplekv":
        store_test = store
    else:
        raise AssertionError(f"unknown parametrization {_type}")
    assert func(store_test)
Example no. 9
    def setup(self, num_partitions):
        self.store = get_store_from_url("hfs://{}".format(tempfile.mkdtemp()))
        self.partitions = [generate_mp() for _ in range(num_partitions)]
        self.dataset_uuid = "dataset_uuid"
Example no. 10
#!/usr/bin/env python
import os

import pyarrow as pa
from storefact import get_store_from_url

from kartothek.core.testing import get_dataframe_alltypes
from kartothek.serialization import ParquetSerializer

if __name__ == "__main__":
    ser = ParquetSerializer()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    store = get_store_from_url(f"hfs://{dir_path}")

    df = get_dataframe_alltypes()
    df["byte"] = b"\x82\xd6\xc1\x06Z\x08\x11\xe9\x85eJ\x00\x07\xf8\n\x10"
    ref_file = f"{pa.__version__}"
    ser.store(store, ref_file, df)
Example no. 11
def test_lazy_store_accepts_decorated_store():
    store = get_store_from_url("memory://")
    pstore = PrefixDecorator("pre", store)
    assert lazy_store(pstore)() is pstore
Example no. 12
def store2(tmpdir):
    path = tmpdir.join("store2").strpath
    url = "hfs://{}".format(path)
    return storefact.get_store_from_url(url)
Example no. 13
def no_pickle_store_from_url(url):
    store = storefact.get_store_from_url(url)
    return NoPickleDecorator(store)
Example no. 14
def no_pickle_store(url):
    store = get_store_from_url(url)
    mark_nopickle(store)
    return store
Example no. 15
def test_roundtrip():
    assert isinstance(storefact.get_store_from_url(u'memory://#wrap:readonly'),
                      simplekv.decorator.ReadOnlyDecorator)
Example no. 16
def test_ensure_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert ensure_store(lambda: store) is store
Example no. 17
def test_lazy_store_returns_same_store():
    store = get_store_from_url("memory://")
    assert lazy_store(lambda: store)() is store
Example no. 18
    def setup(self, num_rows, chunk_size):
        self.df = get_dataframe_not_nested(num_rows)
        self.serialiser = ParquetSerializer(chunk_size=chunk_size)
        self.store = get_store_from_url("memory://")
        self.key = self.serialiser.store(self.store, "key_prefix", self.df)
        self.predicates = [[("int16", "==", 123)]]
Example no. 19
def store_session(tmpdir_factory):
    path = tmpdir_factory.mktemp("fsstore_test")
    path = path.realpath()
    url = "hfs://{}".format(path)
    return storefact.get_store_from_url(url)