def test_records_to_db(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    if not s.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {s.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        # Records (the `records` list of dicts is defined at module level in the
        # original test module; it is not shown in this excerpt)
        mdr = as_records(records)
        mem_api.put(name, mdr)
        conversion = Conversion(
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
            StorageFormat(s.storage_engine, DatabaseTableFormat),
        )
        copy_records_to_db.copy(name,
                                name,
                                conversion,
                                mem_api,
                                db_api,
                                schema=TestSchema4)
        with db_api.execute_sql_result(f"select * from {name}") as res:
            assert [dict(r) for r in res] == records
def test_data_copy_lookup():
    @datacopy(cost=NoOpCost,
              from_storage_classes=[FileSystemStorageClass],
              unregistered=True)
    def noop_all(*args):
        pass

    @datacopy(
        from_storage_classes=[DatabaseStorageClass],
        from_data_formats=[DatabaseTableFormat],
        to_storage_classes=[PythonStorageClass],
        to_data_formats=[RecordsFormat],
        cost=NetworkToMemoryCost,
        unregistered=True,
    )
    def db_to_mem(*args):
        pass

    lkup = get_datacopy_lookup(copiers=[noop_all, db_to_mem])
    dcp = lkup.get_lowest_cost(
        Conversion(
            StorageFormat(PostgresStorageEngine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
        ))
    assert dcp is db_to_mem
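
# Hedged sketch (not in the original source): the same lookup machinery can plan
# multi-step copies via get_lowest_cost_path. The path length depends on which
# copiers are registered, so nothing exact is asserted here.
def sketch_lowest_cost_path():
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, DataFrameFormat),
        StorageFormat(PostgresStorageEngine, DatabaseTableFormat),
    )
    pth = get_datacopy_lookup().get_lowest_cost_path(conversion)
    if pth is not None:
        # Each step carries the copier to run and its sub-conversion
        for ce in pth.conversions:
            print(ce.copier, ce.conversion)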
def test_mem_to_mem(from_fmt, to_fmt):
    from_fmt, obj = from_fmt
    to_fmt, expected = to_fmt
    if from_fmt == to_fmt:
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    from_name = "_from_test"
    to_name = "_to_test"
    mem_api.put(from_name, as_records(obj(), data_format=from_fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, from_fmt),
        StorageFormat(LocalPythonStorageEngine, to_fmt),
    )
    pth = get_datacopy_lookup().get_lowest_cost_path(conversion)
    for i, ce in enumerate(pth.conversions):
        ce.copier.copy(from_name,
                       to_name,
                       ce.conversion,
                       mem_api,
                       mem_api,
                       schema=TestSchema4)
        # Chain the steps: this step's output becomes the next step's input
        from_name = to_name
        to_name = to_name + str(i)
    to_name = from_name
    if isinstance(expected, pd.DataFrame):
        assert_dataframes_are_almost_equal(
            mem_api.get(to_name).records_object, expected)
    else:
        assert list(mem_api.get(to_name).records_object) == list(expected())
def copy_lowest_cost(
    env: Environment,
    sess: Session,
    sdb: StoredDataBlockMetadata,
    target_storage: Storage,
    target_format: DataFormat,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    if eligible_storages is None:
        eligible_storages = env.storages
    target_storage_format = StorageFormat(target_storage.storage_engine,
                                          target_format)
    cp = get_copy_path_for_sdb(sdb, target_storage_format, eligible_storages)
    if cp is None:
        raise CopyPathDoesNotExist(
            f"Copying {sdb} to format {target_format} on storage {target_storage}"
        )
    return convert_sdb(
        env,
        sess=sess,
        sdb=sdb,
        conversion_path=cp,
        target_storage=target_storage,
        storages=eligible_storages,
    )
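
# Hedged usage sketch for copy_lowest_cost (not in the original source). `env`,
# `sess`, and `sdb` are hypothetical: a configured Environment, an open Session,
# and an existing StoredDataBlockMetadata; the target URL is made up.
def sketch_copy_lowest_cost(env: Environment, sess: Session,
                            sdb: StoredDataBlockMetadata):
    target = Storage.from_url("postgresql://localhost/example")
    new_sdb = copy_lowest_cost(
        env,
        sess=sess,
        sdb=sdb,
        target_storage=target,
        target_format=DatabaseTableFormat,
    )
    # The returned StoredDataBlockMetadata lives on the target storage
    assert new_sdb.storage_url == target.url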
Example #5
def test_file_to_mem():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fs_api.write_lines_to_file(name, ["f1,f2", "hi,2"])
    # Records
    records_obj = [{"f1": "hi", "f2": 2}]
    conversion = Conversion(
        StorageFormat(s.storage_engine, DelimitedFileFormat),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )
    copy_delim_file_to_records.copy(name,
                                    name,
                                    conversion,
                                    fs_api,
                                    mem_api,
                                    schema=TestSchema4)
    assert mem_api.get(name).records_object == records_obj
def get_copy_path_for_sdb(
    sdb: StoredDataBlockMetadata,
    target_format: StorageFormat,
    storages: List[Storage],
) -> Optional[ConversionPath]:
    source_format = StorageFormat(sdb.storage.storage_engine, sdb.data_format)
    if source_format == target_format:
        # Already exists, do nothing
        return ConversionPath()
    conversion = Conversion(source_format, target_format)
    conversion_path = get_datacopy_lookup(
        available_storage_engines={s.storage_engine for s in storages}
    ).get_lowest_cost_path(conversion)
    return conversion_path
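
# Hedged sketch (not in the original source) of how a caller interprets the
# result: an empty ConversionPath means the block already matches the target
# format, None means no chain of copiers can get there.
def sketch_copy_path(sdb: StoredDataBlockMetadata, storages: List[Storage]):
    target = StorageFormat(PostgresStorageEngine, DatabaseTableFormat)
    cp = get_copy_path_for_sdb(sdb, target, storages)
    if cp is None:
        raise CopyPathDoesNotExist(f"Copying {sdb} to {target}")
    for ce in cp.conversions:
        print(ce.copier, ce.conversion)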
def test_obj_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = DelimitedFileObjectFormat
    obj = lambda: StringIO("f1,f2\nhi,2")  # callable, so each call yields a fresh buffer
    mdr = as_records(obj(), data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_file_object_to_delim_file.copy(name,
                                        name,
                                        conversion,
                                        mem_api,
                                        fs_api,
                                        schema=TestSchema4)
    with fs_api.open(name) as f:
        assert f.read() == obj().read()
Example #8
def test_db_to_mem(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    if not s.get_api().dialect_is_supported():
        return
    with api_cls.temp_local_database() as db_url:
        api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        name = "_test"
        api.execute_sql(f"create table {name} as select 1 a, 2 b")
        # Records
        conversion = Conversion(
            StorageFormat(s.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
        )
        copy_db_to_records.copy(name, name, conversion, api, mem_api)
        assert mem_api.get(name).records_object == [{"a": 1, "b": 2}]
        # DatabaseCursor
        conversion = Conversion(
            StorageFormat(s.storage_engine, DatabaseTableFormat),
            StorageFormat(LocalPythonStorageEngine, DatabaseCursorFormat),
        )
        copy_db_to_records.copy(name, name, conversion, api, mem_api)
        assert list(mem_api.get(name).records_object) == [{"a": 1, "b": 2}]
def test_records_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = RecordsFormat
    obj = [{"f1": "hi", "f2": 2}]
    mdr = as_records(obj, data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(name,
                                    name,
                                    conversion,
                                    mem_api,
                                    fs_api,
                                    schema=TestSchema4)
    with fs_api.open(name) as f:
        recs = list(read_csv(f))
        recs = RecordsFormat.conform_records_to_schema(recs, TestSchema4)
        assert recs == obj


@pytest.mark.parametrize(
    "conversion,length",
    [
        # Memory to DB
        (
            (
                StorageFormat(LocalPythonStorageEngine, RecordsFormat),
                StorageFormat(PostgresStorageEngine, DatabaseTableFormat),
            ),
            1,
        ),
        (
            (
                StorageFormat(LocalPythonStorageEngine, DataFrameFormat),
                StorageFormat(PostgresStorageEngine, DatabaseTableFormat),
            ),
            2,
        ),
        (
            (
                StorageFormat(LocalPythonStorageEngine,
                              DataFrameIteratorFormat),
Example #11
def get_storage_format(self) -> StorageFormat:
    return StorageFormat(self.storage.storage_engine, self.data_format)
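
# Hedged sketch (not in the original source): get_storage_format builds the
# source side of a Conversion for an existing block. `sdb` is hypothetical.
def sketch_conversion_from_sdb(sdb: StoredDataBlockMetadata) -> Conversion:
    return Conversion(
        sdb.get_storage_format(),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )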
def ensure_data_block_on_storage(
    env: Environment,
    sess: Session,
    block: DataBlockMetadata,
    storage: Storage,
    fmt: Optional[DataFormat] = None,
    eligible_storages: Optional[List[Storage]] = None,
) -> StoredDataBlockMetadata:
    if eligible_storages is None:
        eligible_storages = env.storages
    sdbs = sess.query(StoredDataBlockMetadata).filter(
        StoredDataBlockMetadata.data_block == block)
    match = sdbs.filter(StoredDataBlockMetadata.storage_url == storage.url)
    if fmt:
        match = match.filter(StoredDataBlockMetadata.data_format == fmt)
    matched_sdb = match.first()
    if matched_sdb is not None:
        return matched_sdb

    # logger.debug(f"{cnt} SDBs total")
    existing_sdbs = sdbs.filter(
        # DO NOT fetch memory SDBs that aren't of current runtime (since we can't get them!)
        # TODO: clean up memory SDBs when the memory goes away? Doesn't make sense to persist them really
        # Should be a separate in-memory lookup for memory SDBs, so they naturally expire?
        or_(
            ~StoredDataBlockMetadata.storage_url.startswith("python:"),
            StoredDataBlockMetadata.storage_url == storage.url,
        ), )
    # logger.debug(
    #     f"{existing_sdbs.count()} SDBs on-disk or in local memory (local: {self.ctx.local_python_storage.url})"
    # )
    if eligible_storages:
        existing_sdbs = existing_sdbs.filter(
            StoredDataBlockMetadata.storage_url.in_(
                [s.url for s in eligible_storages]
            )
        )
    # logger.debug(f"{existing_sdbs.count()} SDBs in eligible storages")
    fmt = fmt or storage.storage_engine.get_natural_format()
    target_storage_format = StorageFormat(storage.storage_engine, fmt)

    # Compute conversion costs
    # Each entry: (total_cost, conversion_path, source_sdb)
    eligible_conversion_paths = []
    existing_sdbs = list(existing_sdbs)
    for sdb in existing_sdbs:
        conversion_path = get_copy_path_for_sdb(sdb, target_storage_format,
                                                eligible_storages)
        if conversion_path is not None:
            eligible_conversion_paths.append(
                (conversion_path.total_cost, conversion_path, sdb))
    if not eligible_conversion_paths:
        raise NotImplementedError(
            f"No converter to {target_storage_format} for existing StoredDataBlocks {existing_sdbs}"
        )
    cost, conversion_path, in_sdb = min(eligible_conversion_paths,
                                        key=lambda x: x[0])
    return convert_sdb(
        env,
        sess=sess,
        sdb=in_sdb,
        conversion_path=conversion_path,
        target_storage=storage,
        storages=eligible_storages,
    )
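
# Hedged usage sketch for ensure_data_block_on_storage (not in the original
# source). `env`, `sess`, and `block` are hypothetical: a configured Environment,
# an open Session, and an existing DataBlockMetadata row; the URL is made up.
def sketch_ensure_on_storage(env: Environment, sess: Session,
                             block: DataBlockMetadata):
    target = Storage.from_url("postgresql://localhost/example")
    sdb = ensure_data_block_on_storage(
        env,
        sess=sess,
        block=block,
        storage=target,
        fmt=DatabaseTableFormat,
    )
    # Either an existing matching StoredDataBlockMetadata or a freshly converted copy
    assert sdb.storage_url == target.url
    assert sdb.data_format == DatabaseTableFormat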