def test_records_to_db(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    if not s.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {s.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        # `records` is a module-level fixture in the original test file
        mdr = as_records(records)
        mem_api.put(name, mdr)
        conversion = Conversion(
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
            StorageFormat(s.storage_engine, DatabaseTableFormat),
        )
        copy_records_to_db.copy(name,
                                name,
                                conversion,
                                mem_api,
                                db_api,
                                schema=TestSchema4)
        with db_api.execute_sql_result(f"select * from {name}") as res:
            assert [dict(r) for r in res] == records
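# A hedged driver sketch (assumption: the original file parametrizes `url`
# with pytest; "sqlite://" is illustrative, and the test provisions its own
# temp database via temp_local_database() regardless):
def run_records_to_db_example():
    test_records_to_db("sqlite://")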
def test_mem_to_mem(from_fmt, to_fmt):
    from_fmt, obj = from_fmt
    to_fmt, expected = to_fmt
    if from_fmt == to_fmt:
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    from_name = "_from_test"
    to_name = "_to_test"
    mem_api.put(from_name, as_records(obj(), data_format=from_fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, from_fmt),
        StorageFormat(LocalPythonStorageEngine, to_fmt),
    )
    pth = get_datacopy_lookup().get_lowest_cost_path(conversion)
    for i, ce in enumerate(pth.conversions):
        ce.copier.copy(from_name,
                       to_name,
                       ce.conversion,
                       mem_api,
                       mem_api,
                       schema=TestSchema4)
        # Chain the steps: this step's output feeds the next one
        from_name = to_name
        to_name = to_name + str(i)
    to_name = from_name  # After the loop, the last written name holds the result
    if isinstance(expected, pd.DataFrame):
        assert_dataframes_are_almost_equal(
            mem_api.get(to_name).records_object, expected)
    else:
        assert list(mem_api.get(to_name).records_object) == list(expected())
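# Illustrative shape of the (format, value) pairs this test expects
# (assumption: the real parametrization lives elsewhere in the file).
# The "from" side carries a factory callable; the "to" side carries the
# expected result, either a DataFrame or a callable, per the asserts above.
def run_mem_to_mem_example():
    records = [{"f1": "hi", "f2": 2}]
    test_mem_to_mem(
        (RecordsFormat, lambda: records),
        (DataFrameFormat, pd.DataFrame(records)),
    )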
def test_data_block_methods():
    env = make_test_env()
    db = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    strg = env.get_default_local_python_storage()
    records = [{"a": 1}]
    mdr = as_records(records)
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=db.id,
        data_block=db,
        storage_url=strg.url,
        data_format=RecordsFormat,
    )
    with env.session_scope() as sess:
        sess.add(db)
        sess.add(sdb)
        assert sdb.name is None
        name = sdb.get_name()
        assert len(name) > 10
        assert sdb.name == name
        strg.get_api().put(sdb.name, mdr)
        assert db.inferred_schema(env, sess) == TestSchema1
        assert db.nominal_schema(env, sess) == TestSchema2
        assert db.realized_schema(env, sess) == TestSchema3
        db.compute_record_count()
        assert db.record_count == 1
def extract_dataframe(ctx: PipeContext) -> MemoryDataRecords:  # TODO optional
    extracted = ctx.get_state_value("extracted")
    if extracted:
        # Just emit once
        return  # TODO: typing fix here?
    ctx.emit_state_value("extracted", True)
    schema = ctx.get_config_value("schema")
    df = ctx.get_config_value("dataframe")
    return as_records(df, data_format=DataFrameFormat, schema=schema)
def extract_csv(ctx: PipeContext) -> MemoryDataRecords:
    extracted = ctx.get_state_value("extracted")
    if extracted:
        # Static resource: if already emitted, return
        return
    path = ctx.get_config_value("path")
    f = open(path)  # handed off below as the records object, not closed here
    ctx.emit_state_value("extracted", True)
    schema = ctx.get_config_value("schema")
    return as_records(f, data_format=DelimitedFileObjectFormat, schema=schema)
def test_filesystem_api_core_operations(url):
    api: FileSystemStorageApi = Storage.from_url(url).get_api()
    name = "_test"
    api.put(name, as_records([{"a": 1}, {"b": 2}]))
    assert api.exists(name)
    assert not api.exists(name + "doesntexist")
    assert api.record_count(name) == 2
    api.create_alias(name, name + "alias")
    assert api.record_count(name + "alias") == 2
    api.copy(name, name + "copy")
    assert api.record_count(name + "copy") == 2
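# Hypothetical direct invocation against a local temp directory, mirroring
# the file:// urls used by the file tests later in this listing:
def run_filesystem_api_example():
    test_filesystem_api_core_operations(f"file://{tempfile.gettempdir()}")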
def copy_delim_file_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        records = list(read_csv(f.readlines()))
        mdr = as_records(records, data_format=RecordsFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
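# A minimal usage sketch for the copier above, assuming a delimited file
# named "_test" already exists on the filesystem storage (urls and names
# are illustrative, following the test helpers in this listing):
def run_delim_file_to_records_example():
    fs = Storage.from_url(f"file://{tempfile.gettempdir()}")
    mem = new_local_python_storage()
    conversion = Conversion(
        StorageFormat(fs.storage_engine, DelimitedFileFormat),
        StorageFormat(LocalPythonStorageEngine, RecordsFormat),
    )
    copy_delim_file_to_records("_test", "_test", conversion, fs.get_api(),
                               mem.get_api(), schema=TestSchema4)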
def copy_df_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    itr = (dataframe_to_records(df, schema) for df in mdr.records_object)
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_file_object_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    obj = read_csv(mdr.records_object)
    to_mdr = as_records(obj, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_file_object_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    itr = (read_csv(chunk) for chunk in with_header(mdr.records_object))
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_df_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    df = dataframe_to_records(mdr.records_object, schema)
    to_mdr = as_records(df, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = dro.data_format.infer_schema_from_records(
            dro.records_object)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema,
                                              nominal_schema)
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
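# Hedged usage sketch for create_data_block_from_records, reusing the
# test-environment helpers seen earlier (make_test_env, session_scope);
# the node key is illustrative:
def run_create_data_block_example():
    env = make_test_env()
    strg = env.get_default_local_python_storage()
    with env.session_scope() as sess:
        block, sdb = create_data_block_from_records(
            env, sess, strg, [{"a": 1}], created_by_node_key="_example")
        assert strg.get_api().exists(sdb.get_name())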
def copy_delim_file_to_file_object(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        mdr = as_records(f,
                         data_format=DelimitedFileObjectFormat,
                         schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
def copy_db_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    with from_storage_api.execute_sql_result(select_sql) as r:
        records = result_proxy_to_records(r)
        mdr = as_records(records, data_format=RecordsFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
def copy_records_iterator_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    all_records = []
    for records in mdr.records_object:
        all_records.extend(records)
    to_mdr = as_records(all_records, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_db_to_cursor(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    # Gonna leave this connection hanging...
    # TODO: add "closeable" to the MDR and handle?
    conn = from_storage_api.get_engine().connect()
    r = conn.execute(select_sql)
    mdr = as_records(r, data_format=DatabaseCursorFormat, schema=schema)
    mdr = mdr.conform_to_schema()
    to_storage_api.put(to_name, mdr)
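# One possible shape for the "closeable" idea in the TODO above (an
# assumption, not part of the library): wrap the result in a generator that
# closes the connection once the cursor is exhausted, and put that generator
# as the records object instead of the raw cursor.
def rows_then_close(conn, result):
    try:
        yield from result
    finally:
        conn.close()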
def copy_file_object_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # Note: must keep header on each chunk when iterating delimited file object!
    # TODO: ugly hard-coded 1000 here, but how could we ever make it configurable? Not a big deal I guess
    itr = (read_csv(chunk)
           for chunk in with_header(iterate_chunks(mdr.records_object, 1000)))
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
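# `with_header` is not defined in this listing; a plausible minimal version
# (an assumption, not the library's code) that illustrates the note above:
# keep the header line from the first chunk and prepend it to every later
# chunk so each chunk parses as a complete delimited file.
def with_header_sketch(chunks):
    header = None
    for chunk in chunks:
        chunk = list(chunk)
        if header is None:
            header = chunk[0]  # first line of the first chunk is the header
            yield chunk
        else:
            yield [header] + chunk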
def copy_dataframe_iterator_to_dataframe(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    all_dfs = []
    for df in mdr.records_object:
        all_dfs.append(df)
    to_mdr = as_records(pd.concat(all_dfs),
                        data_format=DataFrameFormat,
                        schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def test_obj_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = DelimitedFileObjectFormat
    obj = lambda: StringIO("f1,f2\nhi,2")
    mdr = as_records(obj(), data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_file_object_to_delim_file.copy(name,
                                        name,
                                        conversion,
                                        mem_api,
                                        fs_api,
                                        schema=TestSchema4)
    with fs_api.open(name) as f:
        assert f.read() == obj().read()
def test_records_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = RecordsFormat
    obj = [{"f1": "hi", "f2": 2}]
    mdr = as_records(obj, data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(name,
                                    name,
                                    conversion,
                                    mem_api,
                                    fs_api,
                                    schema=TestSchema4)
    with fs_api.open(name) as f:
        recs = list(read_csv(f))
        recs = RecordsFormat.conform_records_to_schema(recs, TestSchema4)
        assert recs == obj
    def handle_raw_output_object(
        self,
        execution_session: ExecutionSession,
        output_obj: DataInterfaceType,
        executable: Executable,
    ) -> Optional[StoredDataBlockMetadata]:
        logger.debug("HANDLING OUTPUT")
        # TODO: can i return an existing DataBlock? Or do I need to create a "clone"?
        #   Answer: ok to return as is (just mark it as 'output' in DBL)
        if isinstance(output_obj, StoredDataBlockMetadata):
            # TODO is it in local storage tho? we skip conversion below...
            # This is just special case right now to support SQL pipe
            # Will need better solution for explicitly creating DB/SDBs inside of pipes
            return output_obj
        elif isinstance(output_obj, DataBlockMetadata):
            raise NotImplementedError
        elif isinstance(output_obj, ManagedDataBlock):
            raise NotImplementedError
        else:
            # TODO: handle DataBlock stream output (iterator that goes into separate blocks)
            nominal_output_schema = executable.bound_interface.resolve_nominal_output_schema(
                self.env,
                execution_session.metadata_session,
            )  # TODO: could check output to see if it is LocalRecords with a schema too?
            logger.debug(
                f"Resolved output schema {nominal_output_schema} {executable.bound_interface}"
            )
            output_obj = wrap_records_object(output_obj)
            if records_object_is_definitely_empty(output_obj):
                # TODO
                # Are we sure we'd never want to process an empty object?
                # Like maybe create the db table, but leave it empty? could be useful
                return None
            dro = as_records(output_obj, schema=nominal_output_schema)
            block, sdb = create_data_block_from_records(
                self.env,
                execution_session.metadata_session,
                self.ctx.local_python_storage,
                dro,
                created_by_node_key=executable.node_key,
            )

        # TODO: need target_format option too
        if self.ctx.target_storage is None or self.ctx.target_storage == sdb.storage:
            # Already good on target storage
            if sdb.data_format.is_storable():
                # And its storable
                return sdb

        # If the existing data format is supported by the target storage and
        # is storable, use it instead of the natural format (no need to convert)
        target_format = self.ctx.target_storage.storage_engine.get_natural_format()
        if self.ctx.target_storage.storage_engine.is_supported_format(
                sdb.data_format):
            if sdb.data_format.is_storable():
                target_format = sdb.data_format

        assert target_format.is_storable()

        # Place output in target storage
        return copy_lowest_cost(
            self.ctx.env,
            execution_session.metadata_session,
            sdb=sdb,
            target_storage=self.ctx.target_storage,
            target_format=target_format,
            eligible_storages=self.ctx.storages,
        )
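# The target-format selection above, restated as a standalone sketch
# (an assumption about factoring, not the library's actual code):
def choose_target_format(target_storage, current_format):
    fmt = target_storage.storage_engine.get_natural_format()
    if (target_storage.storage_engine.is_supported_format(current_format)
            and current_format.is_storable()):
        fmt = current_format  # reuse the existing format to skip a conversion
    return fmt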