def test_records_to_db(url):
    s: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = s.storage_engine.get_api_cls()
    if not s.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {s.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    with api_cls.temp_local_database() as db_url:
        name = "_test"
        db_api: DatabaseStorageApi = Storage.from_url(db_url).get_api()
        # Records
        mdr = as_records(records)
        mem_api.put(name, mdr)
        conversion = Conversion(
            StorageFormat(LocalPythonStorageEngine, RecordsFormat),
            StorageFormat(s.storage_engine, DatabaseTableFormat),
        )
        copy_records_to_db.copy(name, name, conversion, mem_api, db_api, schema=TestSchema4)
        with db_api.execute_sql_result(f"select * from {name}") as res:
            assert [dict(r) for r in res] == records
def test_mem_to_mem(from_fmt, to_fmt):
    from_fmt, obj = from_fmt
    to_fmt, expected = to_fmt
    if from_fmt == to_fmt:
        return
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    from_name = "_from_test"
    to_name = "_to_test"
    mem_api.put(from_name, as_records(obj(), data_format=from_fmt))
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, from_fmt),
        StorageFormat(LocalPythonStorageEngine, to_fmt),
    )
    pth = get_datacopy_lookup().get_lowest_cost_path(conversion)
    for i, ce in enumerate(pth.conversions):
        ce.copier.copy(from_name, to_name, ce.conversion, mem_api, mem_api, schema=TestSchema4)
        from_name = to_name
        to_name = to_name + str(i)
    to_name = from_name
    if isinstance(expected, pd.DataFrame):
        assert_dataframes_are_almost_equal(mem_api.get(to_name).records_object, expected)
    else:
        assert list(mem_api.get(to_name).records_object) == list(expected())
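# A minimal sketch of how test_mem_to_mem could be driven with pytest.mark.parametrize.
# The (format, value) pair below is an illustrative assumption, not the actual fixture
# list from the test module: the "from" side carries a zero-arg factory (called as
# obj()), the "to" side carries the expected result (a DataFrame instance, or a
# factory for records).
import pytest

@pytest.mark.parametrize(
    "from_fmt,to_fmt",
    [
        (
            (RecordsFormat, lambda: [{"f1": "hi", "f2": 2}]),
            (DataFrameFormat, pd.DataFrame({"f1": ["hi"], "f2": [2]})),
        ),
    ],
)
def test_mem_to_mem_example(from_fmt, to_fmt):
    test_mem_to_mem(from_fmt, to_fmt)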
def test_data_block_methods():
    env = make_test_env()
    db = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key="_test.TestSchema1",
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema3",
    )
    strg = env.get_default_local_python_storage()
    records = [{"a": 1}]
    mdr = as_records(records)
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=db.id,
        data_block=db,
        storage_url=strg.url,
        data_format=RecordsFormat,
    )
    with env.session_scope() as sess:
        sess.add(db)
        sess.add(sdb)
        assert sdb.name is None
        name = sdb.get_name()
        assert len(name) > 10
        assert sdb.name == name
        strg.get_api().put(sdb.name, mdr)
        assert db.inferred_schema(env, sess) == TestSchema1
        assert db.nominal_schema(env, sess) == TestSchema2
        assert db.realized_schema(env, sess) == TestSchema3
        db.compute_record_count()
        assert db.record_count == 1
def extract_dataframe(ctx: PipeContext) -> MemoryDataRecords:  # TODO: optional
    extracted = ctx.get_state_value("extracted")
    if extracted:
        # Just emit once
        return  # TODO: typing fix here?
    ctx.emit_state_value("extracted", True)
    schema = ctx.get_config_value("schema")
    df = ctx.get_config_value("dataframe")
    return as_records(df, data_format=DataFrameFormat, schema=schema)
def extract_csv(ctx: PipeContext) -> MemoryDataRecords:
    extracted = ctx.get_state_value("extracted")
    if extracted:
        # Static resource: if already emitted, return
        return
    path = ctx.get_config_value("path")
    f = open(path)
    ctx.emit_state_value("extracted", True)
    schema = ctx.get_config_value("schema")
    return as_records(f, data_format=DelimitedFileObjectFormat, schema=schema)
def test_filesystem_api_core_operations(url):
    api: FileSystemStorageApi = Storage.from_url(url).get_api()
    name = "_test"
    api.put(name, as_records([{"a": 1}, {"b": 2}]))
    assert api.exists(name)
    assert not api.exists(name + "doesntexist")
    assert api.record_count(name) == 2
    api.create_alias(name, name + "alias")
    assert api.record_count(name + "alias") == 2
    api.copy(name, name + "copy")
    assert api.record_count(name + "copy") == 2
def copy_delim_file_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        records = list(read_csv(f.readlines()))
        mdr = as_records(records, data_format=RecordsFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
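# A minimal sketch of what the read_csv helper is assumed to do here: turn an
# iterable of delimited lines into dict records, treating the first line as the
# header. This is an illustrative stdlib stand-in, not snapflow's actual
# implementation (which presumably also handles dialects and type coercion).
import csv
from typing import Any, Dict, Iterable, Iterator

def read_csv_sketch(lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
    # csv.DictReader consumes the header row itself and yields one dict per row
    yield from csv.DictReader(lines)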
def copy_df_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    itr = (dataframe_to_records(df, schema) for df in mdr.records_object)
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_file_object_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    obj = read_csv(mdr.records_object)
    to_mdr = as_records(obj, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def copy_file_object_iterator_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    itr = (read_csv(chunk) for chunk in with_header(mdr.records_object))
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
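# A minimal sketch of the assumed with_header contract: re-attach the header
# line to every chunk after the first, so each chunk parses as a standalone
# delimited file. Illustrative only; not snapflow's actual helper.
from typing import Iterable, Iterator, List

def with_header_sketch(chunks: Iterable[List[str]]) -> Iterator[List[str]]:
    header: List[str] = []
    for i, chunk in enumerate(chunks):
        if i == 0:
            header = chunk[:1]  # first line of the first chunk is the header
            yield chunk
        else:
            yield header + chunk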
def copy_df_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    df = dataframe_to_records(mdr.records_object, schema)
    to_mdr = as_records(df, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = dro.data_format.infer_schema_from_records(dro.records_object)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema, nominal_schema)
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
def copy_delim_file_to_file_object(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, FileSystemStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    with from_storage_api.open(from_name) as f:
        mdr = as_records(f, data_format=DelimitedFileObjectFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
def copy_db_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    with from_storage_api.execute_sql_result(select_sql) as r:
        records = result_proxy_to_records(r)
        mdr = as_records(records, data_format=RecordsFormat, schema=schema)
        mdr = mdr.conform_to_schema()
        to_storage_api.put(to_name, mdr)
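# A minimal sketch of the assumed result_proxy_to_records behavior: materialize
# a SQLAlchemy result into a list of plain dict records. Illustrative stand-in,
# not snapflow's actual utility.
from typing import Any, Dict, List

def result_proxy_to_records_sketch(result) -> List[Dict[str, Any]]:
    # Rows support mapping access on SQLAlchemy 1.3 (on 1.4+, use dict(row._mapping))
    return [dict(row) for row in result]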
def copy_records_iterator_to_records(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    all_records = []
    for records in mdr.records_object:
        all_records.extend(records)
    to_mdr = as_records(all_records, data_format=RecordsFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
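# Note: the copier above materializes the whole iterator eagerly. A lazy
# flattening (illustrative, assuming downstream accepted any iterable) would be
#
#   import itertools
#   all_records = itertools.chain.from_iterable(mdr.records_object)
#
# but RecordsFormat implies a fully realized list, so the eager version fits here.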
def copy_db_to_cursor(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, DatabaseStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    select_sql = f"select * from {from_name}"
    # Gonna leave this connection hanging...
    # TODO: add "closeable" to the MDR and handle?
    conn = from_storage_api.get_engine().connect()
    r = conn.execute(select_sql)
    mdr = as_records(r, data_format=DatabaseCursorFormat, schema=schema)
    mdr = mdr.conform_to_schema()
    to_storage_api.put(to_name, mdr)
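# A minimal sketch of one way the "closeable" TODO above could be handled:
# wrap the cursor in a generator that releases the connection once iteration
# finishes. Purely illustrative; the MDR has no such hook in this code.
from typing import Any, Iterator

def closing_cursor_sketch(conn, result) -> Iterator[Any]:
    try:
        yield from result
    finally:
        conn.close()  # release the connection when the cursor is exhausted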
def copy_file_object_to_records_iterator(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    # Note: must keep header on each chunk when iterating delimited file object!
    # TODO: ugly hard-coded 1000 here, but how could we ever make it configurable? Not a big deal I guess
    itr = (read_csv(chunk) for chunk in with_header(iterate_chunks(mdr.records_object, 1000)))
    to_mdr = as_records(itr, data_format=RecordsIteratorFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
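# A minimal sketch of the assumed iterate_chunks contract: yield successive
# lists of up to chunk_size lines from a file-like object. Illustrative
# stand-in, not snapflow's actual helper.
import itertools
from typing import Iterator, List, TextIO

def iterate_chunks_sketch(f: TextIO, chunk_size: int) -> Iterator[List[str]]:
    while True:
        # islice pulls at most chunk_size lines per pass without reading ahead
        chunk = list(itertools.islice(f, chunk_size))
        if not chunk:
            return
        yield chunk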
def copy_dataframe_iterator_to_dataframe(
    from_name: str,
    to_name: str,
    conversion: Conversion,
    from_storage_api: StorageApi,
    to_storage_api: StorageApi,
    schema: Schema,
):
    assert isinstance(from_storage_api, PythonStorageApi)
    assert isinstance(to_storage_api, PythonStorageApi)
    mdr = from_storage_api.get(from_name)
    all_dfs = list(mdr.records_object)
    to_mdr = as_records(pd.concat(all_dfs), data_format=DataFrameFormat, schema=schema)
    to_mdr = to_mdr.conform_to_schema()
    to_storage_api.put(to_name, to_mdr)
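# Note: pd.concat also accepts an iterator directly, so the intermediate list
# could be dropped (pd.concat still materializes all frames internally):
#
#   to_mdr = as_records(pd.concat(mdr.records_object), data_format=DataFrameFormat, schema=schema)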
def test_obj_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = DelimitedFileObjectFormat
    obj = lambda: StringIO("f1,f2\nhi,2")
    mdr = as_records(obj(), data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_file_object_to_delim_file.copy(name, name, conversion, mem_api, fs_api, schema=TestSchema4)
    with fs_api.open(name) as f:
        assert f.read() == obj().read()
def test_records_to_file():
    dr = tempfile.gettempdir()
    s: Storage = Storage.from_url(f"file://{dr}")
    fs_api: FileSystemStorageApi = s.get_api()
    mem_api: PythonStorageApi = new_local_python_storage().get_api()
    name = "_test"
    fmt = RecordsFormat
    obj = [{"f1": "hi", "f2": 2}]
    mdr = as_records(obj, data_format=fmt)
    mem_api.put(name, mdr)
    conversion = Conversion(
        StorageFormat(LocalPythonStorageEngine, fmt),
        StorageFormat(s.storage_engine, DelimitedFileFormat),
    )
    copy_records_to_delim_file.copy(name, name, conversion, mem_api, fs_api, schema=TestSchema4)
    with fs_api.open(name) as f:
        recs = list(read_csv(f))
        recs = RecordsFormat.conform_records_to_schema(recs, TestSchema4)
        assert recs == obj
def handle_raw_output_object(
    self,
    execution_session: ExecutionSession,
    output_obj: DataInterfaceType,
    executable: Executable,
) -> Optional[StoredDataBlockMetadata]:
    logger.debug("HANDLING OUTPUT")
    # TODO: can i return an existing DataBlock? Or do I need to create a "clone"?
    #   Answer: ok to return as is (just mark it as 'output' in DBL)
    if isinstance(output_obj, StoredDataBlockMetadata):
        # TODO: is it in local storage tho? we skip conversion below...
        # This is just a special case right now to support the SQL pipe.
        # Will need a better solution for explicitly creating DB/SDBs inside of pipes.
        return output_obj
    elif isinstance(output_obj, DataBlockMetadata):
        raise NotImplementedError
    elif isinstance(output_obj, ManagedDataBlock):
        raise NotImplementedError
    else:
        # TODO: handle DataBlock stream output (iterator that goes into separate blocks)
        nominal_output_schema = executable.bound_interface.resolve_nominal_output_schema(
            self.env,
            execution_session.metadata_session,
        )
        # TODO: could check output to see if it is LocalRecords with a schema too?
        logger.debug(
            f"Resolved output schema {nominal_output_schema} {executable.bound_interface}"
        )
        output_obj = wrap_records_object(output_obj)
        if records_object_is_definitely_empty(output_obj):
            # TODO: are we sure we'd never want to process an empty object?
            #   Like maybe create the db table, but leave it empty? Could be useful.
            return None
        dro = as_records(output_obj, schema=nominal_output_schema)
        block, sdb = create_data_block_from_records(
            self.env,
            execution_session.metadata_session,
            self.ctx.local_python_storage,
            dro,
            created_by_node_key=executable.node_key,
        )
        # TODO: need target_format option too
        if self.ctx.target_storage is None or self.ctx.target_storage == sdb.storage:
            # Already on target storage
            if sdb.data_format.is_storable():
                # And it's storable
                return sdb
        # If the existing data_format is supported by the target storage and is
        # storable, use it instead of the storage's natural format (no need to convert)
        target_format = self.ctx.target_storage.storage_engine.get_natural_format()
        if self.ctx.target_storage.storage_engine.is_supported_format(sdb.data_format):
            if sdb.data_format.is_storable():
                target_format = sdb.data_format
        assert target_format.is_storable()
        # Place output in target storage
        return copy_lowest_cost(
            self.ctx.env,
            execution_session.metadata_session,
            sdb=sdb,
            target_storage=self.ctx.target_storage,
            target_format=target_format,
            eligible_storages=self.ctx.storages,
        )