Example #1
0
File: helpers.py Project: kvh/dcp
def make_copy_request(
    from_url: str, to_url: str, fmt: Optional[str] = None, schema: Optional[str] = None
) -> CopyRequest:
    """Build a CopyRequest from two URLs of the form ``<storage_url>/<name>``.

    The final path segment of each URL is taken as the object name and the
    remainder as the storage URL. A ``from_url`` with no storage part is
    treated as a file in the current working directory.

    Args:
        from_url: Source URL (e.g. ``mysql://host/db/table``) or a bare name.
        to_url: Destination URL in the same form.
        fmt: Optional format nickname; when omitted, the destination storage
            engine's natural format is used.
        schema: Currently unused — kept for interface compatibility.
            TODO(review): either wire this into the CopyRequest or drop it.

    Returns:
        A populated CopyRequest.
    """
    from_split = from_url.split("/")
    to_split = to_url.split("/")
    from_name = from_split[-1]
    to_name = to_split[-1]
    from_storage_url = "/".join(from_split[:-1])
    to_storage_url = "/".join(to_split[:-1])
    to_storage = Storage(to_storage_url)
    # Explicit nickname wins; otherwise fall back to the destination engine's default.
    if fmt:
        to_fmt = get_format_for_nickname(fmt)
    else:
        to_fmt = to_storage.storage_engine.get_natural_format()
    if not from_storage_url:
        # No storage url then default to a local file in the current directory
        pth = os.getcwd()
        from_storage_url = f"file://{pth}"
    return CopyRequest(
        from_name=from_name,
        from_storage=Storage(from_storage_url),
        to_name=to_name,
        to_storage=to_storage,
        to_format=to_fmt,
    )
Example #2
0
 def append_records_to_stored_datablock(self, name: str, storage: Storage,
                                        sdb: StoredDataBlockMetadata):
     """Copy the object ``name`` on ``storage`` into ``sdb``'s storage,
     appending to any existing data, then remove the source object.

     If the SDB has no data format yet, one is inferred from the stored
     object; an exception is raised when inference fails.
     """
     self.resolve_new_object_with_data_block(sdb, name, storage)
     if sdb.data_format is None:
         # No format recorded yet: infer it from the object as stored
         fmt = infer_format_for_name(name, storage)
         # if sdb.data_format and sdb.data_format != fmt:
         #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
         if fmt is None:
             raise Exception(f"Could not infer format {name} on {storage}")
         sdb.data_format = fmt
     # TODO: to_format
     # TODO: make sure this handles no-ops (empty object, same storage)
     # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
     result = dcp.copy(
         from_name=name,
         from_storage=storage,
         to_name=sdb.get_name_for_storage(),
         to_storage=sdb.storage,
         to_format=sdb.data_format,
         available_storages=self.execution_context.storages,
         if_exists="append",
     )
     logger.debug(f"Copied {result}")
     logger.debug(f"REMOVING NAME {name}")
     # Source object is treated as temporary: clean it up after the copy
     storage.get_api().remove(name)
def import_storage_csv(ctx: DataFunctionContext,
                       name: str,
                       storage_url: str,
                       schema: Optional[str] = None):
    """Emit a CSV file object from the given storage, at most once.

    This is a static resource: once the "imported" state flag is set,
    subsequent calls are no-ops.
    """
    if ctx.get_state_value("imported"):
        # Already emitted this static resource; nothing to do.
        return
    storage_api = Storage(storage_url).get_api()
    file_obj = storage_api.open_name(name)
    ctx.emit_state_value("imported", True)
    ctx.emit(file_obj, data_format=CsvFileFormat, schema=schema)
Example #4
0
def import_storage_csv(ctx: SnapContext):
    """Emit a CSV file object described by the snap's params, at most once.

    This is a static resource: once the "imported" state flag is set,
    subsequent calls are no-ops.
    """
    if ctx.get_state_value("imported"):
        # Already emitted this static resource; nothing to do.
        return
    object_name = ctx.get_param("name")
    url = ctx.get_param("storage_url")
    storage_api = Storage(url).get_api()
    file_obj = storage_api.open_name(object_name)
    ctx.emit_state_value("imported", True)
    ctx.emit(
        file_obj,
        data_format=CsvFileObjectFormat,
        schema=ctx.get_param("schema"),
    )
Example #5
0
def test_database_handler():
    """Create a table in a temp sqlite db, bulk-insert test records, and
    verify field name and type inference by the database table handler."""
    db_url = get_tmp_sqlite_db_url()
    storage = Storage(db_url)
    table = "_test"
    handler = get_handler(DatabaseTableFormat, storage.storage_engine)
    handler().create_empty(table, storage, test_records_schema)
    storage.get_api().bulk_insert_records(table, test_records)
    assert list(handler().infer_field_names(table, storage)) == list(
        test_records[0].keys()
    )
    expected_types = {
        "f1": Text(),
        "f2": Integer(),
        "f3": DEFAULT_FIELD_TYPE,
        "f4": Date(),
        "f5": DEFAULT_FIELD_TYPE,
    }
    for field, expected in expected_types.items():
        assert handler().infer_field_type(table, storage, field) == expected
Example #6
0
 def handle_emit(
     self,
     records_obj: Any = None,
     name: Optional[str] = None,
     storage: Optional[Storage] = None,
     output: str = DEFAULT_OUTPUT_NAME,
     data_format: Optional[DataFormat] = None,
     schema: Optional[SchemaLike] = None,
 ):
     """Handle an object emitted by a snap and append it to the output's
     stored datablock.

     Either ``records_obj`` or an existing ``name``/``storage`` pair must be
     provided. Returns the StoredDataBlockMetadata the records were appended
     to, or ``records_obj`` unchanged when it already is one.
     """
     logger.debug(
         f"HANDLING EMITTED OBJECT (of type '{type(records_obj).__name__}')"
     )
     # TODO: can i return an existing DataBlock? Or do I need to create a "clone"?
     #   Answer: ok to return as is (just mark it as 'output' in DBL)
     if isinstance(records_obj, StoredDataBlockMetadata):
         # TODO is it in local storage tho? we skip conversion below...
         # This is just special case right now to support SQL snap
         # Will need better solution for explicitly creating DB/SDBs inside of snaps
         return records_obj
     elif isinstance(records_obj, DataBlockMetadata):
         raise NotImplementedError
     elif isinstance(records_obj, ManagedDataBlock):
         raise NotImplementedError
     nominal_output_schema = schema
     if nominal_output_schema is None:
         nominal_output_schema = self.bound_interface.resolve_nominal_output_schema(
             self.env
         )  # TODO: could check output to see if it is LocalRecords with a schema too?
     if nominal_output_schema is not None:
         nominal_output_schema = self.env.get_schema(nominal_output_schema)
     sdb = self.get_stored_datablock_for_output(output)
     sdb.data_format = data_format
     db = sdb.data_block
     # BUGFIX: guard against an unresolved nominal schema (previously this
     # dereferenced `.key` on None and raised AttributeError), and make the
     # mismatch message an f-string (previously the braces were emitted
     # literally because the `f` prefix was missing).
     if nominal_output_schema is not None:
         if (
             db.nominal_schema_key
             and db.nominal_schema_key != nominal_output_schema.key
         ):
             raise Exception(
                 f"Mismatch nominal schemas {db.nominal_schema_key} - {nominal_output_schema.key}"
             )
         db.nominal_schema_key = nominal_output_schema.key
     if records_obj is not None:
         # Stash the raw python object on local storage under a temp name
         name = "_tmp_obj_" + rand_str(10)
         storage = self.execution_context.local_storage
         storage.get_api().put(name, records_obj)
         if nominal_output_schema is not None:
             # TODO: still unclear on when and why to do this cast
             handler = get_handler_for_name(name, storage)
             handler().cast_to_schema(name, storage, nominal_output_schema)
     sdb.storage_url = storage.url
     assert name is not None
     assert storage is not None
     self.append_records_to_stored_datablock(name, storage, sdb)
     return sdb
Example #7
0
def test_make_copy_request():
    """A bare source name should default to local file storage at the cwd."""
    src_name = "orders.csv"
    dest_name = "orders"
    dest_storage_url = "mysql://localhost:3306/mydb"
    req = make_copy_request(src_name, f"{dest_storage_url}/{dest_name}")
    cwd = os.getcwd()
    expected = CopyRequest(
        from_name=src_name,
        from_storage=Storage(f"file://{cwd}"),
        to_name=dest_name,
        to_storage=Storage(dest_storage_url),
        to_format=DatabaseTableFormat,
    )
    assert req == expected
Example #8
0
def test_memory_handlers(fmt: DataFormat, obj: Any):
    """Verify field inference and round-trip field casting for an in-memory
    format handler."""
    storage = Storage("python://test")
    key = "_test"
    storage.get_api().put(key, obj())
    handler = get_handler(fmt, storage.storage_engine)
    assert list(handler().infer_field_names(key, storage)) == list(
        test_records[0].keys()
    )
    expected_types = {
        "f1": Text(),
        "f2": Integer(),
        "f3": DEFAULT_FIELD_TYPE,
        "f4": Date(),
        "f5": DEFAULT_FIELD_TYPE,
    }
    for field, expected in expected_types.items():
        assert handler().infer_field_type(key, storage, field) == expected

    # Cast a field away and back; the stored object must survive unchanged.
    handler().cast_to_field_type(key, storage, "f4", Text())
    handler().cast_to_field_type(key, storage, "f4", Date())
    round_tripped = storage.get_api().get(key)
    assert_objects_equal(round_tripped, obj())
Example #9
0
def test_db_to_mem(url):
    """Copy a one-row database table into local python memory as records."""
    source: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = source.storage_engine.get_api_cls()
    mem_storage = new_local_python_storage()
    mem_api: PythonStorageApi = mem_storage.get_api()
    if not source.get_api().dialect_is_supported():
        # Client library for this dialect isn't installed; skip silently.
        return
    with api_cls.temp_local_database() as db_url:
        table = "_test"
        db_storage = Storage.from_url(db_url)
        db_api: DatabaseStorageApi = db_storage.get_api()
        db_api.execute_sql(f"create table {table} as select 1 a, 2 b")
        request = CopyRequest(
            table, db_storage, table, mem_storage, RecordsFormat, test_records_schema
        )
        DatabaseTableToRecords().copy(request)
        assert mem_api.get(table) == [{"a": 1, "b": 2}]
Example #10
0
def provide_test_storages(
        function: DataFunction,
        target_storage: Storage) -> Iterator[Optional[Storage]]:
    """Yield a storage suitable for testing ``function``.

    Preference order: an explicit target storage; then the function's first
    required storage engine (spun up as a temp local database); then a temp
    sqlite database when a "database" storage class is required; else None.
    """
    if target_storage:
        yield target_storage  # TODO
        return
    if function.required_storage_engines:
        # TODO: multiple engines -- is it AND or OR?? each entry is AND and inside entry commas delim OR
        engine = get_engine_for_scheme(function.required_storage_engines[0])
        api_class = engine.get_api_cls()
        if issubclass(api_class, DatabaseApi):
            if not api_class.dialect_is_supported():
                raise TestFeatureNotImplementedError(engine)
            with api_class.temp_local_database() as url:
                yield Storage(url)
        # NOTE(review): a non-database required engine yields nothing at all
        # (empty generator) — confirm that is intended.
    elif "database" in function.required_storage_classes:
        yield Storage(get_tmp_sqlite_db_url())
    else:
        yield None
Example #11
0
 def append_records_to_stored_datablock(self, name: str, storage: Storage,
                                        sdb: StoredDataBlockMetadata):
     """Copy the object ``name`` on ``storage`` into ``sdb``'s storage,
     appending to any existing data, then remove the source object.

     When the SDB has no data format yet, it defaults to the execution
     context's target format, falling back to the destination storage
     engine's natural format.
     """
     self.resolve_new_object_with_data_block(sdb, name, storage)
     if sdb.data_format is None:
         # Default format: explicit target format, else destination's natural one
         sdb.data_format = (self.execution_context.target_format or
                            sdb.storage.storage_engine.get_natural_format())
         # fmt = infer_format_for_name(name, storage)
         # # if sdb.data_format and sdb.data_format != fmt:
         # #     raise Exception(f"Format mismatch {fmt} - {sdb.data_format}")
         # if fmt is None:
         #     raise Exception(f"Could not infer format {name} on {storage}")
         # sdb.data_format = fmt
     # TODO: make sure this handles no-ops (empty object, same storage)
     # TODO: copy or alias? sometimes we are just moving temp obj to new name, dont need copy
     # to_name = sdb.get_name_for_storage()
     # if storage == sdb.storage:
     #     # Same storage
     #     if name == to_name:
     #         # Nothing to do
     #         logger.debug("Output already on storage with same name, nothing to do")
     #         return
     #     else:
     #         # Same storage, just new name
     #         # TODO: should be "rename" ideally (as it is if tmp gets deleted we lose it)
     #         logger.debug("Output already on storage, creating alias")
     #         storage.get_api().create_alias(name, to_name)
     #         return
     logger.debug(
         f"Copying output from {name} {storage} to {sdb.get_name_for_storage()} {sdb.storage} ({sdb.data_format})"
     )
     result = dcp.copy(
         from_name=name,
         from_storage=storage,
         to_name=sdb.get_name_for_storage(),
         to_storage=sdb.storage,
         to_format=sdb.data_format,
         available_storages=self.execution_context.storages,
         if_exists="append",
     )
     logger.debug(f"Copied {result}")
     logger.debug(f"REMOVING NAME {name}")
     # Source object is treated as temporary: clean it up after the copy
     storage.get_api().remove(name)
Example #12
0
def make_test_run_context(**kwargs) -> ExecutionContext:
    """Build an ExecutionContext backed by a throwaway local python storage.

    Keyword arguments override any of the defaults (env, local_storage,
    target_storage, storages).
    """
    storage = Storage.from_url(url=f"python://_test_default_{rand_str(6)}", )
    test_env = make_test_env()
    defaults = dict(
        env=test_env,
        local_storage=storage,
        target_storage=storage,
        storages=[storage],
    )
    defaults.update(**kwargs)
    return ExecutionContext(**defaults)
Example #13
0
def infer_format_for_name(name: str, storage: Storage) -> DataFormat:
    """Return the data format of object ``name`` on ``storage``.

    Each format handler registered for the storage is tried in turn; the
    first non-None inference wins. Raises NotImplementedError when no
    handler recognizes the object.
    """
    for handler in get_handlers_for_storage(storage):
        inferred = handler().infer_data_format(name, storage)
        if inferred is not None:
            return inferred
    if storage.storage_engine is LocalPythonStorageEngine:
        # For local python objects, include the object itself to aid debugging
        obj = storage.get_api().get(name)
        msg = f"Could not infer format of object '{name}' `{obj}`"
    else:
        msg = f"Could not infer format of object '{name}' on storage {storage}"
    raise NotImplementedError(msg)
Example #14
0
def make_test_env(**kwargs) -> Environment:
    """Create a snapflow Environment for tests.

    A temp sqlite metadata storage is created unless one is supplied via
    ``metadata_storage``; the test schemas and the ``_test`` module are
    registered on the environment.
    """
    if "metadata_storage" not in kwargs:
        kwargs["metadata_storage"] = Storage.from_url(get_tmp_sqlite_db_url())
    env = Environment(settings=SnapflowSettings(abort_on_function_error=True), **kwargs)
    test_module = SnapflowModule("_test")
    for schema in (TestSchema1, TestSchema2, TestSchema3, TestSchema4):
        env.add_schema(schema)
    env.add_module(test_module)
    return env
Example #15
0
def test_records_to_db(url):
    """Copy python records into a database table and verify the contents."""
    source: Storage = Storage.from_url(url)
    api_cls: Type[DatabaseApi] = source.storage_engine.get_api_cls()
    if not source.get_api().dialect_is_supported():
        warnings.warn(
            f"Skipping tests for database engine {source.storage_engine.__name__} (client library not installed)"
        )
        return
    mem_storage = new_local_python_storage()
    mem_api: PythonStorageApi = mem_storage.get_api()
    with api_cls.temp_local_database() as db_url:
        table = "_test"
        db_storage = Storage.from_url(db_url)
        db_api: DatabaseStorageApi = db_storage.get_api()
        # Records: load a fresh copy into memory, then copy to the database
        mem_api.put(table, deepcopy(conformed_test_records))
        request = CopyRequest(
            table, mem_storage, table, db_storage, DatabaseTableFormat,
            test_records_schema,
        )
        RecordsToDatabaseTable().copy(request)
        with db_api.execute_sql_result(f"select * from {table}") as res:
            rows = [dict(r) for r in res]
            # Expected representation differs for sqlite vs other engines
            expected = test_records if url.startswith("sqlite") else conformed_test_records
            assert rows == expected
Example #16
0
def test_records_to_file():
    """Write records to a CSV file and read them back via the records handler."""
    tmp_dir = tempfile.gettempdir()
    fs_storage: Storage = Storage.from_url(f"file://{tmp_dir}")
    fs_api: FileSystemStorageApi = fs_storage.get_api()
    mem_storage = new_local_python_storage()
    mem_api: PythonStorageApi = mem_storage.get_api()
    key = f"_test_{rand_str()}"
    records = [{"f1": "hi", "f2": 2}]
    mem_api.put(key, records)
    request = CopyRequest(key, mem_storage, key, fs_storage, CsvFileFormat)
    RecordsToCsvFile().copy(request)
    with fs_api.open(key, newline="") as f:
        parsed = list(read_csv(f))
        handler = get_handler(RecordsFormat, mem_storage.storage_engine)
        mem_api.put("output", parsed)
        # Cast parsed strings back to the schema's types, then compare
        handler().cast_to_schema("output", mem_storage, schema=test_records_schema)
        assert mem_api.get("output") == records
Example #17
0
 def get_record_count(self, name: str, storage: Storage) -> Optional[int]:
     """Return the number of records in object ``name`` on ``storage``.

     Only implemented here for local python memory; other storage engines
     are expected to report counts directly.
     """
     # Will come directly from storage engine most of time, except python memory implemented here.
     # Use identity comparison for the engine class, consistent with other
     # engine checks in this codebase (previously `==`).
     if storage.storage_engine is LocalPythonStorageEngine:
         obj = storage.get_api().get(name)
         return len(obj)
     raise NotImplementedError
Example #18
0
 def storage(self) -> Storage:
     """The Storage this record points at, built from its stored URL."""
     url = self.storage_url
     return Storage.from_url(url)
Example #19
0
 def as_storage(self) -> Storage:
     """Wrap this object's url in a Storage instance."""
     storage_url = self.url
     return Storage(url=storage_url)