def test_example():
    env = Environment(metadata_storage="sqlite://")
    g = Graph(env)
    env.add_module(core)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", pipe="extract_dataframe", config={"dataframe": df})
    output = env.produce("n1", g)
    assert_almost_equal(output.as_dataframe(), df)
Example #2
def test_simple_import():
    dburl = get_tmp_sqlite_db_url()
    env = Environment(metadata_storage=dburl)
    g = Graph(env)
    env.add_module(core)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", function="import_dataframe", params={"dataframe": df})
    blocks = env.produce("n1", g)
    assert_almost_equal(blocks[0].as_dataframe(), df, check_dtype=False)
Example #3
def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    # TODO: We are special-casing SQL right now, but could create another DataFormat (SqlQueryFormat, non-storable).
    #       Not sure how well it fits the paradigm, though -- it's a fundamentally non-Python operation, the only one for now
    #       (if we had an R runtime or any other shell command, they would be in this bucket too).
    #       Fine here for now, but there is a generalization that might make the sql pipe less awkward (returning the sdb).
    logger.debug("CREATING DATA BLOCK from sql")
    tmp_name = f"_tmp_{rand_str(10)}".lower()
    sql = db_api.clean_sub_sql(sql)
    create_sql = f"""
    create table {tmp_name} as
    select
    *
    from (
    {sql}
    ) as __sub
    """
    db_api.execute_sql(create_sql)
    cnt = db_api.count(tmp_name)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = infer_schema_from_db_table(db_api, tmp_name)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema,
                                              nominal_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=cnt,
        created_by_node_key=created_by_node_key,
    )
    storage_url = db_api.url
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=storage_url,
        data_format=DatabaseTableFormat,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    db_api.rename_table(tmp_name, sdb.get_name())
    return block, sdb
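create_data_block_from_sql materializes the query into a temp-named table, counts it, and renames it into place once the metadata rows exist. A minimal standalone sketch of that same create-as-select-then-rename pattern, using only stdlib sqlite3 (the table, column, and temp names here are illustrative assumptions, not taken from snapflow):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table customers (id integer, name text)")
conn.execute("insert into customers values (1, 'a'), (2, 'b')")

sql = "select * from customers where id > 1"
tmp_name = "_tmp_abc123"
# Materialize the query under a throwaway name, count it, then rename
# it to its final name -- mirroring the flow above.
conn.execute(f"create table {tmp_name} as select * from ({sql}) as __sub")
cnt = conn.execute(f"select count(*) from {tmp_name}").fetchone()[0]
conn.execute(f"alter table {tmp_name} rename to final_block")
print(cnt)  # 1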
Example #4
def make_test_env(**kwargs) -> Environment:
    if "metadata_storage" not in kwargs:
        url = "sqlite://"
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
    env = Environment(**kwargs)
    test_module = SnapflowModule(
        "_test",
        schemas=[TestSchema1, TestSchema2, TestSchema3, TestSchema4],
    )
    env.add_module(test_module)
    return env
Example #5
def test_default_module():
    DEFAULT_LOCAL_MODULE.library.snaps = {}

    @Snap
    def s1():
        pass

    assert len(DEFAULT_LOCAL_MODULE.library.snaps) == 1
    assert DEFAULT_LOCAL_MODULE.get_snap("s1") is s1

    env = Environment()
    env.add_snap(s1)
    assert env.get_snap("s1") is s1
Example #6
def test_default_module():
    DEFAULT_LOCAL_MODULE.library.functions = {}

    @datafunction
    def s1():
        pass

    assert len(DEFAULT_LOCAL_MODULE.library.functions) == 1
    assert DEFAULT_LOCAL_MODULE.get_function("s1") is s1

    env = Environment()
    env.add_function(s1)
    assert env.get_function("s1") is s1
Example #7
def make_test_env(**kwargs) -> Environment:
    if "metadata_storage" not in kwargs:
        url = get_tmp_sqlite_db_url()
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
    env = Environment(settings=SnapflowSettings(abort_on_snap_error=True),
                      **kwargs)
    test_module = SnapflowModule(
        "_test",
        schemas=[TestSchema1, TestSchema2, TestSchema3, TestSchema4],
    )
    env.add_module(test_module)
    return env
Example #8
def get_env(key="_test", db_url=None):
    if db_url is None:
        db_url = get_tmp_sqlite_db_url()
    env = Environment(key=key, metadata_storage=db_url)
    env.add_module(core)
    env.add_schema(Customer)
    env.add_schema(Metric)
    return env
Example #9
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Schema = None,
    inferred_schema: Schema = None,
    created_by_node_key: str = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        inferred_schema = dro.data_format.infer_schema_from_records(
            dro.records_object)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(env, sess, inferred_schema,
                                              nominal_schema)
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
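Note the schema precedence above: a nominal schema supplied by the caller (or carried on the records object) wins, falling back to "Any", while the inferred schema is derived from the records themselves. A standalone sketch of that style of record-based field inference (illustrative only, not snapflow's infer_schema_from_records):

from typing import Any, Dict, List

def infer_fields(records: List[Dict[str, Any]]) -> Dict[str, str]:
    # Collect the non-null Python type names observed per field.
    seen: Dict[str, set] = {}
    for rec in records:
        for field, value in rec.items():
            if value is not None:
                seen.setdefault(field, set()).add(type(value).__name__)
    return {f: ts.pop() if len(ts) == 1 else "mixed" for f, ts in seen.items()}

print(infer_fields([{"a": 1, "b": "x"}, {"a": 2, "b": None}]))
# {'a': 'int', 'b': 'str'}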
Example #10
def logs(env: Environment):
    """Show log of Snaps on DataBlocks"""
    with env.get_metadata_api().begin():
        query = env.md_api.execute(select(SnapLog).order_by(SnapLog.updated_at.desc()))
        drls = []
        for dfl in query:
            if dfl.data_block_logs:
                for drl in dfl.data_block_logs:
                    r = [
                        dfl.started_at.strftime("%F %T"),
                        dfl.node_key,
                        drl.direction.display,
                        drl.data_block_id,
                    ]
                    drls.append(r)
            else:
                drls.append(
                    [
                        dfl.started_at.strftime("%F %t"),
                        f"{dfl.node_key} nothing to do",
                        "-",
                        "-",
                    ]
                )
        headers = [
            "Started",
            "_Snap",
            "Direction",
            "DataBlock",
        ]
        echo_table(headers, drls)
Example #11
def list_data_blocks(env: Environment):
    with env.get_metadata_api().begin():
        query = env.md_api.execute(
            select(DataBlockMetadata)
            .filter(~DataBlockMetadata.deleted)
            .order_by(DataBlockMetadata.created_at)
        )
        headers = [
            "ID",
            "Nominal schema",
            "Created by node",
            "# Records",
            "Stored",
        ]
        rows = [
            [
                r.id,
                r.nominal_schema_key,
                r.created_by_node_key,
                r.record_count,
                r.stored_data_blocks.count(),
            ]
            for r in query
        ]
        echo_table(headers, rows)
Example #12
def logs(env: Environment):
    """Show log of Pipes on DataBlocks"""
    with env.session_scope() as sess:
        query = sess.query(PipeLog).order_by(PipeLog.updated_at.desc())
        drls = []
        for dfl in query:
            if dfl.data_block_logs:
                for drl in dfl.data_block_logs:
                    r = [
                        dfl.started_at.strftime("%F %T"),
                        dfl.node_key,
                        drl.direction.display,
                        drl.data_block_id,
                    ]
                    drls.append(r)
            else:
                drls.append([
                    dfl.started_at.strftime("%F %t"),
                    f"{dfl.node_key} nothing to do",
                    "-",
                    "-",
                ])
        headers = [
            "Started",
            "Pipe",
            "Direction",
            "DataBlock",
        ]
        echo_table(headers, drls)
Example #13
def instantiate_node(
    env: Environment,
    graph: Graph,
    declared_node: DeclaredNode,
):
    if isinstance(declared_node.pipe, str):
        pipe = env.get_pipe(declared_node.pipe)
    else:
        pipe = make_pipe(declared_node.pipe)
    interface = pipe.get_interface()
    schema_translation = interface.assign_translations(
        declared_node.schema_translation)
    declared_inputs: Dict[str, DeclaredStreamInput] = {}
    if declared_node.upstream is not None:
        for name, stream_like in interface.assign_inputs(
                declared_node.upstream).items():
            declared_inputs[name] = DeclaredStreamInput(
                stream=ensure_stream(stream_like),
                declared_schema_translation=(schema_translation
                                             or {}).get(name),
            )
    n = Node(
        env=env,
        graph=graph,
        key=declared_node.key,
        pipe=pipe,
        config=declared_node.config,
        interface=interface,
        declared_inputs=declared_inputs,
        declared_schema_translation=schema_translation,
        output_alias=declared_node.output_alias,
    )
    return n
Example #14
def reset_metadata(env: Environment):
    """Reset metadata, all or selectively"""
    # TODO
    raise NotImplementedError
    with env.session_scope() as sess:
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}pipe_log        cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}pipe_log_id_seq cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log        cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log_id_seq cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_metadata   cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_set_metadata        cascade;"
        )
        sess.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}stored_data_resource_metadata cascade;"
        )
Example #15
def instantiate_node(
    env: Environment,
    graph: Graph,
    declared_node: DeclaredNode,
):
    if isinstance(declared_node.snap, str):
        snap = env.get_snap(declared_node.snap)
    else:
        snap = make_snap(declared_node.snap)
    interface = snap.get_interface()
    schema_translations = interface.assign_translations(
        declared_node.schema_translations
    )
    declared_inputs: Dict[str, DeclaredStreamInput] = {}
    if declared_node.inputs is not None:
        for name, stream_like in interface.assign_inputs(declared_node.inputs).items():
            declared_inputs[name] = DeclaredStreamInput(
                stream=ensure_stream(stream_like),
                declared_schema_translation=(schema_translations or {}).get(name),
            )
    n = Node(
        graph=graph,
        key=declared_node.key,
        snap=snap,
        params=declared_node.params,
        interface=interface,
        declared_inputs=declared_inputs,
        declared_schema_translation=schema_translations,
        output_alias=declared_node.output_alias,
    )
    return n
Example #16
def test_env_init():
    env = Environment(
        f"_test_{rand_str()}",
        metadata_storage="sqlite://",
        settings=SnapflowSettings(add_core_module=False),
    )
    env_init(env)
Example #17
def reset_metadata(env: Environment):
    """Reset metadata, all or selectively"""
    # TODO
    raise NotImplementedError
    with env.get_metadata_api().begin():
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}snap_log        cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}snap_log_id_seq cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log        cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_log_id_seq cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_resource_metadata   cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}data_set_metadata        cascade;"
        )
        env.md_api.execute(
            f"drop table {SNAPFLOW_METADATA_TABLE_PREFIX}stored_data_resource_metadata cascade;"
        )
Example #18
def test_env_config():
    cfg = EnvironmentConfiguration(
        f"_test_{rand_str()}",
        metadata_storage_url="sqlite://",
        settings=SnapflowSettings(add_core_module=False),
    )
    env = Environment.from_config(cfg)
    env_init(env)
Example #19
def env_init(env: Environment):
    from . import _test_module

    # Test module / components
    with env.md_api.begin():
        assert len(env.get_module_order()) == 1
        env.add_module(_test_module)
        assert env.get_module_order() == [
            env.get_local_module().name,
            _test_module.name,
        ]
        assert env.get_schema("TestSchema") is _test_module.schemas.TestSchema
        assert env.get_snap("test_sql") is _test_module.snaps.test_sql
        # Test runtime / storage
        env.add_storage("postgresql://test")
        assert len(env.storages) == 2  # added plus default local memory
        assert len(env.runtimes) == 2  # added plus default local python
Example #20
def app(ctx, debug: bool = False, metadata: Optional[str] = None):
    """Modern Data Pipelines"""
    logger.warning("The snapflow CLI is experimental and not officially supported yet")
    if debug:
        logger.add(sys.stderr, level="DEBUG")
    else:
        logger.add(sys.stderr, level="INFO")
    env = current_env()
    if env is None:
        env = Environment(metadata_storage=metadata)
    logger.info(f"Using environment '{env.metadata_storage.url}'")
    ctx.obj = env
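This is the usual context-passing pattern for a click-style CLI group: build the shared environment once in the group callback and stash it on ctx.obj for subcommands to pick up. A minimal self-contained sketch with click (the decorators are assumptions for illustration -- the snippet above doesn't show snapflow's actual ones):

import click

@click.group()
@click.option("--debug", is_flag=True)
@click.pass_context
def app(ctx, debug: bool = False):
    # Build shared state once; subcommands receive it via ctx.obj.
    ctx.obj = {"debug": debug}

@app.command()
@click.pass_obj
def run(obj):
    click.echo(f"debug={obj['debug']}")

if __name__ == "__main__":
    app()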
Example #21
def from_config(cfg: ExecutionConfiguration) -> ExecutionContext:
    env = Environment.from_config(cfg.env_config)
    return ExecutionContext(
        env=env,
        local_storage=ensure_storage(cfg.local_storage_url),
        target_storage=ensure_storage(cfg.target_storage_url),
        target_format=None,  # TODO: from config
        storages=[ensure_storage(s) for s in cfg.storage_urls],
        # logger=ExecutionLogger(),  # TODO: from config
        execution_timelimit_seconds=cfg.execution_timelimit_seconds,
        abort_on_snap_error=env.settings.abort_on_snap_error,
        execution_config=cfg,
    )
Example #22
def test_multi_env():
    db_url = get_tmp_sqlite_db_url()
    cfg = EnvironmentConfiguration(
        key=f"_test_{rand_str()}",
        metadata_storage_url=db_url,
        settings=SnapflowSettings(add_core_module=False),
    )
    env1 = Environment.from_config(cfg)
    with env1.md_api.begin():
        env1.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env1.md_api.flush()
        assert env1.md_api.count(select(DataBlockMetadata)) == 1
    cfg = EnvironmentConfiguration(
        key=f"_test_{rand_str()}",
        metadata_storage_url=db_url,
        settings=SnapflowSettings(add_core_module=False),
    )
    env2 = Environment.from_config(cfg)
    with env2.md_api.begin():
        assert env2.md_api.count(select(DataBlockMetadata)) == 0
        env2.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env2.md_api.flush()
        assert env2.md_api.count(select(DataBlockMetadata)) == 1
Example #23
def run(env: Environment, node: str = None, deps: bool = False):
    """Run snapflow pipeline"""
    if node:
        if deps:
            env.produce(node)
        else:
            env.run_node(node)
    else:
        raise NotImplementedError
        env.run_graph()
Example #24
def list_nodes(env: Environment):
    with env.session_scope() as sess:
        query = (sess.query(
            PipeLog.node_key,
            func.count(PipeLog.id),
            func.max(PipeLog.started_at),
            func.count(DataBlockLog.id),
        ).join(PipeLog.data_block_logs).group_by(PipeLog.node_key).all())
        headers = [
            "Node key",
            "Run count",
            "Last run at",
            "block count",
        ]
        rows = [(k, c, m.strftime("%F %T")) for k, c, m in query]
    echo_table(headers, rows)
Example #25
def resolve_nominal_output_schema(self,
                                  env: Environment) -> Optional[Schema]:
    if not self.output:
        return None
    if not self.output.is_generic:
        return env.get_schema(self.output.schema_like)
    output_generic = self.output.schema_like
    for input in self.inputs:
        if not input.declared_input.is_generic:
            continue
        if input.declared_input.schema_like == output_generic:
            schema = input.get_bound_nominal_schema()
            # Check for None -- more than one input may share this generic;
            # we take any that resolves
            if schema is not None:
                return schema
    raise Exception(f"Unable to resolve generic '{output_generic}'")
Example #26
def apply_schema_translation_as_sql(env: Environment, name: str,
                                    translation: SchemaTranslation) -> str:
    if not translation.from_schema_key:
        raise NotImplementedError(
            f"Schema translation must provide `from_schema` when translating a database table {translation}"
        )
    sql = column_map(
        name,
        env.get_schema(translation.from_schema_key).field_names(),
        translation.as_dict(),
    )
    table_stmt = f"""
        (
            {sql}
        ) as __translated
        """
    return table_stmt
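A standalone sketch of the kind of column-mapping SELECT that column_map plausibly renders here, assuming the translation dict maps source field names to target names (this stand-in is not snapflow's actual column_map):

def column_map_sketch(table: str, field_names, translation) -> str:
    # Alias translated columns; pass the rest through unchanged.
    cols = ", ".join(
        f"{f} as {translation[f]}" if f in translation else f
        for f in field_names
    )
    return f"select {cols} from {table}"

print(column_map_sketch("t1", ["id", "amount"], {"amount": "amt"}))
# select id, amount as amt from t1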
Example #27
def list_data_blocks(env: Environment):
    with env.session_scope() as sess:
        query = (sess.query(DataBlockMetadata).filter(
            ~DataBlockMetadata.deleted).order_by(DataBlockMetadata.created_at))
        headers = [
            "ID",
            "Nominal schema",
            "Created by node",
            "# Records",
            "Stored",
        ]
        rows = [[
            r.id,
            r.nominal_schema_key,
            r.created_by_node_key,
            r.record_count,
            r.stored_data_blocks.count(),
        ] for r in query]
        echo_table(headers, rows)
Example #28
def list_nodes(env: Environment):
    with env.get_metadata_api().begin():
        query = env.md_api.execute(
            select(
                SnapLog.node_key,
                func.count(SnapLog.id),
                func.max(SnapLog.started_at),
                func.count(DataBlockLog.id),
            )
            .join(SnapLog.data_block_logs)
            .group_by(SnapLog.node_key)
        ).all()
        headers = [
            "Node key",
            "Run count",
            "Last run at",
            "block count",
        ]
        rows = [(k, c, m.strftime("%F %T")) for k, c, m in query]
    echo_table(headers, rows)
Example #29
def realized_schema(self, env: Environment) -> Schema:
    return env.get_schema(self.realized_schema_key)
Example #30
def nominal_schema(self, env: Environment) -> Optional[Schema]:
    return env.get_schema(self.nominal_schema_key)