def test_exe_output():
    """Execute a source snap and verify the output block, its alias, and its log entry."""
    env = make_test_env()
    env.add_module(core)
    g = Graph(env)
    # env.add_storage("python://test")
    # rt = env.runtimes[0]
    # TODO: this is error because no data copy between SAME storage engines (but DIFFERENT storage urls) currently
    # ec = env.get_run_context(g, current_runtime=rt, target_storage=env.storages[0])
    # ec = env.get_run_context(g, current_runtime=rt, target_storage=rt.as_storage())
    alias_name = "node_output"
    source_node = g.create_node(key="node", snap=snap_dl_source, output_alias=alias_name)
    result = ExecutionManager(env.get_executable(source_node)).execute()
    with env.md_api.begin():
        block = result.get_output_block(env)
        assert block is not None
        assert block.as_records() == mock_dl_output
        assert block.nominal_schema is TestSchema4
        assert len(block.realized_schema.fields) == len(TestSchema4.fields)
        # Test alias was created correctly
        stored_alias = env.md_api.execute(
            select(Alias).filter(Alias.alias == alias_name)
        ).scalar_one_or_none()
        assert stored_alias.data_block_id == block.data_block_id
        # Exactly one OUTPUT log entry, pointing at the produced block
        assert env.md_api.count(select(DataBlockLog)) == 1
        log_entry = env.md_api.execute(select(DataBlockLog)).scalar_one_or_none()
        assert log_entry.data_block_id == block.data_block_id
        assert log_entry.direction == Direction.OUTPUT
def test_worker_output():
    """Run a source pipe through a Worker and check the block contents and alias."""
    env = make_test_env()
    env.add_module(core)
    g = Graph(env)
    # env.add_storage("python://test")
    with env.session_scope() as sess:
        rt = env.runtimes[0]
        # TODO: this is error because no data copy between SAME storage engines (but DIFFERENT storage urls) currently
        # ec = env.get_run_context(g, current_runtime=rt, target_storage=env.storages[0])
        ec = env.get_run_context(g, current_runtime=rt, target_storage=rt.as_storage())
        alias_name = "node_output"
        node = g.create_node(key="node", pipe=pipe_dl_source, output_alias=alias_name)
        worker = Worker(ec)
        interface_mgr = NodeInterfaceManager(ec, sess, node)
        bound_interface = interface_mgr.get_bound_interface()
        executable = Executable(
            node.key,
            CompiledPipe(node.pipe.key, node.pipe),
            bound_interface,
        )
        run_result = worker.execute(executable)
        raw_block = run_result.output_block
        assert raw_block is not None
        raw_block = sess.merge(raw_block)
        block = raw_block.as_managed_data_block(ec, sess)
        assert block.as_records() == mock_dl_output
        assert block.nominal_schema is TestSchema4
        assert len(block.realized_schema.fields) == len(TestSchema4.fields)
        # Test alias was created correctly
        assert (
            sess.query(Alias).filter(Alias.alias == alias_name).first().data_block_id
            == block.data_block_id
        )
def test_declared_schema_translation():
    """An explicitly declared schema translation is passed through unchanged."""
    ec = make_test_run_context()
    env = ec.env
    g = Graph(env)
    declared = {"f1": "mapped_f1"}
    n1 = g.create_node(
        key="node1",
        function=function_t1_to_t2,
        input="n0",
        schema_translation=declared,
    )
    interface = n1.get_interface()
    # im = NodeInterfaceManager(ctx=ec, node=n1)
    block = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    # stream = block_as_stream(block, ec, pi.inputs[0].schema(env), translation)
    # bi = im.get_bound_stream_interface({"input": stream})
    # assert len(bi.inputs) == 1
    # input: StreamInput = bi.inputs[0]
    with env.md_api.begin():
        resolved = get_schema_translation(
            env,
            block.realized_schema(env),
            target_schema=env.get_schema(
                interface.get_single_non_recursive_input().schema_like
            ),
            declared_schema_translation=declared,
        )
        assert resolved.as_dict() == declared
def test_natural_schema_translation():
    # TODO
    """Currently exercises the same declared-translation path; natural inference TODO."""
    ec = make_test_run_context()
    env = ec.env
    g = Graph(env)
    declared = {"f1": "mapped_f1"}
    n1 = g.create_node(
        key="node1",
        function=function_t1_to_t2,
        input="n0",
        schema_translation=declared,
    )
    interface = n1.get_interface()
    # im = NodeInterfaceManager(ctx=ec, node=n1)
    block = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    with env.md_api.begin():
        resolved = get_schema_translation(
            env,
            block.realized_schema(env),
            target_schema=env.get_schema(
                interface.get_single_non_recursive_input().schema_like
            ),
            declared_schema_translation=declared,
        )
        assert resolved.as_dict() == declared
def test_node_reset():
    """Resetting a node clears its stored state so a re-run starts from scratch."""
    env = get_env()
    g = Graph(env)
    storage = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches}
    source = g.create_node(customer_source, params=cfg)
    accum = g.create_node("core.accumulator", input=source)
    metrics = g.create_node(shape_metrics, input=accum)
    # Run first time
    produce(source, graph=g, target_storage=storage, env=env)
    # Now reset node
    with env.md_api.begin():
        state = source.get_state(env)
        assert state.state is not None
        source.reset(env)
        assert source.get_state(env) is None
    blocks = produce(metrics, graph=g, target_storage=storage, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    expected = [
        {"metric": "row_count", "value": batch_size},  # Just one run of source, not two
        {"metric": "col_count", "value": 3},
    ]
    assert records == expected
def test_example():
    """End-to-end smoke test: extract a DataFrame through a one-node graph."""
    env = Environment(metadata_storage="sqlite://")
    g = Graph(env)
    env.add_module(core)
    frame = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", pipe="extract_dataframe", config={"dataframe": frame})
    output = env.produce("n1", g)
    assert_almost_equal(output.as_dataframe(), frame)
def test_simple_import():
    """Import a DataFrame via a one-node graph backed by a temp sqlite metadata DB."""
    db_url = get_tmp_sqlite_db_url()
    env = Environment(metadata_storage=db_url)
    g = Graph(env)
    env.add_module(core)
    frame = pd.DataFrame({"a": range(10), "b": range(10)})
    g.create_node(key="n1", function="import_dataframe", params={"dataframe": frame})
    blocks = env.produce("n1", g)
    assert_almost_equal(blocks[0].as_dataframe(), frame, check_dtype=False)
def test_function_failure():
    """A failing function logs the error plus partial state; a later successful
    run resumes from that saved state."""
    env = get_env()
    g = Graph(env)
    storage = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches, "fail": True}
    source = g.create_node(customer_source, params=cfg)
    blocks = produce(source, graph=g, target_storage=storage, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == 2
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 1
        assert env.md_api.count(select(DataBlockLog)) == 1
        fn_log = env.md_api.execute(select(DataFunctionLog)).scalar_one_or_none()
        assert fn_log.node_key == source.key
        assert fn_log.graph_id == g.get_metadata_obj().hash
        assert fn_log.node_start_state == {}
        assert fn_log.node_end_state == {"records_imported": chunk_size}
        assert fn_log.function_key == source.function.key
        assert fn_log.function_params == cfg
        # The failure must be captured in the log record
        assert fn_log.error is not None
        assert FAIL_MSG in fn_log.error["error"]
        node_state = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == fn_log.node_key)
        ).scalar_one_or_none()
        assert node_state.state == {"records_imported": chunk_size}
    # Run again without failing, should see different result
    source.params["fail"] = False
    blocks = produce(source, graph=g, target_storage=storage, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == batch_size
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 2
        assert env.md_api.count(select(DataBlockLog)) == 2
        fn_log = (
            env.md_api.execute(
                select(DataFunctionLog).order_by(DataFunctionLog.completed_at.desc())
            )
            .scalars()
            .first()
        )
        assert fn_log.node_key == source.key
        assert fn_log.graph_id == g.get_metadata_obj().hash
        # State carried over from the failed run
        assert fn_log.node_start_state == {"records_imported": chunk_size}
        assert fn_log.node_end_state == {"records_imported": chunk_size + batch_size}
        assert fn_log.function_key == source.function.key
        assert fn_log.function_params == cfg
        assert fn_log.error is None
        node_state = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == fn_log.node_key)
        ).scalar_one_or_none()
        assert node_state.state == {"records_imported": chunk_size + batch_size}
def test_node_no_inputs():
    """A source pipe node has an output but no declared inputs."""
    env = make_test_env()
    g = Graph(env)
    source_pipe = pipe(pipe_t1_source)
    node1 = g.create_node(key="node1", pipe=source_pipe)
    assert {node1: node1}[node1] is node1  # Test hash
    interface = node1.get_interface()
    assert interface.inputs == []
    assert interface.output is not None
    assert node1.declared_inputs == {}
def test_node_no_inputs():
    """A source function node has outputs but no declared inputs."""
    env = make_test_env()
    g = Graph(env)
    source_fn = datafunction(function_t1_source)
    node1 = g.create_node(key="node1", function=source_fn)
    assert {node1: node1}[node1] is node1  # Test hash
    interface = node1.get_interface()
    assert interface.inputs == {}
    assert interface.outputs != {}
    assert node1.declared_inputs == {}
def test_non_terminating_snap():
    """A snap that returns nothing produces no output block."""

    def never_stop(input: Optional[DataBlock] = None) -> DataFrame:
        pass

    env = make_test_env()
    g = Graph(env)
    node = g.create_node(key="node", snap=never_stop)
    result = ExecutionManager(env.get_executable(node)).execute()
    assert result.get_output_block(env) is None
def test_node_inputs():
    """A sink pipe wired to an upstream node exposes exactly one input."""
    env = make_test_env()
    g = Graph(env)
    upstream_node = g.create_node(key="node", pipe=pipe(pipe_t1_source))
    sink_node = g.create_node(key="node1", pipe=pipe(pipe_t1_sink), upstream=upstream_node)
    interface = sink_node.get_interface()
    assert len(interface.inputs) == 1
    assert interface.output == make_default_output_annotation()
    assert list(sink_node.declared_inputs.keys()) == ["input"]
def test_node_inputs():
    """A sink function wired to an upstream node exposes exactly one input."""
    env = make_test_env()
    g = Graph(env)
    upstream_node = g.create_node(key="node", function=datafunction(function_t1_source))
    sink_node = g.create_node(
        key="node1", function=datafunction(function_t1_sink), input=upstream_node
    )
    interface = sink_node.get_interface()
    assert len(interface.inputs) == 1
    assert interface.outputs == DEFAULT_OUTPUTS
    assert list(sink_node.declared_inputs.keys()) == ["input"]
def test_node_params():
    """Declared params are delivered to the function at run time."""
    env = make_test_env()
    g = Graph(env)
    seen = []

    def function_ctx(ctx: DataFunctionContext, test: str):
        seen.append(test)

    node = g.create_node(key="ctx", function=function_ctx, params={"test": 1})
    env.run_node(node, g)
    assert seen == [1]
def test_node_inputs():
    """A sink snap wired to an upstream node exposes exactly one input."""
    env = make_test_env()
    g = Graph(env)
    upstream_node = g.create_node(key="node", snap=Snap(snap_t1_source))
    sink_node = g.create_node(key="node1", snap=Snap(snap_t1_sink), input=upstream_node)
    interface = sink_node.get_interface()
    assert len(interface.inputs) == 1
    assert interface.output == make_default_output()
    assert list(sink_node.declared_inputs.keys()) == ["input"]
def test_node_config():
    """Config values declared on a node are readable from the pipe context."""
    env = make_test_env()
    g = Graph(env)
    seen = []

    def pipe_ctx(ctx: PipeContext):
        seen.append(ctx.get_config_value("test"))

    node = g.create_node(key="ctx", pipe=pipe_ctx, config={"test": 1, "extra_arg": 2})
    with env.run(g) as exe:
        exe.execute(node)
    assert seen == [1]
def test_non_terminating_pipe():
    """A pipe that returns nothing yields no output even when run to exhaustion."""

    def never_stop(input: Optional[DataBlock] = None) -> DataFrame:
        pass

    env = make_test_env()
    g = Graph(env)
    rt = env.runtimes[0]
    run_ctx = env.get_run_context(g, current_runtime=rt)
    node = g.create_node(key="node", pipe=never_stop)
    manager = ExecutionManager(run_ctx)
    output = manager.execute(node, to_exhaustion=True)
    assert output is None
def test_multi_env():
    """Two environments sharing one metadata DB keep independent log counts."""
    env1 = get_env(key="e1")
    g = Graph(env1)
    storage = env1._local_python_storage
    # Initial graph
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    # Run first time
    blocks = produce(metrics, graph=g, target_storage=storage, env=env1)
    assert len(blocks) == 1
    with env1.md_api.begin():
        assert env1.md_api.count(select(DataFunctionLog)) == 2
        assert env1.md_api.count(select(DataBlockLog)) == 3

    # Second environment points at the same metadata database
    env2 = get_env(key="e2", db_url=env1.metadata_storage.url)
    g = Graph(env2)
    storage = env2._local_python_storage
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    blocks = produce(metrics, graph=g, target_storage=storage, env=env2)
    assert len(blocks) == 1
    with env2.md_api.begin():
        assert env2.md_api.count(select(DataFunctionLog)) == 2
        assert env2.md_api.count(select(DataBlockLog)) == 3
def test_exe():
    """Executing a source snap writes exactly one SnapLog with empty state."""
    env = make_test_env()
    g = Graph(env)
    node = g.create_node(key="node", snap=snap_t1_source)
    result = ExecutionManager(env.get_executable(node)).execute()
    with env.md_api.begin():
        assert not result.output_blocks
        assert env.md_api.count(select(SnapLog)) == 1
        snap_log = env.md_api.execute(select(SnapLog)).scalar_one_or_none()
        assert snap_log.node_key == node.key
        assert snap_log.graph_id == g.get_metadata_obj().hash
        assert snap_log.node_start_state == {}
        assert snap_log.node_end_state == {}
        assert snap_log.snap_key == node.snap.key
        assert snap_log.snap_params == {}
def test_node_params():
    """Declared params reach the snap via its context; extras are ignored."""
    env = make_test_env()
    g = Graph(env)
    seen = []

    @Param("test", "str")
    def snap_ctx(ctx: SnapContext):
        seen.append(ctx.get_param("test"))

    node = g.create_node(key="ctx", snap=snap_ctx, params={"test": 1, "extra_arg": 2})
    env.run_node(node, g)
    assert seen == [1]
def test_ref_input():
    """A reference input stays bound after consumption; a regular input is exhausted."""
    env = get_env()
    g = Graph(env)
    storage = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches}
    source = g.create_node(customer_source, params=cfg)
    accum = g.create_node("core.accumulator", input=source)
    metrics = g.create_node(shape_metrics, input=accum)
    join_ref = g.create_node(
        with_latest_metrics, inputs={"metrics": metrics, "cust": source}
    )
    join = g.create_node(
        with_latest_metrics_no_ref, inputs={"metrics": metrics, "cust": source}
    )
    # Run once, for one metrics output
    produce(metrics, graph=g, target_storage=storage, env=env)
    # Both joins work
    assert env.run_node(join_ref, g, target_storage=storage).output_blocks
    assert env.run_node(join, g, target_storage=storage).output_blocks
    # Run source to create new customers, but NOT new metrics
    env.run_node(source, g, target_storage=storage)
    # This time only ref will still have a metrics input
    assert env.run_node(join_ref, g, target_storage=storage).output_blocks
    # Regular join has exhausted metrics
    assert not env.run_node(join, g, target_storage=storage).output_blocks
def test_generic_schema_resolution():
    """A generic pipe resolves its nominal output schema from the bound input."""
    ec = make_test_run_context()
    env = ec.env
    g = Graph(env)
    n1 = g.create_node(key="node1", pipe=pipe_generic, upstream="n0")
    # pi = n1.get_interface()
    with env.session_scope() as sess:
        interface_mgr = NodeInterfaceManager(ctx=ec, sess=sess, node=n1)
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema2",
        )
        sess.add(block)
        sess.flush([block])
        stream = block_as_stream(block, ec, sess)
        bound = interface_mgr.get_bound_interface({"input": stream})
        assert len(bound.inputs) == 1
        assert bound.resolve_nominal_output_schema(env, sess) is TestSchema1
def test_worker():
    """A Worker executing a source pipe with no data yields no output block."""
    env = make_test_env()
    g = Graph(env)
    rt = env.runtimes[0]
    run_ctx = env.get_run_context(g, current_runtime=rt)
    with env.session_scope() as sess:
        node = g.create_node(key="node", pipe=pipe_t1_source)
        worker = Worker(run_ctx)
        interface_mgr = NodeInterfaceManager(run_ctx, sess, node)
        bound = interface_mgr.get_bound_interface()
        executable = Executable(
            node.key,
            CompiledPipe(node.pipe.key, node.pipe),
            bound,
        )
        run_result = worker.execute(executable)
        assert run_result.output_block is None
def test_generic_schema_resolution():
    """A generic function resolves its nominal output schema from the bound input."""
    ec = make_test_run_context()
    env = ec.env
    g = Graph(env)
    n1 = g.create_node(key="node1", function=function_generic, input="n0")
    # pi = n1.get_interface()
    with env.md_api.begin():
        exe = Executable(node=n1, function=n1.function, execution_context=ec)
        interface_mgr = NodeInterfaceManager(exe)
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema2",
        )
        env.md_api.add(block)
        env.md_api.flush([block])
        stream = block_as_stream(block, ec)
        bound = interface_mgr.get_bound_interface({"input": stream})
        assert len(bound.inputs) == 1
        assert bound.resolve_nominal_output_schema(env) is TestSchema1
def test_alternate_apis():
    """The produce() entry point yields one metrics block with expected records."""
    env = get_env()
    g = Graph(env)
    storage = env._local_python_storage
    # Initial graph
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    # Run first time
    blocks = produce(metrics, graph=g, target_storage=storage, env=env)
    assert len(blocks) == 1
    output = blocks[0]
    assert output.nominal_schema_key.endswith("Metric")
    expected = [
        {"metric": "row_count", "value": batch_size},
        {"metric": "col_count", "value": 3},
    ]
    assert output.as_records() == expected
def test_non_terminating_function_with_reference_input():
    """A function with a Reference input that returns nothing produces no block."""

    def never_stop(input: Optional[Reference]) -> DataFrame:
        # Does not use input but doesn't matter cause reference
        pass

    env = make_test_env()
    g = Graph(env)
    source = g.create_node(
        function="core.import_dataframe",
        params={"dataframe": pd.DataFrame({"a": range(10)})},
    )
    node = g.create_node(key="node", function=never_stop, input=source)
    source_exe = env.get_executable(source)
    # TODO: reference inputs need to log too? (So they know when to update)
    # with env.md_api.begin():
    #     assert env.md_api.count(select(DataBlockLog)) == 1
    ExecutionManager(source_exe).execute()
    result = ExecutionManager(env.get_executable(node)).execute()
    assert result.get_output_block(env) is None
def setup(self):
    """Build the shared fixture: run context, open session, graph, nodes, blocks."""
    ctx = make_test_run_context()
    self.ctx = ctx
    self.env = ctx.env
    # Hold an open metadata session for the duration of the test
    self.sess = self.env.md_api.begin()
    self.sess.__enter__()
    self.g = Graph(self.env)
    self.graph = self.g.get_metadata_obj()
    self.dr1t1 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    self.dr2t1 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    self.dr1t2 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema2",
    )
    self.dr2t2 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema2",
    )
    self.node_source = self.g.create_node(key="snap_source", snap=snap_t1_source)
    self.node1 = self.g.create_node(key="snap1", snap=snap_t1_sink, input="snap_source")
    self.node2 = self.g.create_node(key="snap2", snap=snap_t1_to_t2, input="snap_source")
    self.node3 = self.g.create_node(key="snap3", snap=snap_generic, input="snap_source")
    # Register all fixture objects with the metadata API
    for md_obj in (self.dr1t1, self.dr2t1, self.dr1t2, self.dr2t2, self.graph):
        self.env.md_api.add(md_obj)
def setup(self):
    """Build the shared fixture: run context, graph, pipe nodes, blocks, session."""
    ctx = make_test_run_context()
    self.ctx = ctx
    self.env = ctx.env
    self.g = Graph(self.env)
    self.graph = self.g.get_metadata_obj()
    self.dr1t1 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    self.dr2t1 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema1",
        realized_schema_key="_test.TestSchema1",
    )
    self.dr1t2 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema2",
    )
    self.dr2t2 = DataBlockMetadata(
        nominal_schema_key="_test.TestSchema2",
        realized_schema_key="_test.TestSchema2",
    )
    self.node_source = self.g.create_node(key="pipe_source", pipe=pipe_t1_source)
    self.node1 = self.g.create_node(key="pipe1", pipe=pipe_t1_sink, upstream="pipe_source")
    self.node2 = self.g.create_node(key="pipe2", pipe=pipe_t1_to_t2, upstream="pipe_source")
    self.node3 = self.g.create_node(key="pipe3", pipe=pipe_generic, upstream="pipe_source")
    # Open a fresh metadata session and register all fixture objects
    self.sess = self.env._get_new_metadata_session()
    for md_obj in (self.dr1t1, self.dr2t1, self.dr1t2, self.dr2t2, self.graph):
        self.sess.add(md_obj)
def test_alternate_apis():
    """produce() on a config-driven source yields a single metrics output."""
    env = get_env()
    g = Graph(env)
    storage = env._local_python_storage
    # Initial graph
    N = 2 * 4
    source = g.create_node(customer_source, config={"total_records": N})
    metrics = g.create_node(shape_metrics, upstream=source)
    # Run first time
    output = produce(metrics, graph=g, target_storage=storage, env=env)
    assert output.nominal_schema_key.endswith("Metric")
    expected = [
        {"metric": "row_count", "value": 4},
        {"metric": "col_count", "value": 3},
    ]
    assert output.as_records() == expected
def make_test_run_context(**kwargs) -> RunContext:
    """Build a RunContext backed by a throwaway in-memory python storage.

    Any keyword argument overrides the corresponding default below.
    """
    storage = Storage.from_url(url=f"python://_test_default_{rand_str(6)}")
    env = make_test_env()
    defaults = dict(
        graph=Graph(env),
        env=env,
        runtimes=[Runtime.from_storage(storage)],
        storages=[storage],
        local_python_storage=storage,
        target_storage=storage,
    )
    defaults.update(**kwargs)
    return RunContext(**defaults)