def test_function_failure():
    env = get_env()
    g = Graph(env)
    s = env._local_python_storage
    # Initial run: configure the source to fail partway through
    batches = 2
    cfg = {"batches": batches, "fail": True}
    source = g.create_node(customer_source, params=cfg)
    blocks = produce(source, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == 2
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 1
        assert env.md_api.count(select(DataBlockLog)) == 1
        pl = env.md_api.execute(select(DataFunctionLog)).scalar_one_or_none()
        assert pl.node_key == source.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {}
        assert pl.node_end_state == {"records_imported": chunk_size}
        assert pl.function_key == source.function.key
        assert pl.function_params == cfg
        assert pl.error is not None
        assert FAIL_MSG in pl.error["error"]
        ns = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == pl.node_key)
        ).scalar_one_or_none()
        assert ns.state == {"records_imported": chunk_size}

    # Run again without failing, should see different result
    source.params["fail"] = False
    blocks = produce(source, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == batch_size
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 2
        assert env.md_api.count(select(DataBlockLog)) == 2
        pl = (
            env.md_api.execute(
                select(DataFunctionLog).order_by(DataFunctionLog.completed_at.desc())
            )
            .scalars()
            .first()
        )
        assert pl.node_key == source.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {"records_imported": chunk_size}
        assert pl.node_end_state == {"records_imported": chunk_size + batch_size}
        assert pl.function_key == source.function.key
        assert pl.function_params == cfg
        assert pl.error is None
        ns = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == pl.node_key)
        ).scalar_one_or_none()
        assert ns.state == {"records_imported": chunk_size + batch_size}
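# `customer_source`, `chunk_size`, `batch_size`, and FAIL_MSG are fixtures
# defined elsewhere in this module. A library-free sketch of the stateful
# contract the test above pins down (purely illustrative, not the real
# fixture): a failing run still persists one chunk of progress, which the
# framework records alongside the error, and the retry resumes from there.
def _customer_source_model(state: dict, params: dict) -> int:
    """One run of the source against mutable node `state`."""
    imported = state.get("records_imported", 0)
    if params["fail"]:
        # Progress is saved before the error surfaces; compare the
        # node_end_state and error assertions on the first DataFunctionLog.
        state["records_imported"] = imported + chunk_size
        raise RuntimeError(FAIL_MSG)
    state["records_imported"] = imported + batch_size
    return batch_size  # records emitted on a clean run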
def test_exe():
    env = make_test_env()
    g = Graph(env)
    node = g.create_node(key="node", snap=snap_t1_source)
    exe = env.get_executable(node)
    result = ExecutionManager(exe).execute()
    with env.md_api.begin():
        assert not result.output_blocks
        assert env.md_api.count(select(SnapLog)) == 1
        pl = env.md_api.execute(select(SnapLog)).scalar_one_or_none()
        assert pl.node_key == node.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {}
        assert pl.node_end_state == {}
        assert pl.snap_key == node.snap.key
        assert pl.snap_params == {}
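# The execute-and-inspect pattern above, factored into a helper for reference.
# Illustrative only: it reuses exactly the calls test_exe makes; the helper
# itself is hypothetical and not part of the fixtures.
def run_node_once(env, node):
    """Execute a single node and return its result plus the recorded SnapLog."""
    exe = env.get_executable(node)
    result = ExecutionManager(exe).execute()
    with env.md_api.begin():
        log = env.md_api.execute(select(SnapLog)).scalars().first()
    return result, log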
class TestStreams:
    def setup(self):
        ctx = make_test_run_context()
        self.ctx = ctx
        self.env = ctx.env
        self.g = Graph(self.env)
        self.graph = self.g.get_metadata_obj()
        self.dr1t1 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        self.dr2t1 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        self.dr1t2 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema2",
            realized_schema_key="_test.TestSchema2",
        )
        self.dr2t2 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema2",
            realized_schema_key="_test.TestSchema2",
        )
        self.node_source = self.g.create_node(key="pipe_source", pipe=pipe_t1_source)
        self.node1 = self.g.create_node(
            key="pipe1", pipe=pipe_t1_sink, upstream="pipe_source"
        )
        self.node2 = self.g.create_node(
            key="pipe2", pipe=pipe_t1_to_t2, upstream="pipe_source"
        )
        self.node3 = self.g.create_node(
            key="pipe3", pipe=pipe_generic, upstream="pipe_source"
        )
        self.sess = self.env._get_new_metadata_session()
        self.sess.add(self.dr1t1)
        self.sess.add(self.dr2t1)
        self.sess.add(self.dr1t2)
        self.sess.add(self.dr2t2)
        self.sess.add(self.graph)

    def teardown(self):
        self.sess.close()

    def test_stream_unprocessed_pristine(self):
        s = StreamBuilder(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query(self.ctx, self.sess).first() is None

    def test_stream_unprocessed_eligible(self):
        dfl = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.sess.add_all([dfl, drl])
        s = StreamBuilder(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query(self.ctx, self.sess).first() == self.dr1t1

    def test_stream_unprocessed_ineligible_already_input(self):
        dfl = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        dfl2 = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node1.key,
            pipe_key=self.node1.pipe.key,
            runtime_url="test",
        )
        drl2 = DataBlockLog(
            pipe_log=dfl2,
            data_block=self.dr1t1,
            direction=Direction.INPUT,
        )
        self.sess.add_all([dfl, drl, dfl2, drl2])
        s = StreamBuilder(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query(self.ctx, self.sess).first() is None

    def test_stream_unprocessed_ineligible_already_output(self):
        """
        By default we don't input a block that has already been output by a DF,
        _even if that block was never input_, UNLESS the input is a self
        reference (`this`). This is to prevent infinite loops.
        """
        dfl = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        dfl2 = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node1.key,
            pipe_key=self.node1.pipe.key,
            runtime_url="test",
        )
        drl2 = DataBlockLog(
            pipe_log=dfl2,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.sess.add_all([dfl, drl, dfl2, drl2])
        s = StreamBuilder(nodes=self.node_source)
        s1 = s.filter_unprocessed(self.node1)
        assert s1.get_query(self.ctx, self.sess).first() is None
        # But ok with self reference
        s2 = s.filter_unprocessed(self.node1, allow_cycle=True)
        assert s2.get_query(self.ctx, self.sess).first() == self.dr1t1

    def test_stream_unprocessed_eligible_schema(self):
        dfl = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.sess.add_all([dfl, drl])
        s = StreamBuilder(nodes=self.node_source, schema="TestSchema1")
        s = s.filter_unprocessed(self.node1)
        assert s.get_query(self.ctx, self.sess).first() == self.dr1t1
        s = StreamBuilder(nodes=self.node_source, schema="TestSchema2")
        s = s.filter_unprocessed(self.node1)
        assert s.get_query(self.ctx, self.sess).first() is None

    def test_operators(self):
        dfl = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        drl2 = DataBlockLog(
            pipe_log=dfl,
            data_block=self.dr2t1,
            direction=Direction.OUTPUT,
        )
        self.sess.add_all([dfl, drl, drl2])

        self._cnt = 0

        @operator
        def count(stream: DataBlockStream) -> DataBlockStream:
            for db in stream:
                self._cnt += 1
                yield db

        sb = StreamBuilder(nodes=self.node_source)
        expected_cnt = sb.get_query(self.ctx, self.sess).count()
        assert expected_cnt == 2
        list(count(sb).as_managed_stream(self.ctx, self.sess))
        assert self._cnt == expected_cnt

        # Test composed operators
        self._cnt = 0
        list(count(latest(sb)).as_managed_stream(self.ctx, self.sess))
        assert self._cnt == 1

        # Test kwargs
        self._cnt = 0
        list(
            count(filter(sb, function=lambda db: False)).as_managed_stream(
                self.ctx, self.sess
            )
        )
        assert self._cnt == 0
class TestStreams:
    def setup(self):
        ctx = make_test_run_context()
        self.ctx = ctx
        self.env = ctx.env
        self.sess = self.env.md_api.begin()
        self.sess.__enter__()
        self.g = Graph(self.env)
        self.graph = self.g.get_metadata_obj()
        self.dr1t1 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        self.dr2t1 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        self.dr1t2 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema2",
            realized_schema_key="_test.TestSchema2",
        )
        self.dr2t2 = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema2",
            realized_schema_key="_test.TestSchema2",
        )
        self.node_source = self.g.create_node(key="snap_source", snap=snap_t1_source)
        self.node1 = self.g.create_node(
            key="snap1", snap=snap_t1_sink, input="snap_source"
        )
        self.node2 = self.g.create_node(
            key="snap2", snap=snap_t1_to_t2, input="snap_source"
        )
        self.node3 = self.g.create_node(
            key="snap3", snap=snap_generic, input="snap_source"
        )
        self.env.md_api.add(self.dr1t1)
        self.env.md_api.add(self.dr2t1)
        self.env.md_api.add(self.dr1t2)
        self.env.md_api.add(self.dr2t2)
        self.env.md_api.add(self.graph)

    def teardown(self):
        self.sess.__exit__(None, None, None)

    def test_stream_unprocessed_pristine(self):
        s = stream(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query_result(self.env).scalar_one_or_none() is None

    def test_stream_unprocessed_eligible(self):
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.env.md_api.add_all([dfl, drl])
        s = stream(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query_result(self.env).scalar_one_or_none() == self.dr1t1

    def test_stream_unprocessed_ineligible_already_input(self):
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        dfl2 = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node1.key,
            snap_key=self.node1.snap.key,
            runtime_url="test",
        )
        drl2 = DataBlockLog(
            snap_log=dfl2,
            data_block=self.dr1t1,
            direction=Direction.INPUT,
        )
        self.env.md_api.add_all([dfl, drl, dfl2, drl2])
        s = stream(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        assert s.get_query_result(self.env).scalar_one_or_none() is None

    def test_stream_unprocessed_ineligible_already_output(self):
        """
        By default we don't input a block that has already been output by a DF,
        _even if that block was never input_, UNLESS the input is a self
        reference (`this`). This is to prevent infinite loops.
        """
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        dfl2 = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node1.key,
            snap_key=self.node1.snap.key,
            runtime_url="test",
        )
        drl2 = DataBlockLog(
            snap_log=dfl2,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.env.md_api.add_all([dfl, drl, dfl2, drl2])
        s = stream(nodes=self.node_source)
        s1 = s.filter_unprocessed(self.node1)
        assert s1.get_query_result(self.env).scalar_one_or_none() is None
        # But ok with self reference
        s2 = s.filter_unprocessed(self.node1, allow_cycle=True)
        assert s2.get_query_result(self.env).scalar_one_or_none() == self.dr1t1

    def test_stream_unprocessed_eligible_schema(self):
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        self.env.md_api.add_all([dfl, drl])
        s = stream(nodes=self.node_source, schema="TestSchema1")
        s = s.filter_unprocessed(self.node1)
        assert s.get_query_result(self.env).scalar_one_or_none() == self.dr1t1
        s = stream(nodes=self.node_source, schema="TestSchema2")
        s = s.filter_unprocessed(self.node1)
        assert s.get_query_result(self.env).scalar_one_or_none() is None

    def test_operators(self):
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        drl2 = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr2t1,
            direction=Direction.OUTPUT,
        )
        self.env.md_api.add_all([dfl, drl, drl2])

        self._cnt = 0

        @operator
        def count(stream: DataBlockStream) -> DataBlockStream:
            for db in stream:
                self._cnt += 1
                yield db

        sb = stream(nodes=self.node_source)
        expected_cnt = sb.get_count(self.env)
        assert expected_cnt == 2
        list(count(sb).as_managed_stream(self.ctx))
        assert self._cnt == expected_cnt

        # Test composed operators
        self._cnt = 0
        list(count(latest(sb)).as_managed_stream(self.ctx))
        assert self._cnt == 1

        # Test kwargs
        self._cnt = 0
        list(
            count(filter(sb, function=lambda db: False)).as_managed_stream(self.ctx)
        )
        assert self._cnt == 0

    def test_managed_stream(self):
        dfl = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            snap_key=self.node_source.snap.key,
            runtime_url="test",
        )
        drl = DataBlockLog(
            snap_log=dfl,
            data_block=self.dr1t1,
            direction=Direction.OUTPUT,
        )
        dfl2 = SnapLog(
            graph_id=self.graph.hash,
            node_key=self.node1.key,
            snap_key=self.node1.snap.key,
            runtime_url="test",
        )
        drl2 = DataBlockLog(
            snap_log=dfl2,
            data_block=self.dr1t1,
            direction=Direction.INPUT,
        )
        self.env.md_api.add_all([dfl, drl, dfl2, drl2])
        s = stream(nodes=self.node_source)
        s = s.filter_unprocessed(self.node1)
        ctx = make_test_run_context()
        with ctx.env.md_api.begin():
            dbs = ManagedDataBlockStream(ctx, stream_builder=s)
            with pytest.raises(StopIteration):
                assert next(dbs) is None