def test_managed_stream(self):
    """A block already consumed by the downstream node is filtered out.

    dr1t1 is logged as OUTPUT of node_source and as INPUT of node1, so
    a stream over node_source filtered to unprocessed-by-node1 must be
    empty, and the managed stream must raise StopIteration immediately.
    """
    src_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        snap_key=self.node_source.snap.key,
        runtime_url="test",
    )
    out_log = DataBlockLog(
        snap_log=src_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    dest_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        snap_key=self.node1.snap.key,
        runtime_url="test",
    )
    in_log = DataBlockLog(
        snap_log=dest_log,
        data_block=self.dr1t1,
        direction=Direction.INPUT,
    )
    self.env.md_api.add_all([src_log, out_log, dest_log, in_log])
    s = stream(nodes=self.node_source)
    s = s.filter_unprocessed(self.node1)
    ctx = make_test_run_context()
    with ctx.env.md_api.begin():
        dbs = ManagedDataBlockStream(ctx, stream_builder=s)
        # Fix: the original wrote `assert next(dbs) is None` here, but
        # next() raises StopIteration before the assert can evaluate,
        # making the assertion dead code. A bare next() states the
        # intent: the stream is exhausted from the start.
        with pytest.raises(StopIteration):
            next(dbs)
def test_stream_unprocessed_ineligible_already_input(self):
    """A block node1 has already logged as INPUT is not offered again.

    dr1t1 is recorded both as node_source OUTPUT and node1 INPUT; the
    unprocessed filter must therefore yield nothing for node1.
    """
    source_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    emitted = DataBlockLog(
        pipe_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    consumer_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        pipe_key=self.node1.pipe.key,
        runtime_url="test",
    )
    consumed = DataBlockLog(
        pipe_log=consumer_log,
        data_block=self.dr1t1,
        direction=Direction.INPUT,
    )
    self.sess.add_all([source_log, emitted, consumer_log, consumed])
    builder = StreamBuilder(nodes=self.node_source).filter_unprocessed(self.node1)
    assert builder.get_query(self.ctx, self.sess).first() is None
def test_stream_unprocessed_ineligible_already_input(self):
    """A block node1 has already logged as INPUT is not offered again.

    dr1t1 is recorded both as node_source OUTPUT and node1 INPUT; the
    unprocessed filter must therefore yield nothing for node1.
    """
    source_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        snap_key=self.node_source.snap.key,
        runtime_url="test",
    )
    emitted = DataBlockLog(
        snap_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    consumer_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        snap_key=self.node1.snap.key,
        runtime_url="test",
    )
    consumed = DataBlockLog(
        snap_log=consumer_log,
        data_block=self.dr1t1,
        direction=Direction.INPUT,
    )
    self.env.md_api.add_all([source_log, emitted, consumer_log, consumed])
    builder = stream(nodes=self.node_source).filter_unprocessed(self.node1)
    assert builder.get_query_result(self.env).scalar_one_or_none() is None
def log(self, block: DataBlockMetadata, direction: Direction):
    """Record that `block` moved through this pipe in `direction`.

    Creates a DataBlockLog entry tied to the current pipe_log,
    timestamped now, and stages it on the metadata session.
    """
    entry = DataBlockLog(  # type: ignore
        pipe_log=self.pipe_log,
        data_block=block,
        direction=direction,
        processed_at=utcnow(),
    )
    self.metadata_session.add(entry)
def test_operators(self):
    """Stream operators compose and see the expected number of blocks."""
    pipe_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    first_out = DataBlockLog(
        pipe_log=pipe_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    second_out = DataBlockLog(
        pipe_log=pipe_log,
        data_block=self.dr2t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([pipe_log, first_out, second_out])
    self._cnt = 0

    @operator
    def count(stream: DataBlockStream) -> DataBlockStream:
        # Pass-through operator that tallies blocks into self._cnt.
        for block in stream:
            self._cnt += 1
            yield block

    builder = StreamBuilder(nodes=self.node_source)
    expected_cnt = builder.get_query(self.ctx, self.sess).count()
    assert expected_cnt == 2
    list(count(builder).as_managed_stream(self.ctx, self.sess))
    assert self._cnt == expected_cnt
    # Composed operators: latest() collapses the stream to one block.
    self._cnt = 0
    list(count(latest(builder)).as_managed_stream(self.ctx, self.sess))
    assert self._cnt == 1
    # Keyword arguments: a filter rejecting everything yields nothing.
    self._cnt = 0
    rejected = count(filter(builder, function=lambda db: False))
    list(rejected.as_managed_stream(self.ctx, self.sess))
    assert self._cnt == 0
def test_stream_unprocessed_ineligible_already_output(self):
    """
    By default we don't input a block that has already been output by a DF,
    _even if that block was never input_, UNLESS the input is a self
    reference (`this`). This is to prevent infinite loops.
    """
    source_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    emitted_upstream = DataBlockLog(
        pipe_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    node1_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        pipe_key=self.node1.pipe.key,
        runtime_url="test",
    )
    emitted_by_node1 = DataBlockLog(
        pipe_log=node1_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([source_log, emitted_upstream, node1_log, emitted_by_node1])
    base = StreamBuilder(nodes=self.node_source)
    # Default: a block node1 already produced is ineligible.
    no_cycle = base.filter_unprocessed(self.node1)
    assert no_cycle.get_query(self.ctx, self.sess).first() is None
    # With an explicit self reference, the block is allowed back in.
    with_cycle = base.filter_unprocessed(self.node1, allow_cycle=True)
    assert with_cycle.get_query(self.ctx, self.sess).first() == self.dr1t1
def test_stream_unprocessed_ineligible_already_output(self):
    """
    By default we don't input a block that has already been output by a DF,
    _even if that block was never input_, UNLESS the input is a self
    reference (`this`). This is to prevent infinite loops.
    """
    source_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        snap_key=self.node_source.snap.key,
        runtime_url="test",
    )
    emitted_upstream = DataBlockLog(
        snap_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    node1_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        snap_key=self.node1.snap.key,
        runtime_url="test",
    )
    emitted_by_node1 = DataBlockLog(
        snap_log=node1_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.env.md_api.add_all(
        [source_log, emitted_upstream, node1_log, emitted_by_node1]
    )
    base = stream(nodes=self.node_source)
    # Default: a block node1 already produced is ineligible.
    no_cycle = base.filter_unprocessed(self.node1)
    assert no_cycle.get_query_result(self.env).scalar_one_or_none() is None
    # With an explicit self reference, the block is allowed back in.
    with_cycle = base.filter_unprocessed(self.node1, allow_cycle=True)
    assert with_cycle.get_query_result(self.env).scalar_one_or_none() == self.dr1t1
def test_stream_unprocessed_eligible(self):
    """An upstream OUTPUT block never consumed by node1 is eligible."""
    func_log = DataFunctionLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        function_key=self.node_source.function.key,
        runtime_url="test",
    )
    emitted = DataBlockLog(
        function_log=func_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.env.md_api.add_all([func_log, emitted])
    builder = stream(nodes=self.node_source).filter_unprocessed(self.node1)
    assert builder.get_query_result(self.env).scalar_one_or_none() == self.dr1t1
def ensure_log(self, block: DataBlockMetadata, direction: Direction, name: str):
    """Idempotently record that `block` was processed on stream `name`.

    If a matching DataBlockLog row already exists for this function log,
    stream name, block, and direction, do nothing; otherwise stage a new
    timestamped entry on the metadata API.
    """
    # Shared identity for both the existence check and the new row.
    identity = dict(
        function_log_id=self.function_log.id,
        stream_name=name,
        data_block_id=block.id,
        direction=direction,
    )
    existing = self.metadata_api.execute(
        select(DataBlockLog).filter_by(**identity)
    ).scalar_one_or_none()
    if existing is not None:
        return
    entry = DataBlockLog(  # type: ignore
        processed_at=utcnow(),
        **identity,
    )
    self.metadata_api.add(entry)
def test_stream_unprocessed_eligible_schema(self):
    """Schema filtering: the block matches TestSchema1 but not TestSchema2."""
    source_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    emitted = DataBlockLog(
        pipe_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([source_log, emitted])
    # Matching schema: the block is returned.
    matching = StreamBuilder(
        nodes=self.node_source, schema="TestSchema1"
    ).filter_unprocessed(self.node1)
    assert matching.get_query(self.ctx, self.sess).first() == self.dr1t1
    # Non-matching schema: nothing is returned.
    non_matching = StreamBuilder(
        nodes=self.node_source, schema="TestSchema2"
    ).filter_unprocessed(self.node1)
    assert non_matching.get_query(self.ctx, self.sess).first() is None
def test_stream_unprocessed_eligible_schema(self):
    """Schema filtering: the block matches TestSchema1 but not TestSchema2."""
    source_log = SnapLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        snap_key=self.node_source.snap.key,
        runtime_url="test",
    )
    emitted = DataBlockLog(
        snap_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.env.md_api.add_all([source_log, emitted])
    # Matching schema: the block is returned.
    matching = stream(
        nodes=self.node_source, schema="TestSchema1"
    ).filter_unprocessed(self.node1)
    assert matching.get_query_result(self.env).scalar_one_or_none() == self.dr1t1
    # Non-matching schema: nothing is returned.
    non_matching = stream(
        nodes=self.node_source, schema="TestSchema2"
    ).filter_unprocessed(self.node1)
    assert non_matching.get_query_result(self.env).scalar_one_or_none() is None