def test_stream_unprocessed_ineligible_already_input(self):
    """A block already consumed as INPUT by the downstream node must not
    be returned again by filter_unprocessed."""
    source_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    source_output = DataBlockLog(
        pipe_log=source_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    downstream_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        pipe_key=self.node1.pipe.key,
        runtime_url="test",
    )
    downstream_input = DataBlockLog(
        pipe_log=downstream_log,
        data_block=self.dr1t1,
        direction=Direction.INPUT,
    )
    self.sess.add_all(
        [source_log, source_output, downstream_log, downstream_input]
    )
    # The only available block was already input by node1, so nothing remains
    stream = StreamBuilder(nodes=self.node_source).filter_unprocessed(self.node1)
    assert stream.get_query(self.ctx, self.sess).first() is None
def test_inputs():
    """Interfaces resolve for single- and multi-input pipes, and a bound
    interface can be produced when an input stream is supplied."""
    ec = make_test_run_context()
    env = ec.env
    g = graph()
    source = g.create_node(pipe=pipe_t1_source)
    downstream = g.create_node(pipe=pipe_t1_to_t2, upstream={"input": source})
    assert downstream.instantiate(env).get_interface() is not None
    multi = g.create_node(pipe=pipe_multiple_input)
    multi.set_upstream({"input": source})
    assert multi.instantiate(env).get_interface() is not None
    ec.graph = g.instantiate(env)
    with env.session_scope() as sess:
        # Source node: bindable with no explicit input streams
        manager = NodeInterfaceManager(
            ctx=ec, sess=sess, node=source.instantiate(env)
        )
        assert manager.get_bound_interface() is not None
        # Multi-input node: bind with an explicitly provided managed stream
        manager = NodeInterfaceManager(
            ctx=ec, sess=sess, node=multi.instantiate(env)
        )
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        sess.add(block)
        bound = manager.get_bound_interface(
            {"input": StreamBuilder().as_managed_stream(ec, sess)}
        )
        assert bound is not None
def test_inputs():
    """Interfaces resolve for single- and multi-input functions, and a
    bound interface can be produced when an input stream is supplied."""
    ec = make_test_run_context()
    env = ec.env
    g = graph()
    source = g.create_node(function=function_t1_source)
    downstream = g.create_node(function=function_t1_to_t2, inputs={"input": source})
    assert downstream.instantiate(env).get_interface() is not None
    multi = g.create_node(function=function_multiple_input)
    multi.set_inputs({"input": source})
    assert multi.instantiate(env).get_interface() is not None
    source_node = source.instantiate(env)
    multi_node = multi.instantiate(env)
    with env.md_api.begin():
        # Source node: bindable with no explicit input streams
        manager = NodeInterfaceManager(
            Executable(
                node=source_node,
                function=source_node.function,
                execution_context=ec,
            )
        )
        assert manager.get_bound_interface() is not None
        # Multi-input node: bind with an explicitly provided managed stream
        manager = NodeInterfaceManager(
            Executable(
                node=multi_node,
                function=multi_node.function,
                execution_context=ec,
            )
        )
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        env.md_api.add(block)
        bound = manager.get_bound_interface(
            {"input": StreamBuilder().as_managed_stream(ec)}
        )
        assert bound is not None
def test_operators(self):
    """Stream operators see the expected blocks, compose, and forward kwargs."""
    pipe_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    first_output = DataBlockLog(
        pipe_log=pipe_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    second_output = DataBlockLog(
        pipe_log=pipe_log,
        data_block=self.dr2t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([pipe_log, first_output, second_output])
    self._cnt = 0

    @operator
    def count(stream: DataBlockStream) -> DataBlockStream:
        # Tally each block as it flows through, yielding it unchanged
        for block in stream:
            self._cnt += 1
            yield block

    builder = StreamBuilder(nodes=self.node_source)
    expected_cnt = builder.get_query(self.ctx, self.sess).count()
    assert expected_cnt == 2
    list(count(builder).as_managed_stream(self.ctx, self.sess))
    assert self._cnt == expected_cnt
    # Operators compose: latest() narrows the stream to a single block
    self._cnt = 0
    composed = count(latest(builder))
    list(composed.as_managed_stream(self.ctx, self.sess))
    assert self._cnt == 1
    # Keyword args are forwarded: a filter rejecting everything yields nothing
    self._cnt = 0
    rejected = count(filter(builder, function=lambda db: False))
    list(rejected.as_managed_stream(self.ctx, self.sess))
    assert self._cnt == 0
def ensure_stream(stream_like: StreamLike) -> StreamBuilder:
    """Coerce any StreamLike value into a StreamBuilder.

    Accepts:
        - a StreamBuilder (returned unchanged)
        - a DeclaredNode or Node (converted via as_stream_builder)
        - a node key string (wrapped via filter_inputs)

    Raises:
        TypeError: if stream_like is none of the supported types.
    """
    from snapflow.core.streams import StreamBuilder, StreamLike

    if isinstance(stream_like, StreamBuilder):
        return stream_like
    # Single isinstance with a tuple replaces the two chained checks
    if isinstance(stream_like, (DeclaredNode, Node)):
        return stream_like.as_stream_builder()
    if isinstance(stream_like, str):
        return StreamBuilder().filter_inputs([stream_like])
    raise TypeError(
        f"Cannot make a stream from object of type {type(stream_like).__name__}: {stream_like!r}"
    )
def _filter_stream(
    self,
    stream_builder: StreamBuilder,
    input: NodeInput,
    storages: List[Storage] = None,
) -> StreamBuilder:
    """Narrow a stream to blocks this node can still consume.

    Optionally restricts to the given storages, then drops blocks this
    node has already processed. A self-referential input annotation
    permits cycles (re-consuming the node's own output).
    """
    logger.debug(
        f"{stream_builder.get_count(self.ctx, self.sess)} available DataBlocks"
    )
    if storages:
        # Only blocks materialized on one of the requested storages
        stream_builder = stream_builder.filter_storages(storages)
        logger.debug(
            f"{stream_builder.get_count(self.ctx, self.sess)} available DataBlocks in storages {storages}"
        )
    logger.debug(f"Finding unprocessed input for: {stream_builder}")
    stream_builder = stream_builder.filter_unprocessed(
        self.node, allow_cycle=input.annotation.is_self_ref)
    logger.debug(
        f"{stream_builder.get_count(self.ctx, self.sess)} unprocessed DataBlocks"
    )
    return stream_builder
def _filter_stream(
    self,
    stream_builder: StreamBuilder,
    input: NodeInput,
    storages: List[Storage] = None,
) -> StreamBuilder:
    """Narrow a stream to blocks this node should consume.

    Optionally restricts to the given storages. Reference inputs take
    only the latest block; regular inputs drop blocks this node has
    already processed (a from-self input permits cycles).
    """
    logger.debug(
        f"{stream_builder.get_count(self.env)} available DataBlocks")
    if storages:
        # Only blocks materialized on one of the requested storages
        stream_builder = stream_builder.filter_storages(storages)
        logger.debug(
            f"{stream_builder.get_count(self.env)} available DataBlocks in storages {storages}"
        )
    if input.declared_input.reference:
        # Reference inputs are not "consumed"; just use the newest block
        logger.debug("Reference input, taking latest")
        stream_builder = operators.latest(stream_builder)
    else:
        logger.debug(f"Finding unprocessed input for: {stream_builder}")
        stream_builder = stream_builder.filter_unprocessed(
            self.node, allow_cycle=input.declared_input.from_self)
        logger.debug(
            f"{stream_builder.get_count(self.env)} unprocessed DataBlocks")
    return stream_builder
def test_stream_unprocessed_ineligible_already_output(self):
    """A block a node has previously OUTPUT is ineligible as that node's
    input -- even if it was never input -- UNLESS the input is a self
    reference (`this`), which explicitly allows the cycle. This prevents
    infinite loops.
    """
    upstream_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    upstream_output = DataBlockLog(
        pipe_log=upstream_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    node1_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node1.key,
        pipe_key=self.node1.pipe.key,
        runtime_url="test",
    )
    node1_output = DataBlockLog(
        pipe_log=node1_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([upstream_log, upstream_output, node1_log, node1_output])
    base = StreamBuilder(nodes=self.node_source)
    # Without allow_cycle the previously-output block is filtered out
    no_cycle = base.filter_unprocessed(self.node1)
    assert no_cycle.get_query(self.ctx, self.sess).first() is None
    # A self reference (allow_cycle=True) makes it eligible again
    with_cycle = base.filter_unprocessed(self.node1, allow_cycle=True)
    assert with_cycle.get_query(self.ctx, self.sess).first() == self.dr1t1
def test_stream_unprocessed_eligible_schema(self):
    """Schema filtering: an unprocessed block is visible only through a
    stream declared with its matching schema."""
    pipe_log = PipeLog(
        graph_id=self.graph.hash,
        node_key=self.node_source.key,
        pipe_key=self.node_source.pipe.key,
        runtime_url="test",
    )
    output_log = DataBlockLog(
        pipe_log=pipe_log,
        data_block=self.dr1t1,
        direction=Direction.OUTPUT,
    )
    self.sess.add_all([pipe_log, output_log])
    # Matching schema: the block is found
    matching = StreamBuilder(
        nodes=self.node_source, schema="TestSchema1"
    ).filter_unprocessed(self.node1)
    assert matching.get_query(self.ctx, self.sess).first() == self.dr1t1
    # Non-matching schema: nothing is found
    mismatched = StreamBuilder(
        nodes=self.node_source, schema="TestSchema2"
    ).filter_unprocessed(self.node1)
    assert mismatched.get_query(self.ctx, self.sess).first() is None
def test_stream_unprocessed_pristine(self):
    """With no logs recorded at all, there is nothing unprocessed to stream."""
    stream = StreamBuilder(nodes=self.node_source).filter_unprocessed(self.node1)
    assert stream.get_query(self.ctx, self.sess).first() is None
def as_stream_builder(self) -> StreamBuilder:
    """Return a StreamBuilder scoped to this object via filter_inputs."""
    from snapflow.core.streams import StreamBuilder

    builder = StreamBuilder()
    return builder.filter_inputs(self)
def as_stream_builder(self) -> StreamBuilder:
    """Expose this node as a single-node StreamBuilder."""
    from snapflow.core.streams import StreamBuilder

    stream = StreamBuilder(nodes=self)
    return stream