def test_stream_unprocessed_ineligible_already_input(self):
        """A block already logged as INPUT to ``node1`` is not unprocessed for it.

        ``dr1t1`` is logged as an OUTPUT of the source node and as an INPUT of
        ``node1``; the unprocessed query for ``node1`` must then return no row.
        """
        shared_log_fields = {"graph_id": self.graph.hash, "runtime_url": "test"}

        # The source node emitted the block ...
        source_log = PipeLog(
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            **shared_log_fields,
        )
        source_output = DataBlockLog(
            pipe_log=source_log, data_block=self.dr1t1, direction=Direction.OUTPUT
        )

        # ... and node1 has consumed that same block.
        consumer_log = PipeLog(
            node_key=self.node1.key,
            pipe_key=self.node1.pipe.key,
            **shared_log_fields,
        )
        consumer_input = DataBlockLog(
            pipe_log=consumer_log, data_block=self.dr1t1, direction=Direction.INPUT
        )
        self.sess.add_all([source_log, source_output, consumer_log, consumer_input])

        unprocessed = StreamBuilder(nodes=self.node_source).filter_unprocessed(
            self.node1
        )
        first_row = unprocessed.get_query(self.ctx, self.sess).first()
        assert first_row is None
Example #2
0
def test_inputs():
    """Check that node interfaces resolve and bind (pipe / upstream API).

    Builds a source node, a single-input node and a multiple-input node, then
    verifies that each declared interface and each bound interface is not None.
    """
    run_ctx = make_test_run_context()
    env = run_ctx.env
    test_graph = graph()
    source = test_graph.create_node(pipe=pipe_t1_source)

    # Upstream wired at creation time.
    transform = test_graph.create_node(pipe=pipe_t1_to_t2, upstream={"input": source})
    assert transform.instantiate(env).get_interface() is not None

    # Upstream wired after creation.
    multi = test_graph.create_node(pipe=pipe_multiple_input)
    multi.set_upstream({"input": source})
    assert multi.instantiate(env).get_interface() is not None

    run_ctx.graph = test_graph.instantiate(env)
    with env.session_scope() as sess:
        source_manager = NodeInterfaceManager(
            ctx=run_ctx, sess=sess, node=source.instantiate(env)
        )
        assert source_manager.get_bound_interface() is not None

        multi_manager = NodeInterfaceManager(
            ctx=run_ctx, sess=sess, node=multi.instantiate(env)
        )
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        sess.add(block)
        managed_input = StreamBuilder().as_managed_stream(run_ctx, sess)
        bound = multi_manager.get_bound_interface({"input": managed_input})
        assert bound is not None
Example #3
0
def test_inputs():
    """Check that node interfaces resolve and bind (function / inputs API).

    Builds a source node, a single-input node and a multiple-input node, then
    verifies that each declared interface and each bound interface is not None.
    """
    run_ctx = make_test_run_context()
    env = run_ctx.env
    test_graph = graph()
    source = test_graph.create_node(function=function_t1_source)

    # Inputs wired at creation time.
    transform = test_graph.create_node(
        function=function_t1_to_t2, inputs={"input": source}
    )
    assert transform.instantiate(env).get_interface() is not None

    # Inputs wired after creation.
    multi = test_graph.create_node(function=function_multiple_input)
    multi.set_inputs({"input": source})
    assert multi.instantiate(env).get_interface() is not None

    # NOTE: the graph is intentionally not attached to the context here
    # (previously: ec.graph = g.instantiate(env)).
    source_node = source.instantiate(env)
    multi_node = multi.instantiate(env)
    with env.md_api.begin():
        source_exe = Executable(
            node=source_node, function=source_node.function, execution_context=run_ctx
        )
        assert NodeInterfaceManager(source_exe).get_bound_interface() is not None

        multi_exe = Executable(
            node=multi_node, function=multi_node.function, execution_context=run_ctx
        )
        multi_manager = NodeInterfaceManager(multi_exe)
        block = DataBlockMetadata(
            nominal_schema_key="_test.TestSchema1",
            realized_schema_key="_test.TestSchema1",
        )
        env.md_api.add(block)
        managed_input = StreamBuilder().as_managed_stream(run_ctx)
        bound = multi_manager.get_bound_interface({"input": managed_input})
        assert bound is not None
    def test_operators(self):
        """Exercise stream operators: a custom one, a composed one, and kwargs.

        Two blocks are logged as OUTPUT of the source node. A counting operator
        records on ``self._cnt`` how many blocks pass through it.
        """
        source_log = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        first_output = DataBlockLog(
            pipe_log=source_log, data_block=self.dr1t1, direction=Direction.OUTPUT
        )
        second_output = DataBlockLog(
            pipe_log=source_log, data_block=self.dr2t1, direction=Direction.OUTPUT
        )
        self.sess.add_all([source_log, first_output, second_output])

        self._cnt = 0

        @operator
        def count(stream: DataBlockStream) -> DataBlockStream:
            # Pass each block through unchanged, tallying on the test instance.
            for block in stream:
                self._cnt += 1
                yield block

        def drain(builder):
            # Consume the managed stream completely.
            return list(builder.as_managed_stream(self.ctx, self.sess))

        source_stream = StreamBuilder(nodes=self.node_source)
        expected_cnt = source_stream.get_query(self.ctx, self.sess).count()
        assert expected_cnt == 2
        drain(count(source_stream))
        assert self._cnt == expected_cnt

        # Test composed operators
        self._cnt = 0
        drain(count(latest(source_stream)))
        assert self._cnt == 1

        # Test kwargs
        self._cnt = 0
        reject_all = filter(source_stream, function=lambda db: False)
        drain(count(reject_all))
        assert self._cnt == 0
Example #5
0
def ensure_stream(stream_like: StreamLike) -> StreamBuilder:
    """Coerce a stream-like value into a ``StreamBuilder``.

    Accepts an existing ``StreamBuilder`` (returned as-is), a ``DeclaredNode``
    or ``Node`` (converted via ``as_stream_builder()``), or a ``str`` (used as
    the single entry passed to ``StreamBuilder().filter_inputs``).

    Raises:
        TypeError: if ``stream_like`` is none of the supported types.
    """
    # Imported here rather than at module level, as in the original.
    from snapflow.core.streams import StreamBuilder, StreamLike

    if isinstance(stream_like, StreamBuilder):
        return stream_like
    if isinstance(stream_like, (DeclaredNode, Node)):
        return stream_like.as_stream_builder()
    if isinstance(stream_like, str):
        input_names = [stream_like]
        return StreamBuilder().filter_inputs(input_names)
    raise TypeError(stream_like)
Example #6
0
 def _filter_stream(
     self,
     stream_builder: StreamBuilder,
     input: NodeInput,
     storages: List[Storage] = None,
 ) -> StreamBuilder:
     """Restrict ``stream_builder`` to blocks this node has not processed.

     If ``storages`` is truthy the builder is first narrowed with
     ``filter_storages``. The builder is then narrowed with
     ``filter_unprocessed`` for ``self.node``, allowing a cycle when the
     input's annotation is a self reference. The builder's count is logged
     at debug level after each step.
     """
     # NOTE: the f-strings below call get_count() before logger.debug runs,
     # so the counts are computed regardless of the active log level.
     logger.debug(
         f"{stream_builder.get_count(self.ctx, self.sess)} available DataBlocks"
     )

     if storages:
         stream_builder = stream_builder.filter_storages(storages)
         logger.debug(
             f"{stream_builder.get_count(self.ctx, self.sess)} available DataBlocks in storages {storages}"
         )

     logger.debug(f"Finding unprocessed input for: {stream_builder}")
     is_self_reference = input.annotation.is_self_ref
     stream_builder = stream_builder.filter_unprocessed(
         self.node,
         allow_cycle=is_self_reference,
     )
     logger.debug(
         f"{stream_builder.get_count(self.ctx, self.sess)} unprocessed DataBlocks"
     )
     return stream_builder
Example #7
0
 def _filter_stream(
     self,
     stream_builder: StreamBuilder,
     input: NodeInput,
     storages: List[Storage] = None,
 ) -> StreamBuilder:
     """Restrict ``stream_builder`` to the blocks this input should receive.

     If ``storages`` is truthy the builder is first narrowed with
     ``filter_storages``. A reference input (``input.declared_input.reference``)
     is then wrapped with ``operators.latest``; any other input is narrowed
     with ``filter_unprocessed`` for ``self.node``, allowing a cycle when the
     declared input is ``from_self``. Counts are logged at debug level.
     """
     # NOTE: the f-strings below call get_count() before logger.debug runs,
     # so the counts are computed regardless of the active log level.
     logger.debug(
         f"{stream_builder.get_count(self.env)} available DataBlocks")

     if storages:
         stream_builder = stream_builder.filter_storages(storages)
         logger.debug(
             f"{stream_builder.get_count(self.env)} available DataBlocks in storages {storages}"
         )

     declared = input.declared_input
     if declared.reference:
         logger.debug("Reference input, taking latest")
         return operators.latest(stream_builder)

     logger.debug(f"Finding unprocessed input for: {stream_builder}")
     stream_builder = stream_builder.filter_unprocessed(
         self.node,
         allow_cycle=declared.from_self,
     )
     logger.debug(
         f"{stream_builder.get_count(self.env)} unprocessed DataBlocks")
     return stream_builder
    def test_stream_unprocessed_ineligible_already_output(self):
        """
        A block that a node has already OUTPUT is, by default, not offered to
        that node as input -- _even if the block was never input to it_ --
        UNLESS the input is a self reference (`this`). This prevents infinite
        loops.
        """
        shared_log_fields = {"graph_id": self.graph.hash, "runtime_url": "test"}

        # The source node emitted the block ...
        source_log = PipeLog(
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            **shared_log_fields,
        )
        source_output = DataBlockLog(
            pipe_log=source_log, data_block=self.dr1t1, direction=Direction.OUTPUT
        )

        # ... and node1 is also logged as having output that same block.
        node1_log = PipeLog(
            node_key=self.node1.key,
            pipe_key=self.node1.pipe.key,
            **shared_log_fields,
        )
        node1_output = DataBlockLog(
            pipe_log=node1_log, data_block=self.dr1t1, direction=Direction.OUTPUT
        )
        self.sess.add_all([source_log, source_output, node1_log, node1_output])

        from_source = StreamBuilder(nodes=self.node_source)

        without_cycle = from_source.filter_unprocessed(self.node1)
        assert without_cycle.get_query(self.ctx, self.sess).first() is None

        # But ok with self reference
        with_cycle = from_source.filter_unprocessed(self.node1, allow_cycle=True)
        assert with_cycle.get_query(self.ctx, self.sess).first() == self.dr1t1
    def test_stream_unprocessed_eligible_schema(self):
        """The ``schema`` argument of ``StreamBuilder`` limits which blocks match.

        With ``dr1t1`` logged as source output, the unprocessed query returns it
        for schema "TestSchema1" and returns no row for schema "TestSchema2".
        """
        source_log = PipeLog(
            graph_id=self.graph.hash,
            node_key=self.node_source.key,
            pipe_key=self.node_source.pipe.key,
            runtime_url="test",
        )
        source_output = DataBlockLog(
            pipe_log=source_log, data_block=self.dr1t1, direction=Direction.OUTPUT
        )
        self.sess.add_all([source_log, source_output])

        matching = StreamBuilder(nodes=self.node_source, schema="TestSchema1")
        matching = matching.filter_unprocessed(self.node1)
        assert matching.get_query(self.ctx, self.sess).first() == self.dr1t1

        other_schema = StreamBuilder(nodes=self.node_source, schema="TestSchema2")
        other_schema = other_schema.filter_unprocessed(self.node1)
        assert other_schema.get_query(self.ctx, self.sess).first() is None
 def test_stream_unprocessed_pristine(self):
     """With no logs added by this test, the unprocessed query returns no row."""
     from_source = StreamBuilder(nodes=self.node_source)
     unprocessed = from_source.filter_unprocessed(self.node1)
     first_row = unprocessed.get_query(self.ctx, self.sess).first()
     assert first_row is None
Example #11
0
    def as_stream_builder(self) -> StreamBuilder:
        """Return a new ``StreamBuilder`` with ``filter_inputs(self)`` applied."""
        # Imported here rather than at module level, as in the original.
        from snapflow.core.streams import StreamBuilder

        builder = StreamBuilder()
        return builder.filter_inputs(self)
    def as_stream_builder(self) -> StreamBuilder:
        """Return a new ``StreamBuilder`` constructed with ``nodes=self``."""
        # Imported here rather than at module level, as in the original.
        from snapflow.core.streams import StreamBuilder

        builder = StreamBuilder(nodes=self)
        return builder