Esempio n. 1
0
    def test_source_to_cache_not_changed_for_same_source(self, cell):
        """Re-applying an equivalent source must not count as a change.

        The source signature is recorded after the first read. The test then
        applies the very same transform instance again, followed by a freshly
        constructed transform with the same subscription, and asserts after
        each step that ``bcj.is_source_to_cache_changed`` returns ``False``.
        """
        with cell:  # Cell 1
            pipeline = _build_an_empty_stream_pipeline()
            shared_read = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)

        with cell:  # Cell 2
            first_read = pipeline | 'Read' >> shared_read
            ib.watch({'read_foo_1': first_read})

        # Record the signature of the pipeline as it stands right now.
        env = ie.current_env()
        env.set_cached_source_signature(
            pipeline, bcj.extract_source_to_cache_signature(pipeline))

        with cell:  # Cell 3
            # Same transform, same instance as in Cell 2.
            second_read = pipeline | 'Read' >> shared_read
            ib.watch({'read_foo_2': second_read})

        changed_after_same_instance = bcj.is_source_to_cache_changed(pipeline)
        self.assertFalse(changed_after_same_instance)

        with cell:  # Cell 4
            # Same transform, but built as a separate instance. The signature
            # representing the urn and payload is still the same, so it is not
            # treated as a new unbounded source.
            equivalent_read = beam.io.ReadFromPubSub(
                subscription=_FOO_PUBSUB_SUB)
            third_read = pipeline | 'Read' >> equivalent_read
            ib.watch({'read_foo_3': third_read})

        changed_after_new_instance = bcj.is_source_to_cache_changed(pipeline)
        self.assertFalse(changed_after_new_instance)
    def test_background_caching_job_starts_when_none_such_job_exists(self):
        """Running the pipeline registers a background caching job.

        Builds a streaming pipeline on an ``InteractiveRunner`` that wraps a
        fake underlying runner, runs it, and then asserts that the current
        interactive environment holds a background caching job for the
        pipeline and that the cached source signature equals the one
        extracted from the pipeline.
        """

        # Create a fake PipelineResult and PipelineRunner. This is because we want
        # to test whether the BackgroundCachingJob can be started without having to
        # rely on a real pipeline run.
        class FakePipelineResult(beam.runners.runner.PipelineResult):
            # Returns immediately so nothing ever blocks on the fake result.
            def wait_until_finish(self):
                return

        class FakePipelineRunner(beam.runners.PipelineRunner):
            # Hands back a fake result constructed with the RUNNING state
            # instead of executing anything.
            def run_pipeline(self, pipeline, options):
                return FakePipelineResult(
                    beam.runners.runner.PipelineState.RUNNING)

        # Streaming pipeline whose interactive runner delegates to the fake
        # runner defined above.
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            FakePipelineRunner()),
                          options=PipelineOptions(streaming=True))

        # `elems` is only referenced through the locals() snapshot below, hence
        # the lint suppression.
        # pylint: disable=possibly-unused-variable
        elems = p | 'Read' >> beam.io.ReadFromPubSub(
            subscription=_FOO_PUBSUB_SUB)

        # Watches every local defined so far (including `p` and `elems`), so
        # the local names in this method must not be renamed casually.
        ib.watch(locals())

        # NOTE(review): helper is defined outside this view; presumably it
        # installs a test streaming cache for `p` -- confirm against its
        # definition.
        _setup_test_streaming_cache(p)
        p.run()
        # After run(), the environment must hold a background caching job
        # for `p`.
        self.assertIsNotNone(ie.current_env().get_background_caching_job(p))
        expected_cached_source_signature = bcj.extract_source_to_cache_signature(
            p)
        # This is to check whether the cached source signature is set correctly
        # when the background caching job is started.
        self.assertEqual(expected_cached_source_signature,
                         ie.current_env().get_cached_source_signature(p))
Esempio n. 3
0
    def test_source_to_cache_not_changed_when_source_is_removed(self, cell):
        """Pruning an added source restores the original signature.

        The pipeline first reads only from the foo subscription and that
        signature is captured. A bar read is then added, which must be
        reported as a change and must leave a different signature cached in
        the environment. Finally the bar read is pruned from the pipeline
        graph with a visitor; the extracted signature must equal the foo-only
        one again and no change must be reported.
        """
        with cell:  # Cell 1
            pipeline = _build_an_empty_stream_pipeline()
            foo_source = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
            bar_source = beam.io.ReadFromPubSub(subscription=_BAR_PUBSUB_SUB)

        with cell:  # Cell 2
            foo_pcoll = pipeline | 'Read' >> foo_source
            ib.watch({'read_foo': foo_pcoll})

        foo_only_signature = bcj.extract_source_to_cache_signature(pipeline)

        with cell:  # Cell 3
            bar_pcoll = pipeline | 'Read' >> bar_source
            ib.watch({'read_bar': bar_pcoll})

        # Adding the bar read must be detected as a source change.
        self.assertTrue(bcj.is_source_to_cache_changed(pipeline))
        env = ie.current_env()
        foo_and_bar_signature = env.get_cached_source_signature(pipeline)
        self.assertNotEqual(foo_only_signature, foo_and_bar_signature)

        class BarPruneVisitor(PipelineVisitor):
            """Detaches every node that applies ``bar_source``."""

            def enter_composite_transform(self, transform_node):
                # Drop child parts that apply the bar source, then treat the
                # composite itself like any other visited node.
                remaining = list(transform_node.parts)
                for child in transform_node.parts:
                    if child.transform is not bar_source:
                        continue
                    remaining.remove(child)
                transform_node.parts = tuple(remaining)
                self.visit_transform(transform_node)

            def visit_transform(self, transform_node):
                # Identity check: only the exact bar transform instance is
                # detached from its parent.
                if transform_node.transform is bar_source:
                    transform_node.parent = None

        pipeline.visit(BarPruneVisitor())

        pruned_signature = bcj.extract_source_to_cache_signature(pipeline)
        self.assertEqual(foo_only_signature, pruned_signature)
        self.assertFalse(bcj.is_source_to_cache_changed(pipeline))
 def test_background_caching_job_starts_when_none_such_job_exists(self):
     """Running the test stream pipeline registers a background caching job.

     After ``run()``, the current interactive environment must hold a
     background caching job for the pipeline, and its cached source
     signature must equal the one extracted from the pipeline.
     """
     pipeline = _build_a_test_stream_pipeline()
     pipeline.run()

     caching_job = ie.current_env().get_background_caching_job(pipeline)
     self.assertIsNotNone(caching_job)

     # The cached source signature must have been set when the background
     # caching job was started.
     expected_signature = bcj.extract_source_to_cache_signature(pipeline)
     cached_signature = ie.current_env().get_cached_source_signature(
         pipeline)
     self.assertEqual(expected_signature, cached_signature)
Esempio n. 5
0
    def test_source_to_cache_changed_when_source_is_altered(self, cell):
        """Mutating an applied source's subscription counts as a change.

        The signature is recorded while the transform reads from the foo
        subscription. The transform's ``_source`` is then replaced in place
        with one bound to the bar subscription, and
        ``bcj.is_source_to_cache_changed`` must return ``True``.
        """
        with cell:  # Cell 1
            pipeline = _build_an_empty_stream_pipeline()
            pubsub_read = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
            foo_pcoll = pipeline | 'Read' >> pubsub_read
            ib.watch({'read_foo': foo_pcoll})

        # Record the signature of the pipeline as it stands right now.
        env = ie.current_env()
        env.set_cached_source_signature(
            pipeline, bcj.extract_source_to_cache_signature(pipeline))

        with cell:  # Cell 2
            from apache_beam.io.gcp.pubsub import _PubSubSource
            # Swap the already-applied transform's source for one that reads
            # from the bar subscription.
            replacement_source = _PubSubSource(subscription=_BAR_PUBSUB_SUB)
            pubsub_read._source = replacement_source

        source_changed = bcj.is_source_to_cache_changed(pipeline)
        self.assertTrue(source_changed)
Esempio n. 6
0
    def test_source_to_cache_changed_when_new_source_is_added(self, cell):
        """Adding a read from another subscription counts as a change.

        The signature is recorded with only the foo read applied. A second
        read from the bar subscription is then added, and
        ``bcj.is_source_to_cache_changed`` must return ``True``.
        """
        with cell:  # Cell 1
            pipeline = _build_an_empty_stream_pipeline()
            foo_source = beam.io.ReadFromPubSub(subscription=_FOO_PUBSUB_SUB)
            foo_pcoll = pipeline | 'Read' >> foo_source
            ib.watch({'read_foo': foo_pcoll})

        # Record the signature of the pipeline as it stands right now.
        env = ie.current_env()
        env.set_cached_source_signature(
            pipeline, bcj.extract_source_to_cache_signature(pipeline))

        with cell:  # Cell 2
            bar_source = beam.io.ReadFromPubSub(subscription=_BAR_PUBSUB_SUB)
            bar_pcoll = pipeline | 'Read' >> bar_source
            ib.watch({'read_bar': bar_pcoll})

        source_changed = bcj.is_source_to_cache_changed(pipeline)
        self.assertTrue(source_changed)