def _get_transform_overrides(pipeline_options):
  """Return the PTransformOverride objects applied before a DirectRunner run.

  Currently this only works for overrides where the input and output types do
  not change.

  For internal use only; no backwards-compatibility guarantees.

  Args:
    pipeline_options: Pipeline options. Its ``StandardOptions`` view is read
      for the ``streaming`` flag, and the object is forwarded unchanged to
      ``_get_pubsub_transform_overrides``.

  Returns:
    A list of ``PTransformOverride`` instances in the order built below.
  """
  # Importing following locally to avoid a circular dependency.
  from apache_beam.pipeline import PTransformOverride
  from apache_beam.runners.direct.helper_transforms import LiftedCombinePerKey
  from apache_beam.runners.direct.sdf_direct_runner import ProcessKeyedElementsViaKeyedWorkItemsOverride
  from apache_beam.runners.direct.sdf_direct_runner import SplittableParDoOverride

  class CombinePerKeyOverride(PTransformOverride):
    """Swaps a ``CombinePerKey`` for ``LiftedCombinePerKey`` when possible."""
    def matches(self, applied_ptransform):
      # Always answer with an explicit boolean instead of falling through to
      # an implicit ``None`` for transforms that are not CombinePerKey.
      if not isinstance(applied_ptransform.transform, CombinePerKey):
        return False
      return applied_ptransform.inputs[0].windowing.is_default()

    def get_replacement_transform_for_applied_ptransform(
        self, applied_ptransform):
      # Bound before the ``try`` so the fallback below always has a value and
      # the ``try`` only guards the call that may raise.
      transform = applied_ptransform.transform
      try:
        return LiftedCombinePerKey(
            transform.fn, transform.args, transform.kwargs)
      except NotImplementedError:
        # LiftedCombinePerKey raised NotImplementedError; fall back to the
        # unmodified transform.
        return transform

  class StreamingGroupByKeyOverride(PTransformOverride):
    """Swaps ``_GroupByKeyOnly`` for ``_StreamingGroupByKeyOnly``."""
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      return applied_ptransform.transform.__class__ == _GroupByKeyOnly

    def get_replacement_transform_for_applied_ptransform(
        self, applied_ptransform):
      # Use specialized streaming implementation.
      return _StreamingGroupByKeyOnly()

  class StreamingGroupAlsoByWindowOverride(PTransformOverride):
    """Swaps a group-also-by-window ``ParDo`` for the streaming variant."""
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      transform = applied_ptransform.transform
      return (
          isinstance(transform, ParDo) and
          isinstance(transform.dofn, _GroupAlsoByWindowDoFn) and
          transform.__class__ != _StreamingGroupAlsoByWindow)

    def get_replacement_transform_for_applied_ptransform(
        self, applied_ptransform):
      # Use specialized streaming implementation.
      return _StreamingGroupAlsoByWindow(
          applied_ptransform.transform.dofn.windowing)

  class TestStreamOverride(PTransformOverride):
    """Swaps a ``TestStream`` for ``_ExpandableTestStream``."""
    def matches(self, applied_ptransform):
      from apache_beam.testing.test_stream import TestStream
      # The inspected application is kept on the instance, as before.
      self.applied_ptransform = applied_ptransform
      return isinstance(applied_ptransform.transform, TestStream)

    def get_replacement_transform_for_applied_ptransform(
        self, applied_ptransform):
      from apache_beam.runners.direct.test_stream_impl import _ExpandableTestStream
      return _ExpandableTestStream(applied_ptransform.transform)

  class GroupByKeyPTransformOverride(PTransformOverride):
    """A ``PTransformOverride`` for ``GroupByKey``.

    This replaces the Beam implementation as a primitive.
    """
    def matches(self, applied_ptransform):
      # Imported here to avoid circular dependencies.
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.transforms.core import GroupByKey
      return isinstance(applied_ptransform.transform, GroupByKey)

    def get_replacement_transform_for_applied_ptransform(
        self, applied_ptransform):
      return _GroupByKey()

  overrides = [
      # This needs to be the first and the last override. Other overrides
      # depend on the GroupByKey implementation to be composed of
      # _GroupByKeyOnly and _GroupAlsoByWindow.
      GroupByKeyPTransformOverride(),
      SplittableParDoOverride(),
      ProcessKeyedElementsViaKeyedWorkItemsOverride(),
      CombinePerKeyOverride(),
      TestStreamOverride(),
  ]

  # Add streaming overrides, if necessary.
  if pipeline_options.view_as(StandardOptions).streaming:
    overrides.append(StreamingGroupByKeyOverride())
    overrides.append(StreamingGroupAlsoByWindowOverride())

  # Add PubSub overrides, if PubSub is available.
  try:
    from apache_beam.io.gcp import pubsub as unused_pubsub
    overrides += _get_pubsub_transform_overrides(pipeline_options)
  except ImportError:
    pass

  # This also needs to be last because other transforms apply GBKs which need
  # to be translated into a DirectRunner-compatible transform.
  overrides.append(GroupByKeyPTransformOverride())

  return overrides
def _get_transform_overrides(pipeline_options):
  """Return the PTransformOverride objects applied before a DirectRunner run.

  Currently this only works for overrides where the input and output types do
  not change.

  For internal use only; no backwards-compatibility guarantees.

  Args:
    pipeline_options: Pipeline options. Its ``StandardOptions`` view is read
      for the ``streaming`` flag, and the object is forwarded unchanged to
      ``_get_pubsub_transform_overrides``.

  Returns:
    A list of ``PTransformOverride`` instances in the order built below.
  """
  # Importing following locally to avoid a circular dependency.
  from apache_beam.pipeline import PTransformOverride
  from apache_beam.runners.direct.helper_transforms import LiftedCombinePerKey
  from apache_beam.runners.direct.sdf_direct_runner import ProcessKeyedElementsViaKeyedWorkItemsOverride
  from apache_beam.runners.direct.sdf_direct_runner import SplittableParDoOverride

  class CombinePerKeyOverride(PTransformOverride):
    """Swaps a ``CombinePerKey`` for ``LiftedCombinePerKey`` when possible."""
    def matches(self, applied_ptransform):
      # Always answer with an explicit boolean instead of falling through to
      # an implicit ``None`` for transforms that are not CombinePerKey.
      if not isinstance(applied_ptransform.transform, CombinePerKey):
        return False
      return applied_ptransform.inputs[0].windowing.is_default()

    def get_replacement_transform(self, transform):
      try:
        return LiftedCombinePerKey(
            transform.fn, transform.args, transform.kwargs)
      except NotImplementedError:
        # LiftedCombinePerKey raised NotImplementedError; fall back to the
        # unmodified transform.
        return transform

  class StreamingGroupByKeyOverride(PTransformOverride):
    """Swaps ``_GroupByKeyOnly`` for ``_StreamingGroupByKeyOnly``."""
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      return applied_ptransform.transform.__class__ == _GroupByKeyOnly

    def get_replacement_transform(self, transform):
      # Use specialized streaming implementation. The incoming transform is
      # not consulted; a fresh instance is returned.
      return _StreamingGroupByKeyOnly()

  class StreamingGroupAlsoByWindowOverride(PTransformOverride):
    """Swaps a group-also-by-window ``ParDo`` for the streaming variant."""
    def matches(self, applied_ptransform):
      # Note: we match the exact class, since we replace it with a subclass.
      transform = applied_ptransform.transform
      return (
          isinstance(transform, ParDo) and
          isinstance(transform.dofn, _GroupAlsoByWindowDoFn) and
          transform.__class__ != _StreamingGroupAlsoByWindow)

    def get_replacement_transform(self, transform):
      # Use specialized streaming implementation, carrying over the windowing
      # held by the original DoFn.
      return _StreamingGroupAlsoByWindow(transform.dofn.windowing)

  overrides = [
      SplittableParDoOverride(),
      ProcessKeyedElementsViaKeyedWorkItemsOverride(),
      CombinePerKeyOverride(),
  ]

  # Add streaming overrides, if necessary.
  if pipeline_options.view_as(StandardOptions).streaming:
    overrides.append(StreamingGroupByKeyOverride())
    overrides.append(StreamingGroupAlsoByWindowOverride())

  # Add PubSub overrides, if PubSub is available.
  try:
    from apache_beam.io.gcp import pubsub as unused_pubsub
    overrides += _get_pubsub_transform_overrides(pipeline_options)
  except ImportError:
    pass

  return overrides