Example #1
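This snippet is from Apache Beam's SwitchingDirectRunner in apache_beam/runners/direct/direct_runner.py. It visits every transform in the pipeline to decide whether the FnApiRunner can execute it; if so, it encodes the pipeline options into a ProvisionInfo, wraps that in an ExtendedProvisionInfo, and runs the pipeline on the FnApiRunner; otherwise it falls back to the BundleBasedDirectRunner.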
    def run_pipeline(self, pipeline, options):

        # The following imports are module-level in the original
        # apache_beam/runners/direct/direct_runner.py; they are repeated here
        # so the snippet is self-contained.
        import itertools

        import apache_beam as beam
        from apache_beam.internal.util import ArgumentPlaceholder
        from apache_beam.transforms import userstate
        from apache_beam.transforms.core import CombineValuesDoFn
        from apache_beam.transforms.timeutil import TimeDomain

        # These imports are deferred into the method body in the original
        # source.
        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.dataflow.native_io.iobase import NativeSource
        from apache_beam.runners.dataflow.native_io.iobase import _NativeWrite
        from apache_beam.testing.test_stream import TestStream

        class _FnApiRunnerSupportVisitor(PipelineVisitor):
            """Visitor determining if a Pipeline can be run on the FnApiRunner."""
            def accept(self, pipeline):
                self.supported_by_fnapi_runner = True
                pipeline.visit(self)
                return self.supported_by_fnapi_runner

            def visit_transform(self, applied_ptransform):
                transform = applied_ptransform.transform
                # The FnApiRunner does not support streaming execution.
                if isinstance(transform, TestStream):
                    self.supported_by_fnapi_runner = False
                # The FnApiRunner does not support reads from NativeSources.
                if (isinstance(transform, beam.io.Read)
                        and isinstance(transform.source, NativeSource)):
                    self.supported_by_fnapi_runner = False
                # The FnApiRunner does not support the use of _NativeWrites.
                if isinstance(transform, _NativeWrite):
                    self.supported_by_fnapi_runner = False
                if isinstance(transform, beam.ParDo):
                    dofn = transform.dofn
                    # The FnApiRunner does not support execution of CombineFns with
                    # deferred side inputs.
                    if isinstance(dofn, CombineValuesDoFn):
                        args, kwargs = transform.raw_side_inputs
                        args_to_check = itertools.chain(args, kwargs.values())
                        if any(
                                isinstance(arg, ArgumentPlaceholder)
                                for arg in args_to_check):
                            self.supported_by_fnapi_runner = False
                    if userstate.is_stateful_dofn(dofn):
                        _, timer_specs = userstate.get_dofn_specs(dofn)
                        for timer in timer_specs:
                            if timer.time_domain == TimeDomain.REAL_TIME:
                                self.supported_by_fnapi_runner = False

        # Check whether all transforms used in the pipeline are supported by
        # the FnApiRunner and the pipeline is not meant to run in streaming
        # mode.
        if _FnApiRunnerSupportVisitor().accept(pipeline):
            from apache_beam.portability.api import beam_provision_api_pb2
            from apache_beam.runners.portability.fn_api_runner import fn_runner
            from apache_beam.runners.portability.portable_runner import JobServiceHandle
            all_options = options.get_all_options()
            encoded_options = JobServiceHandle.encode_pipeline_options(
                all_options)
            provision_info = fn_runner.ExtendedProvisionInfo(
                beam_provision_api_pb2.ProvisionInfo(
                    pipeline_options=encoded_options))
            runner = fn_runner.FnApiRunner(provision_info=provision_info)
        else:
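            # BundleBasedDirectRunner is defined alongside this method in
            # apache_beam/runners/direct/direct_runner.py.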
            runner = BundleBasedDirectRunner()

        return runner.run_pipeline(pipeline, options)
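Note the design: unsupported features (streaming TestStreams, NativeSource reads, _NativeWrites, CombineFns with deferred side inputs, real-time timers) are detected up front by visiting the graph rather than by catching failures mid-execution, so the fallback runner starts from a clean slate.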
Example #2
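This example appears to come from Beam's local portable job service (apache_beam/runners/portability/local_job_service.py). It registers each environment's dependencies with the artifact service, commits an empty legacy manifest when the front-end staged nothing, and threads the staging directory and job name through ExtendedProvisionInfo into the BeamJob.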
    def create_beam_job(
        self,
        preparation_id,  # type: str
        job_name,  # type: str
        pipeline,  # type: beam_runner_api_pb2.Pipeline
        options  # type: struct_pb2.Struct
    ):
        # type: (...) -> BeamJob
        # TODO(angoenka): Pass an appropriate staging_session_token. The token
        # can be obtained in PutArtifactResponse from JobService.
        if not self._artifact_staging_endpoint:
            # The front-end didn't try to stage anything, but the worker may
            # request what's here, so we should at least store an empty
            # manifest.
            self._legacy_artifact_service.CommitManifest(
                beam_artifact_api_pb2.CommitManifestRequest(
                    staging_session_token=preparation_id,
                    manifest=beam_artifact_api_pb2.Manifest()))
        self._artifact_service.register_job(
            staging_token=preparation_id,
            dependency_sets={
                id: env.dependencies
                for (id, env) in pipeline.components.environments.items()
            })
        provision_info = fn_runner.ExtendedProvisionInfo(
            beam_provision_api_pb2.ProvisionInfo(
                pipeline_options=options,
                retrieval_token=self._legacy_artifact_service.retrieval_token(
                    preparation_id)),
            self._staging_dir,
            job_name=job_name)
        return BeamJob(preparation_id, pipeline, options, provision_info,
                       self._artifact_staging_endpoint, self._artifact_service)
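The retrieval_token recorded in the ProvisionInfo is what workers later present to the legacy artifact service to fetch staged files, which is why an empty manifest is committed even when nothing was staged.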
Example #3
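What looks like a later revision of the same method: the legacy artifact service and its retrieval token are gone, so the ProvisionInfo carries only the encoded pipeline options, and artifact handling is reduced to registering the per-environment dependency sets.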
    def create_beam_job(
        self,
        preparation_id,  # type: str
        job_name,  # type: str
        pipeline,  # type: beam_runner_api_pb2.Pipeline
        options  # type: struct_pb2.Struct
    ):
        # type: (...) -> BeamJob
        self._artifact_service.register_job(
            staging_token=preparation_id,
            dependency_sets={
                id: env.dependencies
                for (id, env) in pipeline.components.environments.items()
            })
        provision_info = fn_runner.ExtendedProvisionInfo(
            beam_provision_api_pb2.ProvisionInfo(pipeline_options=options),
            self._staging_dir,
            job_name=job_name)
        return BeamJob(preparation_id, pipeline, options, provision_info,
                       self._artifact_staging_endpoint, self._artifact_service)
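All three examples construct ExtendedProvisionInfo deep inside runner or job-service classes. For orientation, here is a minimal self-contained sketch of the same pattern; it is not taken from the examples above, and the trivial pipeline and the 'demo-job' name are purely illustrative:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.portability.api import beam_provision_api_pb2
from apache_beam.runners.portability.fn_api_runner import fn_runner

# ProvisionInfo carries what workers receive at startup (defaults here);
# ExtendedProvisionInfo adds runner-side extras such as the job name.
provision_info = fn_runner.ExtendedProvisionInfo(
    beam_provision_api_pb2.ProvisionInfo(), job_name='demo-job')
runner = fn_runner.FnApiRunner(provision_info=provision_info)

with beam.Pipeline(runner=runner, options=PipelineOptions()) as p:
    _ = p | beam.Create([1, 2, 3]) | beam.Map(print)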