def run_pipeline(self, pipeline, options): from apache_beam.pipeline import PipelineVisitor from apache_beam.runners.dataflow.native_io.iobase import NativeSource from apache_beam.runners.dataflow.native_io.iobase import _NativeWrite from apache_beam.testing.test_stream import TestStream class _FnApiRunnerSupportVisitor(PipelineVisitor): """Visitor determining if a Pipeline can be run on the FnApiRunner.""" def accept(self, pipeline): self.supported_by_fnapi_runner = True pipeline.visit(self) return self.supported_by_fnapi_runner def visit_transform(self, applied_ptransform): transform = applied_ptransform.transform # The FnApiRunner does not support streaming execution. if isinstance(transform, TestStream): self.supported_by_fnapi_runner = False # The FnApiRunner does not support reads from NativeSources. if (isinstance(transform, beam.io.Read) and isinstance(transform.source, NativeSource)): self.supported_by_fnapi_runner = False # The FnApiRunner does not support the use of _NativeWrites. if isinstance(transform, _NativeWrite): self.supported_by_fnapi_runner = False if isinstance(transform, beam.ParDo): dofn = transform.dofn # The FnApiRunner does not support execution of CombineFns with # deferred side inputs. if isinstance(dofn, CombineValuesDoFn): args, kwargs = transform.raw_side_inputs args_to_check = itertools.chain(args, kwargs.values()) if any( isinstance(arg, ArgumentPlaceholder) for arg in args_to_check): self.supported_by_fnapi_runner = False if userstate.is_stateful_dofn(dofn): _, timer_specs = userstate.get_dofn_specs(dofn) for timer in timer_specs: if timer.time_domain == TimeDomain.REAL_TIME: self.supported_by_fnapi_runner = False # Check whether all transforms used in the pipeline are supported by the # FnApiRunner, and the pipeline was not meant to be run as streaming. if _FnApiRunnerSupportVisitor().accept(pipeline): from apache_beam.portability.api import beam_provision_api_pb2 from apache_beam.runners.portability.fn_api_runner import fn_runner from apache_beam.runners.portability.portable_runner import JobServiceHandle all_options = options.get_all_options() encoded_options = JobServiceHandle.encode_pipeline_options( all_options) provision_info = fn_runner.ExtendedProvisionInfo( beam_provision_api_pb2.ProvisionInfo( pipeline_options=encoded_options)) runner = fn_runner.FnApiRunner(provision_info=provision_info) else: runner = BundleBasedDirectRunner() return runner.run_pipeline(pipeline, options)
def create_beam_job( self, preparation_id, # stype: str job_name, # type: str pipeline, # type: beam_runner_api_pb2.Pipeline options # type: struct_pb2.Struct ): # type: (...) -> BeamJob # TODO(angoenka): Pass an appropriate staging_session_token. The token can # be obtained in PutArtifactResponse from JobService if not self._artifact_staging_endpoint: # The front-end didn't try to stage anything, but the worker may # request what's here so we should at least store an empty manifest. self._legacy_artifact_service.CommitManifest( beam_artifact_api_pb2.CommitManifestRequest( staging_session_token=preparation_id, manifest=beam_artifact_api_pb2.Manifest())) self._artifact_service.register_job( staging_token=preparation_id, dependency_sets={ id: env.dependencies for (id, env) in pipeline.components.environments.items() }) provision_info = fn_runner.ExtendedProvisionInfo( beam_provision_api_pb2.ProvisionInfo( pipeline_options=options, retrieval_token=self._legacy_artifact_service.retrieval_token( preparation_id)), self._staging_dir, job_name=job_name) return BeamJob(preparation_id, pipeline, options, provision_info, self._artifact_staging_endpoint, self._artifact_service)
def create_beam_job( self, preparation_id, # stype: str job_name, # type: str pipeline, # type: beam_runner_api_pb2.Pipeline options # type: struct_pb2.Struct ): # type: (...) -> BeamJob self._artifact_service.register_job( staging_token=preparation_id, dependency_sets={ id: env.dependencies for (id, env) in pipeline.components.environments.items() }) provision_info = fn_runner.ExtendedProvisionInfo( beam_provision_api_pb2.ProvisionInfo(pipeline_options=options), self._staging_dir, job_name=job_name) return BeamJob(preparation_id, pipeline, options, provision_info, self._artifact_staging_endpoint, self._artifact_service)