Example #1
 def Prepare(self, request, context=None):
     # For now, just use the job name as the job id.
     logging.debug('Got Prepare request.')
     preparation_id = '%s-%s' % (request.job_name, uuid.uuid4())
     provision_info = fn_api_runner.ExtendedProvisionInfo(
         beam_provision_api_pb2.ProvisionInfo(
             job_id=preparation_id,
             job_name=request.job_name,
             pipeline_options=request.pipeline_options,
             retrieval_token=self._artifact_service.retrieval_token(
                 preparation_id)), self._staging_dir)
     self._jobs[preparation_id] = BeamJob(preparation_id,
                                          request.pipeline_options,
                                          request.pipeline, provision_info)
     logging.debug("Prepared job '%s' as '%s'", request.job_name,
                   preparation_id)
     # TODO(angoenka): Pass an appropriate staging_session_token. The token can
     # be obtained in PutArtifactResponse from JobService
     if not self._artifact_staging_endpoint:
         # The front-end didn't try to stage anything, but the worker may
         # request what's here so we should at least store an empty manifest.
         self._artifact_service.CommitManifest(
             beam_artifact_api_pb2.CommitManifestRequest(
                 staging_session_token=preparation_id,
                 manifest=beam_artifact_api_pb2.Manifest()))
     return beam_job_api_pb2.PrepareJobResponse(
         preparation_id=preparation_id,
         artifact_staging_endpoint=self._artifact_staging_endpoint,
         staging_session_token=preparation_id)
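For orientation, this Prepare handler is normally reached through the Job API gRPC stub rather than called directly. A minimal client-side sketch, assuming a job server listening on localhost:8099 and a pipeline already converted to proto form (pipeline_proto is a hypothetical stand-in):

    import grpc

    from apache_beam.portability.api import beam_job_api_pb2
    from apache_beam.portability.api import beam_job_api_pb2_grpc

    channel = grpc.insecure_channel('localhost:8099')
    stub = beam_job_api_pb2_grpc.JobServiceStub(channel)
    # The request carries the pipeline proto, its options, and a job name;
    # the response echoes back the preparation_id built by Prepare above.
    response = stub.Prepare(
        beam_job_api_pb2.PrepareJobRequest(
            pipeline=pipeline_proto,  # hypothetical beam_runner_api_pb2.Pipeline
            job_name='example-job'))
    print(response.preparation_id)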
Example #2
 def create_beam_job(
     self,
     preparation_id,  # type: str
     job_name,  # type: str
     pipeline,  # type: beam_runner_api_pb2.Pipeline
     options  # type: struct_pb2.Struct
 ):
     # type: (...) -> BeamJob
     # TODO(angoenka): Pass an appropriate staging_session_token. The token can
     # be obtained in PutArtifactResponse from JobService
     if not self._artifact_staging_endpoint:
         # The front-end didn't try to stage anything, but the worker may
         # request what's here so we should at least store an empty manifest.
         self._legacy_artifact_service.CommitManifest(
             beam_artifact_api_pb2.CommitManifestRequest(
                 staging_session_token=preparation_id,
                 manifest=beam_artifact_api_pb2.Manifest()))
     self._artifact_service.register_job(
         staging_token=preparation_id,
         dependency_sets={
             id: env.dependencies
             for (id, env) in pipeline.components.environments.items()
         })
     provision_info = fn_runner.ExtendedProvisionInfo(
         beam_provision_api_pb2.ProvisionInfo(
             pipeline_options=options,
             retrieval_token=self._legacy_artifact_service.retrieval_token(
                 preparation_id)),
         self._staging_dir,
         job_name=job_name)
     return BeamJob(preparation_id, pipeline, options, provision_info,
                    self._artifact_staging_endpoint, self._artifact_service)
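The dependency_sets argument above maps each environment id in the pipeline proto to that environment's artifact dependencies. A minimal sketch of the same mapping built from a bare pipeline proto (the environment id is illustrative):

    from apache_beam.portability.api import beam_runner_api_pb2

    pipeline = beam_runner_api_pb2.Pipeline()
    # Accessing a proto map entry inserts a default Environment for this sketch.
    pipeline.components.environments['env_docker'].urn = 'beam:env:docker:v1'

    dependency_sets = {
        env_id: env.dependencies
        for (env_id, env) in pipeline.components.environments.items()
    }
    print(dependency_sets)  # {'env_docker': []}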
Example #3
    def __init__(
        self,
        default_environment=None,  # type: Optional[environments.Environment]
        bundle_repeat=0,
        use_state_iterables=False,
        provision_info=None,  # type: Optional[ExtendedProvisionInfo]
        progress_request_frequency=None):
        # type: (...) -> None
        """Creates a new Fn API Runner.

    Args:
      default_environment: the default environment to use for UserFns.
      bundle_repeat: replay every bundle this many extra times, for profiling
          and debugging
      use_state_iterables: Intentionally split gbk iterables over state API
          (for testing)
      provision_info: provisioning info to make available to workers, or None
      progress_request_frequency: The frequency (in seconds) that the runner
          waits before requesting progress from the SDK.
    """
        super(FnApiRunner, self).__init__()
        self._default_environment = (default_environment or
                                     environments.EmbeddedPythonEnvironment())
        self._bundle_repeat = bundle_repeat
        self._num_workers = 1
        self._progress_frequency = progress_request_frequency
        self._profiler_factory = None  # type: Optional[Callable[..., profiler.Profile]]
        self._use_state_iterables = use_state_iterables
        self._provision_info = provision_info or ExtendedProvisionInfo(
            beam_provision_api_pb2.ProvisionInfo(
                retrieval_token='unused-retrieval-token'))
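A minimal usage sketch for this constructor: with every argument left at its default, the runner falls back to EmbeddedPythonEnvironment and the 'unused-retrieval-token' provision info above, and executes the pipeline in-process (the import path assumes a recent Beam where fn_api_runner is a package):

    import apache_beam as beam
    from apache_beam.runners.portability.fn_api_runner import FnApiRunner

    # Defaults: no bundle replay, no state-iterable splitting, embedded env.
    with beam.Pipeline(runner=FnApiRunner()) as p:
        (p
         | beam.Create([1, 2, 3])
         | beam.Map(lambda x: x * 2)
         | beam.Map(print))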
Example #4
    def run_pipeline(self, pipeline, options):

        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.dataflow.native_io.iobase import NativeSource
        from apache_beam.runners.dataflow.native_io.iobase import _NativeWrite
        from apache_beam.testing.test_stream import TestStream

        class _FnApiRunnerSupportVisitor(PipelineVisitor):
            """Visitor determining if a Pipeline can be run on the FnApiRunner."""
            def accept(self, pipeline):
                self.supported_by_fnapi_runner = True
                pipeline.visit(self)
                return self.supported_by_fnapi_runner

            def visit_transform(self, applied_ptransform):
                transform = applied_ptransform.transform
                # The FnApiRunner does not support streaming execution.
                if isinstance(transform, TestStream):
                    self.supported_by_fnapi_runner = False
                # The FnApiRunner does not support reads from NativeSources.
                if (isinstance(transform, beam.io.Read)
                        and isinstance(transform.source, NativeSource)):
                    self.supported_by_fnapi_runner = False
                # The FnApiRunner does not support the use of _NativeWrites.
                if isinstance(transform, _NativeWrite):
                    self.supported_by_fnapi_runner = False
                if isinstance(transform, beam.ParDo):
                    dofn = transform.dofn
                    # The FnApiRunner does not support execution of CombineFns with
                    # deferred side inputs.
                    if isinstance(dofn, CombineValuesDoFn):
                        args, kwargs = transform.raw_side_inputs
                        args_to_check = itertools.chain(args, kwargs.values())
                        if any(
                                isinstance(arg, ArgumentPlaceholder)
                                for arg in args_to_check):
                            self.supported_by_fnapi_runner = False
                    if userstate.is_stateful_dofn(dofn):
                        _, timer_specs = userstate.get_dofn_specs(dofn)
                        for timer in timer_specs:
                            if timer.time_domain == TimeDomain.REAL_TIME:
                                self.supported_by_fnapi_runner = False

        # Check whether all transforms used in the pipeline are supported by the
        # FnApiRunner, and the pipeline was not meant to be run as streaming.
        if _FnApiRunnerSupportVisitor().accept(pipeline):
            from apache_beam.portability.api import beam_provision_api_pb2
            from apache_beam.runners.portability.fn_api_runner import fn_runner
            from apache_beam.runners.portability.portable_runner import JobServiceHandle
            all_options = options.get_all_options()
            encoded_options = JobServiceHandle.encode_pipeline_options(
                all_options)
            provision_info = fn_runner.ExtendedProvisionInfo(
                beam_provision_api_pb2.ProvisionInfo(
                    pipeline_options=encoded_options))
            runner = fn_runner.FnApiRunner(provision_info=provision_info)
        else:
            runner = BundleBasedDirectRunner()

        return runner.run_pipeline(pipeline, options)
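The capability check above leans on Beam's PipelineVisitor protocol: pipeline.visit(visitor) walks the applied-transform tree and calls visit_transform on each leaf. A stripped-down sketch of the same traversal, here just counting ParDo transforms (the class name is illustrative):

    import apache_beam as beam
    from apache_beam.pipeline import PipelineVisitor

    class _ParDoCounter(PipelineVisitor):
        """Counts leaf ParDo transforms via the same visit hook used above."""
        def __init__(self):
            self.count = 0

        def visit_transform(self, applied_ptransform):
            if isinstance(applied_ptransform.transform, beam.ParDo):
                self.count += 1

    p = beam.Pipeline()
    _ = p | beam.Create(['a', 'b']) | beam.Map(str.upper)
    counter = _ParDoCounter()
    p.visit(counter)
    print(counter.count)  # At least 1: beam.Map expands to a ParDo.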
Example #5
 def __init__(self,
              provision_info=None,  # type: Optional[beam_provision_api_pb2.ProvisionInfo]
              artifact_staging_dir=None,
              job_name=None,  # type: Optional[str]
             ):
   self.provision_info = (
       provision_info or beam_provision_api_pb2.ProvisionInfo())
   self.artifact_staging_dir = artifact_staging_dir
   self.job_name = job_name
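Filling in this wrapper by hand looks like the following; the keyword names come straight from the signature above, the staging directory is an assumed scratch path, and the import path matches the newer layout used in Examples #2 and #7:

    from apache_beam.portability.api import beam_provision_api_pb2
    from apache_beam.runners.portability.fn_api_runner import fn_runner

    info = fn_runner.ExtendedProvisionInfo(
        beam_provision_api_pb2.ProvisionInfo(
            retrieval_token='unused-retrieval-token'),
        artifact_staging_dir='/tmp/staging',  # assumed local scratch dir
        job_name='example-job')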
Example #6
 def create_beam_job(self, preparation_id, job_name, pipeline, options):
     # TODO(angoenka): Pass an appropriate staging_session_token. The token can
     # be obtained in PutArtifactResponse from JobService
     if not self._artifact_staging_endpoint:
         # The front-end didn't try to stage anything, but the worker may
         # request what's here so we should at least store an empty manifest.
         self._artifact_service.CommitManifest(
             beam_artifact_api_pb2.CommitManifestRequest(
                 staging_session_token=preparation_id,
                 manifest=beam_artifact_api_pb2.Manifest()))
     provision_info = fn_api_runner.ExtendedProvisionInfo(
         beam_provision_api_pb2.ProvisionInfo(
             job_id=preparation_id,
             job_name=job_name,
             pipeline_options=options,
             retrieval_token=self._artifact_service.retrieval_token(
                 preparation_id)), self._staging_dir)
     return BeamJob(preparation_id, pipeline, options, provision_info,
                    self._artifact_staging_endpoint)
Example #7
 def create_beam_job(
     self,
     preparation_id,  # type: str
     job_name,  # type: str
     pipeline,  # type: beam_runner_api_pb2.Pipeline
     options  # type: struct_pb2.Struct
 ):
     # type: (...) -> BeamJob
     self._artifact_service.register_job(
         staging_token=preparation_id,
         dependency_sets={
             id: env.dependencies
             for (id, env) in pipeline.components.environments.items()
         })
     provision_info = fn_runner.ExtendedProvisionInfo(
         beam_provision_api_pb2.ProvisionInfo(pipeline_options=options),
         self._staging_dir,
         job_name=job_name)
     return BeamJob(preparation_id, pipeline, options, provision_info,
                    self._artifact_staging_endpoint, self._artifact_service)
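Note that options in these create_beam_job variants is a protobuf Struct rather than a PipelineOptions object. A sketch of producing one with the same helper that Example #4 calls (assuming a recent Beam):

    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.runners.portability.portable_runner import JobServiceHandle

    options = PipelineOptions(['--job_name', 'example-job'])
    # encode_pipeline_options returns a google.protobuf.struct_pb2.Struct,
    # ready to pass as the `options` argument of create_beam_job.
    encoded_options = JobServiceHandle.encode_pipeline_options(
        options.get_all_options())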