Example 1
  def run_pipeline(self, pipeline, options):
    # type: (Pipeline, PipelineOptions) -> PipelineResult
    portable_options = options.view_as(PortableOptions)

    # TODO: https://issues.apache.org/jira/browse/BEAM-5525
    # portable runner specific default
    if options.view_as(SetupOptions).sdk_location == 'default':
      options.view_as(SetupOptions).sdk_location = 'container'

    experiments = options.view_as(DebugOptions).experiments or []

    # This is needed as we start a worker server if one is requested
    # but none is provided.
    if portable_options.environment_type == 'LOOPBACK':
      use_loopback_process_worker = options.view_as(
          DebugOptions).lookup_experiment('use_loopback_process_worker', False)
      portable_options.environment_config, server = (
          worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
              state_cache_size=
              sdk_worker_main._get_state_cache_size(experiments),
              data_buffer_time_limit_ms=
              sdk_worker_main._get_data_buffer_time_limit_ms(experiments),
              use_process=use_loopback_process_worker))
      cleanup_callbacks = [functools.partial(server.stop, 1)]
    else:
      cleanup_callbacks = []

    proto_pipeline = self.get_proto_pipeline(pipeline, options)
    job_service_handle = self.create_job_service(options)
    job_id, message_stream, state_stream = \
      job_service_handle.submit(proto_pipeline)

    result = PipelineResult(
        job_service_handle.job_service,
        job_id,
        message_stream,
        state_stream,
        cleanup_callbacks)
    if cleanup_callbacks:
      # Register an exit handler to ensure cleanup on exit.
      atexit.register(functools.partial(result._cleanup, on_exit=True))
      _LOGGER.info(
          'Environment "%s" has started a component necessary for the '
          'execution. Be sure to run the pipeline using\n'
          '  with Pipeline() as p:\n'
          '    p.apply(..)\n'
          'This ensures that the pipeline finishes before this program exits.',
          portable_options.environment_type)
    return result
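
The warning logged at the end of Example 1 points to the intended usage pattern: run the pipeline inside a with-block so that wait_until_finish() is called (and the registered cleanup callbacks fire) before the program exits. Below is a minimal sketch of that pattern, assuming apache_beam is installed and a portable job server is reachable; the runner, endpoint and environment values are placeholders, not taken from the example.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical options: the endpoint and environment values are placeholders.
options = PipelineOptions([
    '--runner=PortableRunner',
    '--job_endpoint=localhost:8099',
    '--environment_type=LOOPBACK',
])

# Leaving the with-block calls wait_until_finish() on the result, so the
# loopback worker server started by run_pipeline is only torn down after the
# job has finished.
with beam.Pipeline(options=options) as p:
    (p
     | beam.Create([1, 2, 3])
     | beam.Map(lambda x: x * x)
     | beam.Map(print))
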
Example 2
  def run_pipeline(self, pipeline, options):
    # type: (Pipeline, PipelineOptions) -> PipelineResult
    portable_options = options.view_as(PortableOptions)

    # TODO: https://issues.apache.org/jira/browse/BEAM-5525
    # portable runner specific default
    if options.view_as(SetupOptions).sdk_location == 'default':
      options.view_as(SetupOptions).sdk_location = 'container'

    # This is needed as we start a worker server if one is requested
    # but none is provided.
    if portable_options.environment_type == 'LOOPBACK':
      use_loopback_process_worker = options.view_as(
          DebugOptions).lookup_experiment('use_loopback_process_worker', False)
      portable_options.environment_config, server = (
          worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
              state_cache_size=sdk_worker_main._get_state_cache_size(options),
              data_buffer_time_limit_ms=
              sdk_worker_main._get_data_buffer_time_limit_ms(options),
              use_process=use_loopback_process_worker))
      cleanup_callbacks = [functools.partial(server.stop, 1)]
    else:
      cleanup_callbacks = []

    proto_pipeline = self.get_proto_pipeline(pipeline, options)
    job_service_handle = self.create_job_service(options)
    job_id, message_stream, state_stream = \
      job_service_handle.submit(proto_pipeline)

    result = PipelineResult(
        job_service_handle.job_service,
        job_id,
        message_stream,
        state_stream,
        cleanup_callbacks)
    if cleanup_callbacks:
      # We wait here to ensure that we run the cleanup callbacks.
      logging.info(
          'Waiting until the pipeline has finished because the '
          'environment "%s" has started a component necessary for the '
          'execution.',
          portable_options.environment_type)
      result.wait_until_finish()
    return result
Example 3
    def run_pipeline(self, pipeline, options):
        portable_options = options.view_as(PortableOptions)

        # TODO: https://issues.apache.org/jira/browse/BEAM-5525
        # portable runner specific default
        if options.view_as(SetupOptions).sdk_location == 'default':
            options.view_as(SetupOptions).sdk_location = 'container'

        # This is needed as we start a worker server if one is requested
        # but none is provided.
        if portable_options.environment_type == 'LOOPBACK':
            use_loopback_process_worker = options.view_as(
                DebugOptions).lookup_experiment('use_loopback_process_worker',
                                                False)
            portable_options.environment_config, server = (
                worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
                    state_cache_size=sdk_worker_main._get_state_cache_size(
                        options),
                    use_process=use_loopback_process_worker))
            cleanup_callbacks = [functools.partial(server.stop, 1)]
        else:
            cleanup_callbacks = []

        proto_pipeline = pipeline.to_runner_api(
            default_environment=PortableRunner._create_environment(
                portable_options))

        # Some runners won't detect the GroupByKey transform unless it has no
        # subtransforms.  Remove all sub-transforms until BEAM-4605 is resolved.
        for _, transform_proto in list(
                proto_pipeline.components.transforms.items()):
            if transform_proto.spec.urn == common_urns.primitives.GROUP_BY_KEY.urn:
                for sub_transform in transform_proto.subtransforms:
                    del proto_pipeline.components.transforms[sub_transform]
                del transform_proto.subtransforms[:]

        # Preemptively apply combiner lifting, until all runners support it.
        # These optimizations commute and are idempotent.
        pre_optimize = options.view_as(DebugOptions).lookup_experiment(
            'pre_optimize', 'lift_combiners').lower()
        if not options.view_as(StandardOptions).streaming:
            flink_known_urns = frozenset([
                common_urns.composites.RESHUFFLE.urn,
                common_urns.primitives.IMPULSE.urn,
                common_urns.primitives.FLATTEN.urn,
                common_urns.primitives.GROUP_BY_KEY.urn
            ])
            if pre_optimize == 'none':
                pass
            elif pre_optimize == 'all':
                proto_pipeline = fn_api_runner_transforms.optimize_pipeline(
                    proto_pipeline,
                    phases=[
                        fn_api_runner_transforms.
                        annotate_downstream_side_inputs,
                        fn_api_runner_transforms.
                        annotate_stateful_dofns_as_roots,
                        fn_api_runner_transforms.fix_side_input_pcoll_coders,
                        fn_api_runner_transforms.lift_combiners,
                        fn_api_runner_transforms.expand_sdf,
                        fn_api_runner_transforms.fix_flatten_coders,
                        # fn_api_runner_transforms.sink_flattens,
                        fn_api_runner_transforms.greedily_fuse,
                        fn_api_runner_transforms.read_to_impulse,
                        fn_api_runner_transforms.extract_impulse_stages,
                        fn_api_runner_transforms.remove_data_plane_ops,
                        fn_api_runner_transforms.sort_stages
                    ],
                    known_runner_urns=flink_known_urns)
            else:
                phases = []
                for phase_name in pre_optimize.split(','):
                    # For now, these are all we allow.
                    if phase_name in ('lift_combiners', ):
                        phases.append(
                            getattr(fn_api_runner_transforms, phase_name))
                    else:
                        raise ValueError(
                            'Unknown or inapplicable phase for pre_optimize: %s'
                            % phase_name)
                proto_pipeline = fn_api_runner_transforms.optimize_pipeline(
                    proto_pipeline,
                    phases=phases,
                    known_runner_urns=flink_known_urns,
                    partial=True)

        job_service = self.create_job_service(options)

        # Fetch runner options from the job service, retrying in case the
        # channel is not ready yet.
        def send_options_request(max_retries=5):
            num_retries = 0
            while True:
                try:
                    # The channel may report READY even though connections can
                    # still fail; this seems to be an issue only on Mac with
                    # port forwarding.
                    return job_service.DescribePipelineOptions(
                        beam_job_api_pb2.DescribePipelineOptionsRequest(),
                        timeout=portable_options.job_server_timeout)
                except grpc.FutureTimeoutError:
                    # no retry for timeout errors
                    raise
                except grpc._channel._Rendezvous as e:
                    num_retries += 1
                    if num_retries > max_retries:
                        raise e
                    time.sleep(1)

        options_response = send_options_request()

        def add_runner_options(parser):
            for option in options_response.options:
                try:
                    # no default values - we don't want runner options
                    # added unless they were specified by the user
                    add_arg_args = {
                        'action': 'store',
                        'help': option.description
                    }
                    if option.type == beam_job_api_pb2.PipelineOptionType.BOOLEAN:
                        add_arg_args['action'] = 'store_true'\
                          if option.default_value != 'true' else 'store_false'
                    elif option.type == beam_job_api_pb2.PipelineOptionType.INTEGER:
                        add_arg_args['type'] = int
                    elif option.type == beam_job_api_pb2.PipelineOptionType.ARRAY:
                        add_arg_args['action'] = 'append'
                    parser.add_argument("--%s" % option.name, **add_arg_args)
                except Exception as e:
                    # Ignore runner options that are already present; this is
                    # the only case where a duplicate is not treated as an
                    # error.
                    if 'conflicting option string' not in str(e):
                        raise
                    _LOGGER.debug("Runner option '%s' was already added" %
                                  option.name)

        all_options = options.get_all_options(
            add_extra_args_fn=add_runner_options)
        # TODO: Define URNs for options.
        # convert int values: https://issues.apache.org/jira/browse/BEAM-5509
        p_options = {
            'beam:option:' + k + ':v1': (str(v) if type(v) == int else v)
            for k, v in all_options.items() if v is not None
        }

        prepare_request = beam_job_api_pb2.PrepareJobRequest(
            job_name='job',
            pipeline=proto_pipeline,
            pipeline_options=job_utils.dict_to_struct(p_options))
        _LOGGER.debug('PrepareJobRequest: %s', prepare_request)
        prepare_response = job_service.Prepare(
            prepare_request, timeout=portable_options.job_server_timeout)
        artifact_endpoint = (portable_options.artifact_endpoint
                             if portable_options.artifact_endpoint else
                             prepare_response.artifact_staging_endpoint.url)
        if artifact_endpoint:
            stager = portable_stager.PortableStager(
                grpc.insecure_channel(artifact_endpoint),
                prepare_response.staging_session_token)
            retrieval_token, _ = stager.stage_job_resources(
                options, staging_location='')
        else:
            retrieval_token = None

        try:
            state_stream = job_service.GetStateStream(
                beam_job_api_pb2.GetJobStateRequest(
                    job_id=prepare_response.preparation_id),
                timeout=portable_options.job_server_timeout)
            # If there's an error, we don't always get it until we try to read.
            # Fortunately, there's always an immediate current state published.
            state_stream = itertools.chain([next(state_stream)], state_stream)
            message_stream = job_service.GetMessageStream(
                beam_job_api_pb2.JobMessagesRequest(
                    job_id=prepare_response.preparation_id),
                timeout=portable_options.job_server_timeout)
        except Exception:
            # TODO(BEAM-6442): Unify preparation_id and job_id for all runners.
            state_stream = message_stream = None

        # Run the job and wait for a result. We don't set a timeout here because
        # a job may take a long time to complete, and streaming jobs currently
        # never return a response.
        run_response = job_service.Run(
            beam_job_api_pb2.RunJobRequest(
                preparation_id=prepare_response.preparation_id,
                retrieval_token=retrieval_token))

        if state_stream is None:
            state_stream = job_service.GetStateStream(
                beam_job_api_pb2.GetJobStateRequest(
                    job_id=run_response.job_id))
            message_stream = job_service.GetMessageStream(
                beam_job_api_pb2.JobMessagesRequest(
                    job_id=run_response.job_id))

        return PipelineResult(job_service, run_response.job_id, message_stream,
                              state_stream, cleanup_callbacks)
Example 4
    def _start_sdk_worker_main(
            self, start_worker_request: beam_fn_api_pb2.StartWorkerRequest):
        params = start_worker_request.params
        self._parse_param_lock.acquire()
        # The first thread to start is responsible for preparing the execution
        # environment.
        if not self._ref_cnt:
            if 'PYTHONPATH' in params:
                self._old_python_path = sys.path[:]
                python_path_list = params['PYTHONPATH'].split(':')
                python_path_list.reverse()
                for path in python_path_list:
                    sys.path.insert(0, path)
            if '_PYTHON_WORKING_DIR' in params:
                self._old_working_dir = os.getcwd()
                os.chdir(params['_PYTHON_WORKING_DIR'])
            os.environ.update(params)
        self._ref_cnt += 1
        self._parse_param_lock.release()

        # read job information from provision stub
        metadata = [("worker_id", start_worker_request.worker_id)]
        provision_endpoint = start_worker_request.provision_endpoint.url
        with grpc.insecure_channel(provision_endpoint) as channel:
            client = ProvisionServiceStub(channel=channel)
            info = client.GetProvisionInfo(GetProvisionInfoRequest(),
                                           metadata=metadata).info
            options = json_format.MessageToJson(info.pipeline_options)
            logging_endpoint = info.logging_endpoint.url
            control_endpoint = info.control_endpoint.url

        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=logging_endpoint)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            logging.getLogger().setLevel(logging.INFO)
            # Remove all the built-in log handlers.
            logging.getLogger().handlers = []
            logging.getLogger().addHandler(fn_log_handler)
            logging.info("Starting up Python worker in loopback mode.")
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None

        sdk_pipeline_options = sdk_worker_main._parse_pipeline_options(options)

        _worker_id = start_worker_request.worker_id

        try:
            control_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=control_endpoint)
            status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()

            experiments = sdk_pipeline_options.view_as(
                DebugOptions).experiments or []
            enable_heap_dump = 'enable_heap_dump' in experiments
            SdkHarness(control_address=control_service_descriptor.url,
                       status_address=status_service_descriptor.url,
                       worker_id=_worker_id,
                       state_cache_size=sdk_worker_main._get_state_cache_size(
                           experiments),
                       data_buffer_time_limit_ms=sdk_worker_main.
                       _get_data_buffer_time_limit_ms(experiments),
                       profiler_factory=profiler.Profile.factory_from_options(
                           sdk_pipeline_options.view_as(ProfilingOptions)),
                       enable_heap_dump=enable_heap_dump).run()
        except:  # pylint: disable=broad-except
            _LOGGER.exception('Python sdk harness failed: ')
            raise
        finally:
            self._parse_param_lock.acquire()
            self._ref_cnt -= 1
            # The last thread to exit is responsible for restoring the working
            # directory and sys.path.
            if self._ref_cnt == 0:
                if self._old_python_path is not None:
                    sys.path.clear()
                    for item in self._old_python_path:
                        sys.path.append(item)
                    self._old_python_path = None
                if self._old_working_dir is not None:
                    os.chdir(self._old_working_dir)
                    self._old_working_dir = None
            self._parse_param_lock.release()
            if fn_log_handler:
                fn_log_handler.close()
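
The locking and reference counting in the example above implement a first-in, last-out pattern for process-wide state: the first worker thread to start saves and adjusts sys.path and the working directory, and the last one to finish restores them. The following is a small, generic sketch of that pattern; the SharedEnv class and its method names are illustrative and not part of the Beam or Flink code.

import os
import sys
import threading

class SharedEnv:
    """Ref-counted setup/teardown of process-wide state (sketch only)."""

    def __init__(self):
        self._lock = threading.Lock()
        self._ref_cnt = 0
        self._old_sys_path = None
        self._old_cwd = None

    def enter(self, extra_paths, working_dir=None):
        with self._lock:
            if self._ref_cnt == 0:
                # First caller: remember the old state, then modify it.
                self._old_sys_path = sys.path[:]
                sys.path[:0] = extra_paths
                if working_dir is not None:
                    self._old_cwd = os.getcwd()
                    os.chdir(working_dir)
            self._ref_cnt += 1

    def exit(self):
        with self._lock:
            self._ref_cnt -= 1
            if self._ref_cnt == 0:
                # Last caller: restore what the first caller saved.
                sys.path[:] = self._old_sys_path
                self._old_sys_path = None
                if self._old_cwd is not None:
                    os.chdir(self._old_cwd)
                    self._old_cwd = None
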
Example 5
    def _start_sdk_worker_main(
            self, start_worker_request: beam_fn_api_pb2.StartWorkerRequest):
        params = start_worker_request.params
        self._parse_param_lock.acquire()
        if 'PYTHONPATH' in params:
            python_path_list = params['PYTHONPATH'].split(':')
            python_path_list.reverse()
            for path in python_path_list:
                sys.path.insert(0, path)
        if '_PYTHON_WORKING_DIR' in params:
            os.chdir(params['_PYTHON_WORKING_DIR'])
        os.environ.update(params)
        self._parse_param_lock.release()

        # read job information from provision stub
        metadata = [("worker_id", start_worker_request.worker_id)]
        provision_endpoint = start_worker_request.provision_endpoint.url
        with grpc.insecure_channel(provision_endpoint) as channel:
            client = ProvisionServiceStub(channel=channel)
            info = client.GetProvisionInfo(GetProvisionInfoRequest(),
                                           metadata=metadata).info
            options = json_format.MessageToJson(info.pipeline_options)
            logging_endpoint = info.logging_endpoint.url
            control_endpoint = info.control_endpoint.url

        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=logging_endpoint)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            logging.getLogger().setLevel(logging.ERROR)
            logging.getLogger().addHandler(fn_log_handler)
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None

        sdk_pipeline_options = sdk_worker_main._parse_pipeline_options(options)

        _worker_id = start_worker_request.worker_id

        try:
            control_service_descriptor = endpoints_pb2.ApiServiceDescriptor(
                url=control_endpoint)
            status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()

            experiments = sdk_pipeline_options.view_as(
                DebugOptions).experiments or []
            enable_heap_dump = 'enable_heap_dump' in experiments
            SdkHarness(control_address=control_service_descriptor.url,
                       status_address=status_service_descriptor.url,
                       worker_id=_worker_id,
                       state_cache_size=sdk_worker_main._get_state_cache_size(
                           experiments),
                       data_buffer_time_limit_ms=sdk_worker_main.
                       _get_data_buffer_time_limit_ms(experiments),
                       profiler_factory=profiler.Profile.factory_from_options(
                           sdk_pipeline_options.view_as(ProfilingOptions)),
                       enable_heap_dump=enable_heap_dump).run()
        except:  # pylint: disable=broad-except
            _LOGGER.exception('Python sdk harness failed: ')
            raise
        finally:
            if fn_log_handler:
                fn_log_handler.close()