Esempio n. 1
0
  def create_stages(self, pipeline_proto):
    """Turn a pipeline proto into an ordered list of stages.

    Begins with one stage per leaf transform and then runs every phase listed
    below, in order, each phase receiving the stages produced by the previous
    one together with a shared transform context.

    Args:
      pipeline_proto: the pipeline proto. Its ``components`` are deep-copied
        into the transform context, and its ``root_transform_ids`` and
        ``components`` seed the initial leaf stages.

    Returns:
      A ``(components, stages, safe_coders)`` tuple: ``components`` and
      ``safe_coders`` are read from the transform context after all phases
      have run, and ``stages`` is the list produced by the last phase.
    """
    # The phases work against a private copy of the components, so the
    # caller's proto is not the object handed to them.
    context = fn_api_runner_transforms.TransformContext(
        copy.deepcopy(pipeline_proto.components),
        use_state_iterables=self._use_state_iterables)

    # Seed: every leaf transform starts out as its own stage.
    stages = list(
        fn_api_runner_transforms.leaf_transform_stages(
            pipeline_proto.root_transform_ids, pipeline_proto.components))

    # Phases are applied strictly in this order.
    phases = (
        fn_api_runner_transforms.annotate_downstream_side_inputs,
        fn_api_runner_transforms.fix_side_input_pcoll_coders,
        fn_api_runner_transforms.lift_combiners,
        fn_api_runner_transforms.expand_gbk,
        fn_api_runner_transforms.sink_flattens,
        fn_api_runner_transforms.greedily_fuse,
        fn_api_runner_transforms.read_to_impulse,
        fn_api_runner_transforms.impulse_to_input,
        fn_api_runner_transforms.inject_timer_pcollections,
        fn_api_runner_transforms.sort_stages,
        fn_api_runner_transforms.window_pcollection_coders,
    )
    banner = '=' * 20
    for run_phase in phases:
      logging.info('%s %s %s', banner, run_phase, banner)
      # Materialize the phase's output before logging or chaining it.
      stages = list(run_phase(stages, context))
      logging.debug('Stages: %s', list(map(str, stages)))

    # Hand back the context's components and safe coders with the final
    # stage ordering.
    return context.components, stages, context.safe_coders
Esempio n. 2
0
    def run_pipeline(self, pipeline, options):
        """Submit ``pipeline`` to a job service and return a result handle.

        Steps, in order:
          1. Resolve the job endpoint, starting a ``DockerizedJobServer`` when
             none is configured.
          2. For the ``LOOPBACK`` environment type, start a worker pool server
             and record a callback that stops it.
          3. Convert the pipeline to its runner API proto, strip
             sub-transforms from GroupByKey transforms and, for non-streaming
             pipelines, apply combiner lifting.
          4. Prepare the job (with retries), stage artifacts if the service
             advertises a staging endpoint, then run the job.

        Args:
          pipeline: the pipeline to run; must provide ``to_runner_api``.
          options: pipeline options; read through ``view_as`` and
            ``get_all_options``. ``sdk_location`` is rewritten from
            ``'default'`` to ``'container'`` as a side effect.

        Returns:
          A ``PipelineResult`` built from the job service stub, the job id,
          the message and state streams, and the cleanup callbacks.

        Raises:
          grpc.RpcError: if the Prepare call still fails after the allowed
            number of retries. Other errors from the job service or from
            artifact staging propagate unchanged. On any failure the
            cleanup callbacks recorded so far are invoked before the error
            propagates.
        """
        portable_options = options.view_as(PortableOptions)
        job_endpoint = portable_options.job_endpoint

        # TODO: https://issues.apache.org/jira/browse/BEAM-5525
        # portable runner specific default
        if options.view_as(SetupOptions).sdk_location == 'default':
            options.view_as(SetupOptions).sdk_location = 'container'

        if not job_endpoint:
            # TODO Provide a way to specify a container Docker URL
            # https://issues.apache.org/jira/browse/BEAM-6328
            # NOTE(review): nothing in this method stops the job server that
            # is started here -- confirm whether it needs explicit teardown.
            docker = DockerizedJobServer()
            job_endpoint = docker.start()

        # This is needed as we start a worker server if one is requested
        # but none is provided.
        if portable_options.environment_type == 'LOOPBACK':
            portable_options.environment_config, server = (
                BeamFnExternalWorkerPoolServicer.start(
                    sdk_worker_main._get_worker_count(options)))
            cleanup_callbacks = [functools.partial(server.stop, 1)]
        else:
            cleanup_callbacks = []

        # The callbacks are only handed to PipelineResult on success, so a
        # failure anywhere below would otherwise leave the worker server
        # running. A flag plus ``finally`` is used (rather than
        # ``except``/``raise``) so the original exception is what propagates.
        submitted = False
        try:
            proto_pipeline = pipeline.to_runner_api(
                default_environment=PortableRunner._create_environment(
                    portable_options))

            # Some runners won't detect the GroupByKey transform unless it has
            # no subtransforms.  Remove all sub-transforms until BEAM-4605 is
            # resolved.  Iterate over a snapshot because entries are deleted
            # from the map inside the loop.
            for _, transform_proto in list(
                    proto_pipeline.components.transforms.items()):
                if (transform_proto.spec.urn ==
                        common_urns.primitives.GROUP_BY_KEY.urn):
                    for sub_transform in transform_proto.subtransforms:
                        del proto_pipeline.components.transforms[sub_transform]
                    del transform_proto.subtransforms[:]

            # Preemptively apply combiner lifting, until all runners support
            # it.  This optimization is idempotent.
            if not options.view_as(StandardOptions).streaming:
                stages = list(
                    fn_api_runner_transforms.leaf_transform_stages(
                        proto_pipeline.root_transform_ids,
                        proto_pipeline.components))
                stages = fn_api_runner_transforms.lift_combiners(
                    stages,
                    fn_api_runner_transforms.TransformContext(
                        proto_pipeline.components))
                proto_pipeline = fn_api_runner_transforms.with_stages(
                    proto_pipeline, stages)

            # TODO: Define URNs for options.
            # convert int values:
            # https://issues.apache.org/jira/browse/BEAM-5509
            # The exact type comparison is kept: ``bool`` is a subclass of
            # ``int`` and an isinstance() check would start stringifying
            # boolean options too.
            p_options = {
                'beam:option:' + k + ':v1': (str(v) if type(v) == int else v)
                for k, v in options.get_all_options().items()
                if v is not None
            }

            channel = grpc.insecure_channel(job_endpoint)
            grpc.channel_ready_future(channel).result()
            job_service = beam_job_api_pb2_grpc.JobServiceStub(channel)

            # Sends the PrepareRequest but retries in case the channel is not
            # ready.
            def send_prepare_request(max_retries=5):
                num_retries = 0
                while True:
                    try:
                        # This reports channel is READY but connections may
                        # fail.  Seems to be only an issue on Mac with port
                        # forwardings.
                        grpc.channel_ready_future(channel).result()
                        return job_service.Prepare(
                            beam_job_api_pb2.PrepareJobRequest(
                                job_name='job',
                                pipeline=proto_pipeline,
                                pipeline_options=job_utils.dict_to_struct(
                                    p_options)))
                    except grpc.RpcError:
                        # grpc.RpcError is the public base class of RPC
                        # failures; the private grpc._channel._Rendezvous
                        # class is not part of the supported API.
                        num_retries += 1
                        if num_retries > max_retries:
                            # Bare raise keeps the original traceback.
                            raise

            prepare_response = send_prepare_request()
            if prepare_response.artifact_staging_endpoint.url:
                stager = portable_stager.PortableStager(
                    grpc.insecure_channel(
                        prepare_response.artifact_staging_endpoint.url),
                    prepare_response.staging_session_token)
                retrieval_token, _ = stager.stage_job_resources(
                    options, staging_location='')
            else:
                retrieval_token = None

            try:
                state_stream = job_service.GetStateStream(
                    beam_job_api_pb2.GetJobStateRequest(
                        job_id=prepare_response.preparation_id))
                # If there's an error, we don't always get it until we try to
                # read.  Fortunately, there's always an immediate current
                # state published.
                state_stream = itertools.chain(
                    [next(state_stream)], state_stream)
                message_stream = job_service.GetMessageStream(
                    beam_job_api_pb2.JobMessagesRequest(
                        job_id=prepare_response.preparation_id))
            except Exception:
                # Deliberate best-effort: fall back to subscribing with the
                # job id once the job has been started.
                # TODO(BEAM-6442): Unify preparation_id and job_id for all
                # runners.
                state_stream = message_stream = None

            # Run the job and wait for a result.
            run_response = job_service.Run(
                beam_job_api_pb2.RunJobRequest(
                    preparation_id=prepare_response.preparation_id,
                    retrieval_token=retrieval_token))

            if state_stream is None:
                state_stream = job_service.GetStateStream(
                    beam_job_api_pb2.GetJobStateRequest(
                        job_id=run_response.job_id))
                message_stream = job_service.GetMessageStream(
                    beam_job_api_pb2.JobMessagesRequest(
                        job_id=run_response.job_id))

            result = PipelineResult(
                job_service, run_response.job_id, message_stream,
                state_stream, cleanup_callbacks)
            submitted = True
            return result
        finally:
            if not submitted:
                # Submission failed: release whatever was started above.
                for callback in cleanup_callbacks:
                    callback()