Ejemplo n.º 1
0
    def run_pipeline(self, pipeline, options):
        """Execute the whole pipeline and return a DirectPipelineResult.

        Args:
          pipeline: The pipeline to run. The configured PTransform overrides
            are applied to it before it is traversed.
          options: Pipeline options; used to select the transform overrides,
            to read the DirectOptions bundle setting, and handed to the
            evaluation context.

        Returns:
          A DirectPipelineResult built from the started executor and its
          evaluation context.
        """

        # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
        # with resolving imports when they are at top.
        # pylint: disable=wrong-import-position
        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
            ConsumerTrackingPipelineVisitor)
        from apache_beam.runners.direct.evaluation_context import EvaluationContext
        from apache_beam.runners.direct.executor import Executor
        from apache_beam.runners.direct.transform_evaluator import (
            TransformEvaluatorRegistry)
        from apache_beam.testing.test_stream import TestStream

        # Apply the configured PTransform overrides first.
        overrides = _get_transform_overrides(options)
        pipeline.replace_all(overrides)

        # A pipeline that contains a TestStream is driven by a mock test clock.
        class _TestStreamUsageVisitor(PipelineVisitor):
            """Visitor recording whether any visited transform is a TestStream."""

            def __init__(self):
                self.uses_test_stream = False

            def visit_transform(self, applied_ptransform):
                is_test_stream = isinstance(
                    applied_ptransform.transform, TestStream)
                self.uses_test_stream = self.uses_test_stream or is_test_stream

        stream_probe = _TestStreamUsageVisitor()
        pipeline.visit(stream_probe)
        if stream_probe.uses_test_stream:
            clock = TestClock()
        else:
            clock = RealClock()

        # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring.
        from apache_beam.metrics.execution import MetricsEnvironment
        MetricsEnvironment.set_metrics_supported(True)
        logging.info('Running pipeline with DirectRunner.')
        tracker = ConsumerTrackingPipelineVisitor()
        self.consumer_tracking_visitor = tracker
        pipeline.visit(tracker)

        direct_options = options.view_as(DirectOptions)
        bundle_factory = BundleFactory(
            stacked=direct_options.direct_runner_use_stacked_bundle)
        evaluation_context = EvaluationContext(
            options,
            bundle_factory,
            tracker.root_transforms,
            tracker.value_to_consumers,
            tracker.step_names,
            tracker.views,
            clock)

        evaluator_registry = TransformEvaluatorRegistry(evaluation_context)
        executor = Executor(
            tracker.value_to_consumers, evaluator_registry, evaluation_context)

        # DirectRunner does not support injecting PipelineOptions values at
        # runtime, so the runtime options are set to an empty mapping.
        RuntimeValueProvider.set_runtime_options({})

        # Starting the executor is non-blocking: execution proceeds in
        # background threads while this method returns.
        executor.start(tracker.root_transforms)
        return DirectPipelineResult(executor, evaluation_context)
Ejemplo n.º 2
0
  def run_pipeline(self, pipeline):
    """Execute the whole pipeline and return a DirectPipelineResult.

    Args:
      pipeline: The pipeline to run. Its options select the PTransform
        overrides that are applied before the pipeline is traversed.

    Returns:
      A DirectPipelineResult built from the started executor and its
      evaluation context.
    """

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.pipeline import PipelineVisitor
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)
    from apache_beam.testing.test_stream import TestStream

    # Apply the configured PTransform overrides first.
    overrides = _get_transform_overrides(pipeline.options)
    pipeline.replace_all(overrides)

    # A pipeline that contains a TestStream is driven by a mock test clock.
    class _TestStreamUsageVisitor(PipelineVisitor):
      """Visitor recording whether any visited transform is a TestStream."""

      def __init__(self):
        self.uses_test_stream = False

      def visit_transform(self, applied_ptransform):
        is_test_stream = isinstance(applied_ptransform.transform, TestStream)
        self.uses_test_stream = self.uses_test_stream or is_test_stream

    stream_probe = _TestStreamUsageVisitor()
    pipeline.visit(stream_probe)
    if stream_probe.uses_test_stream:
      clock = TestClock()
    else:
      clock = RealClock()

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    # The evaluation context is built from the pipeline's private `_options`
    # attribute, exactly as the overrides above use the public `options`.
    run_options = pipeline._options
    direct_options = pipeline._options.view_as(DirectOptions)
    bundle_factory = BundleFactory(
        stacked=direct_options.direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        run_options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)

    evaluator_registry = TransformEvaluatorRegistry(evaluation_context)
    executor = Executor(
        tracker.value_to_consumers, evaluator_registry, evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so the runtime options are set to an empty mapping.
    RuntimeValueProvider.set_runtime_options({})

    # Starting the executor is non-blocking: execution proceeds in background
    # threads while this method returns.
    executor.start(tracker.root_transforms)
    return DirectPipelineResult(executor, evaluation_context)
Ejemplo n.º 3
0
  def run(self, pipeline):
    """Execute the whole pipeline and return a DirectPipelineResult.

    Args:
      pipeline: The pipeline to run; its `options` are passed to the
        evaluation context and used to key the runtime options.

    Returns:
      A DirectPipelineResult built from the started executor and its
      evaluation context. When `self._cache` is truthy the result has
      already been waited on before it is returned.
    """

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(group_by_key_input_visitor())
    pipeline.visit(tracker)

    direct_options = pipeline.options.view_as(DirectOptions)
    bundle_factory = BundleFactory(
        stacked=direct_options.direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline.options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views)

    evaluation_context.use_pvalue_cache(self._cache)

    evaluator_registry = TransformEvaluatorRegistry(evaluation_context)
    executor = Executor(
        tracker.value_to_consumers, evaluator_registry, evaluation_context)

    # Register an empty set of runtime options under this pipeline's options
    # id before execution begins.
    if pipeline.options:
      RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})

    # Starting the executor is non-blocking: execution proceeds in background
    # threads while this method continues.
    executor.start(tracker.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if not self._cache:
      return result

    # Eager mode: block until the pipeline finishes so that the cache holds
    # the full results.
    result.wait_until_finish()
    self._cache.finalize()

    # Unset runtime options after the pipeline finishes.
    # TODO: Move this to a post finish hook and clean for all cases.
    if pipeline.options:
      RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

    return result
Ejemplo n.º 4
0
  def run_pipeline(self, pipeline):
    """Execute the whole pipeline and return a DirectPipelineResult.

    Args:
      pipeline: The pipeline to run. The runner's `_ptransform_overrides`
        are applied to it before it is traversed.

    Returns:
      A DirectPipelineResult built from the started executor and its
      evaluation context. When `self._cache` is truthy the result has
      already been waited on before it is returned.
    """

    # Apply the configured PTransform overrides first.
    pipeline.replace_all(self._ptransform_overrides)

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    tracker = ConsumerTrackingPipelineVisitor()
    self.consumer_tracking_visitor = tracker
    pipeline.visit(tracker)

    # The runner's `_use_test_clock` flag selects between the two clocks.
    if self._use_test_clock:
      clock = TestClock()
    else:
      clock = RealClock()

    direct_options = pipeline._options.view_as(DirectOptions)
    bundle_factory = BundleFactory(
        stacked=direct_options.direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline._options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views,
        clock)

    evaluation_context.use_pvalue_cache(self._cache)

    evaluator_registry = TransformEvaluatorRegistry(evaluation_context)
    executor = Executor(
        tracker.value_to_consumers, evaluator_registry, evaluation_context)

    # DirectRunner does not support injecting PipelineOptions values at
    # runtime, so the runtime options are set to an empty mapping.
    RuntimeValueProvider.set_runtime_options({})

    # Starting the executor is non-blocking: execution proceeds in background
    # threads while this method continues.
    executor.start(tracker.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if not self._cache:
      return result

    # Eager mode: block until the pipeline finishes so that the cache holds
    # the full results.
    result.wait_until_finish()
    self._cache.finalize()
    return result
Ejemplo n.º 5
0
  def run(self, pipeline):
    """Execute the whole pipeline and return a DirectPipelineResult.

    Args:
      pipeline: The pipeline to run; its `options` are passed to the
        evaluation context and used to key the runtime options.

    Returns:
      A DirectPipelineResult built from the started executor and its
      evaluation context. When `self._cache` is truthy the result has
      already been waited on before it is returned.
    """

    # TODO: Move imports to top. Pipeline <-> Runner dependency cause problems
    # with resolving imports when they are at top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import (
        ConsumerTrackingPipelineVisitor)
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import (
        TransformEvaluatorRegistry)

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    tracker = ConsumerTrackingPipelineVisitor()
    self.visitor = tracker
    pipeline.visit(tracker)

    direct_options = pipeline.options.view_as(DirectOptions)
    bundle_factory = BundleFactory(
        stacked=direct_options.direct_runner_use_stacked_bundle)
    evaluation_context = EvaluationContext(
        pipeline.options,
        bundle_factory,
        tracker.root_transforms,
        tracker.value_to_consumers,
        tracker.step_names,
        tracker.views)

    evaluation_context.use_pvalue_cache(self._cache)

    evaluator_registry = TransformEvaluatorRegistry(evaluation_context)
    executor = Executor(
        tracker.value_to_consumers, evaluator_registry, evaluation_context)

    # Register an empty set of runtime options under this pipeline's options
    # id before execution begins.
    if pipeline.options:
      RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})

    # Starting the executor is non-blocking: execution proceeds in background
    # threads while this method continues.
    executor.start(tracker.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if not self._cache:
      return result

    # Eager mode: block until the pipeline finishes so that the cache holds
    # the full results.
    result.wait_until_finish()
    self._cache.finalize()

    # Unset runtime options after the pipeline finishes.
    # TODO: Move this to a post finish hook and clean for all cases.
    if pipeline.options:
      RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

    return result
Ejemplo n.º 6
0
    def run(self, pipeline):
        """Build map tasks from the pipeline and execute them by depth.

        The per-run bookkeeping attributes are reset, the parent runner's
        `run` is invoked to populate them, and the resulting map tasks are
        then executed ordered by their dependency depth.

        Args:
          pipeline: The pipeline to run; its options are also consulted for
            the ProfilingOptions view.

        Returns:
          A WorkerRunnerResult constructed with PipelineState.UNKNOWN.
        """
        MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
        # List of map tasks  Each map task is a list of
        # (stage_name, operation_specs.WorkerOperation) instructions.
        self.map_tasks = []

        # Map of pvalues to
        # (map_task_index, producer_operation_index, producer_output_index)
        self.outputs = {}

        # Unique mappings of PCollections to strings.
        self.side_input_labels = collections.defaultdict(
            lambda: str(len(self.side_input_labels)))

        # Mapping of map task indices to all map tasks that must precede them.
        self.dependencies = collections.defaultdict(set)

        # Visit the graph, building up the map_tasks and their metadata.
        super(MapTaskExecutorRunner, self).run(pipeline)

        # Now run the tasks in topological order.
        def compute_depth_map(deps):
            """Return {task: depth} for every key present in `deps`."""
            memoized = {}

            def compute_depth(x):
                # Depth is one more than the deepest prerequisite; a task
                # without prerequisites gets depth 0.
                if x not in memoized:
                    memoized[x] = 1 + max([-1] +
                                          [compute_depth(y) for y in deps[x]])
                return memoized[x]

            # Iterate over a snapshot of the keys: `deps` is a defaultdict,
            # so `deps[y]` in compute_depth inserts an entry for any
            # prerequisite that has no recorded dependencies of its own.
            # Iterating the live key view would then raise
            # "dictionary changed size during iteration" on Python 3.
            return {x: compute_depth(x) for x in list(deps)}

        map_task_depths = compute_depth_map(self.dependencies)
        # Tasks that never appeared as a key in the dependency map sort
        # first, with depth -1.
        ordered_map_tasks = sorted(
            (map_task_depths.get(ix, -1), map_task)
            for ix, map_task in enumerate(self.map_tasks))

        profile_options = pipeline.options.view_as(
            pipeline_options.ProfilingOptions)
        if profile_options.profile_cpu:
            # Run the tasks inside the profiler context when CPU profiling
            # was requested.
            with profiler.Profile(
                    profile_id='worker-runner',
                    profile_location=profile_options.profile_location,
                    log_results=True,
                    file_copy_fn=_dependency_file_copy):
                self.execute_map_tasks(ordered_map_tasks)
        else:
            self.execute_map_tasks(ordered_map_tasks)

        return WorkerRunnerResult(PipelineState.UNKNOWN)
Ejemplo n.º 7
0
 def run_pipeline(self, pipeline, options):
   """Run the pipeline through `run_via_runner_api`.

   Metrics support is switched off and the runtime options are set to an
   empty mapping before the pipeline is converted with `to_runner_api`.

   Args:
     pipeline: The pipeline to run.
     options: Pipeline options; supply the DirectOptions bundle-repeat
       value (used only when `self._bundle_repeat` is falsy) and the
       ProfilingOptions used to build the profiler factory.

   Returns:
     Whatever `self.run_via_runner_api` returns.
   """
   MetricsEnvironment.set_metrics_supported(False)
   RuntimeValueProvider.set_runtime_options({})
   # This is sometimes needed if type checking is disabled
   # to enforce that the inputs (and outputs) of GroupByKey operations
   # are known to be KVs.
   from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
   gbk_input_visitor = DataflowRunner.group_by_key_input_visitor()
   pipeline.visit(gbk_input_visitor)

   # Keep an already-set bundle repeat; otherwise fall back to the option.
   self._bundle_repeat = (
       self._bundle_repeat
       or options.view_as(
           pipeline_options.DirectOptions).direct_runner_bundle_repeat)

   profiling_options = options.view_as(pipeline_options.ProfilingOptions)
   self._profiler_factory = profiler.Profile.factory_from_options(
       profiling_options)

   runner_api_pipeline = pipeline.to_runner_api(
       default_environment=self._default_environment)
   return self.run_via_runner_api(runner_api_pipeline)
Ejemplo n.º 8
0
 def run_pipeline(self, pipeline, options):
   """Run the pipeline through `run_via_runner_api`.

   Metrics support is switched off and the runtime options are set to an
   empty mapping before the pipeline is converted with `to_runner_api`.

   Args:
     pipeline: The pipeline to run.
     options: Pipeline options; supply the DirectOptions bundle-repeat
       value (used only when `self._bundle_repeat` is falsy) and the
       ProfilingOptions used to build the profiler factory.

   Returns:
     Whatever `self.run_via_runner_api` returns.
   """
   MetricsEnvironment.set_metrics_supported(False)
   RuntimeValueProvider.set_runtime_options({})
   # This is sometimes needed if type checking is disabled
   # to enforce that the inputs (and outputs) of GroupByKey operations
   # are known to be KVs.
   from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
   gbk_input_visitor = DataflowRunner.group_by_key_input_visitor()
   pipeline.visit(gbk_input_visitor)

   # Keep an already-set bundle repeat; otherwise fall back to the option.
   self._bundle_repeat = (
       self._bundle_repeat
       or options.view_as(
           pipeline_options.DirectOptions).direct_runner_bundle_repeat)

   profiling_options = options.view_as(pipeline_options.ProfilingOptions)
   self._profiler_factory = profiler.Profile.factory_from_options(
       profiling_options)

   runner_api_pipeline = pipeline.to_runner_api(
       default_environment=self._default_environment)
   return self.run_via_runner_api(runner_api_pipeline)
  def run_pipeline(self, pipeline):
    """Build map tasks from the pipeline and execute them by depth.

    The per-run bookkeeping attributes are reset, the parent runner's
    `run_pipeline` is invoked to populate them, and the resulting map tasks
    are then executed ordered by their dependency depth.

    Args:
      pipeline: The pipeline to run; its options are also consulted for the
        ProfilingOptions view.

    Returns:
      A WorkerRunnerResult constructed with PipelineState.UNKNOWN.
    """
    MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
    # List of map tasks  Each map task is a list of
    # (stage_name, operation_specs.WorkerOperation) instructions.
    self.map_tasks = []

    # Map of pvalues to
    # (map_task_index, producer_operation_index, producer_output_index)
    self.outputs = {}

    # Unique mappings of PCollections to strings.
    self.side_input_labels = collections.defaultdict(
        lambda: str(len(self.side_input_labels)))

    # Mapping of map task indices to all map tasks that must precede them.
    self.dependencies = collections.defaultdict(set)

    # Visit the graph, building up the map_tasks and their metadata.
    super(MapTaskExecutorRunner, self).run_pipeline(pipeline)

    # Now run the tasks in topological order.
    def compute_depth_map(deps):
      """Return {task: depth} for every key present in `deps`."""
      memoized = {}

      def compute_depth(x):
        # Depth is one more than the deepest prerequisite; a task without
        # prerequisites gets depth 0.
        if x not in memoized:
          memoized[x] = 1 + max([-1] + [compute_depth(y) for y in deps[x]])
        return memoized[x]

      # Iterate over a snapshot of the keys: `deps` is a defaultdict, so
      # `deps[y]` in compute_depth inserts an entry for any prerequisite that
      # has no recorded dependencies of its own. Iterating the live key view
      # would then raise "dictionary changed size during iteration" on
      # Python 3.
      return {x: compute_depth(x) for x in list(deps)}

    map_task_depths = compute_depth_map(self.dependencies)
    # Tasks that never appeared as a key in the dependency map sort first,
    # with depth -1.
    ordered_map_tasks = sorted((map_task_depths.get(ix, -1), map_task)
                               for ix, map_task in enumerate(self.map_tasks))

    profile_options = pipeline.options.view_as(
        pipeline_options.ProfilingOptions)
    if profile_options.profile_cpu:
      # Run the tasks inside the profiler context when CPU profiling was
      # requested.
      with profiler.Profile(
          profile_id='worker-runner',
          profile_location=profile_options.profile_location,
          log_results=True, file_copy_fn=_dependency_file_copy):
        self.execute_map_tasks(ordered_map_tasks)
    else:
      self.execute_map_tasks(ordered_map_tasks)

    return WorkerRunnerResult(PipelineState.UNKNOWN)
Ejemplo n.º 10
0
 def run_pipeline(self, pipeline):
     """Disable metrics support, then run the pipeline via the runner API.

     Args:
       pipeline: The pipeline to run; it is converted with `to_runner_api`.

     Returns:
       Whatever `self.run_via_runner_api` returns.
     """
     MetricsEnvironment.set_metrics_supported(False)
     runner_api_pipeline = pipeline.to_runner_api()
     return self.run_via_runner_api(runner_api_pipeline)
Ejemplo n.º 11
0
 def run(self, pipeline):
   """Run the pipeline via the runner API when it is compatible.

   Metrics support is set from `self.has_metrics_support()`. If the
   pipeline does not verify as runner-API compatible, the parent class's
   `run` is used instead.

   Args:
     pipeline: The pipeline to run.

   Returns:
     The result of `run_via_runner_api`, or of the parent `run`.
   """
   MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
   if not pipeline._verify_runner_api_compatible():
     # Incompatible pipeline: defer to the parent implementation.
     return super(FnApiRunner, self).run(pipeline)
   return self.run_via_runner_api(pipeline.to_runner_api())
Ejemplo n.º 12
0
 def run(self, pipeline):
   """Run the pipeline via the runner API when it is compatible.

   Metrics support is set from `self.has_metrics_support()`. If the
   pipeline does not verify as runner-API compatible, the parent class's
   `run` is used instead.

   Args:
     pipeline: The pipeline to run.

   Returns:
     The result of `run_via_runner_api`, or of the parent `run`.
   """
   MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
   if not pipeline._verify_runner_api_compatible():
     # Incompatible pipeline: defer to the parent implementation.
     return super(FnApiRunner, self).run(pipeline)
   return self.run_via_runner_api(pipeline.to_runner_api())
Ejemplo n.º 13
0
 def run_pipeline(self, pipeline):
   """Disable metrics support, then run the pipeline via the runner API.

   Args:
     pipeline: The pipeline to run; it is converted with `to_runner_api`.

   Returns:
     Whatever `self.run_via_runner_api` returns.
   """
   MetricsEnvironment.set_metrics_supported(False)
   runner_api_pipeline = pipeline.to_runner_api()
   return self.run_via_runner_api(runner_api_pipeline)