  def run_pipeline(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')

    # Snapshot the pipeline in a portable proto before mutating it
    proto_pipeline, self.proto_context = pipeline.to_runner_api(
        return_context=True)

    # Performing configured PTransform overrides.
    pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

    # Add setup_options for all the BeamPlugin imports
    setup_options = pipeline._options.view_as(SetupOptions)
    plugins = BeamPlugin.get_all_plugin_paths()
    if setup_options.beam_plugins is not None:
      plugins = list(set(plugins + setup_options.beam_plugins))
    setup_options.beam_plugins = plugins

    self.job = apiclient.Job(pipeline._options, proto_pipeline)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run_pipeline(pipeline)

    test_options = pipeline._options.view_as(TestOptions)
    # If it is a dry run, return without submitting the job.
    if test_options.dry_run:
      return None

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options)

    # Create the job description and send a request to the service. The result
    # can be None if there is no need to send a request to the service (e.g.
    # template creation). If a request was sent and failed then the call will
    # raise an exception.
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result
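
The run_pipeline hook above is normally reached indirectly: Pipeline.run() resolves the runner named in the pipeline options and hands the graph to it. The following is a minimal, hypothetical sketch of that entry path; the project, region, and bucket values are placeholders rather than anything taken from the examples on this page.

# Hypothetical driver program; option values below are placeholders.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    '--runner=DataflowRunner',
    '--project=my-project',                 # placeholder project id
    '--region=us-central1',
    '--temp_location=gs://my-bucket/tmp',   # placeholder GCS path
])

with beam.Pipeline(options=options) as p:
  # Exiting the context manager calls p.run(), which delegates to
  # DataflowRunner.run_pipeline() as shown above.
  _ = (p
       | beam.Create([1, 2, 3])
       | beam.Map(lambda x: x * 2))
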
  def run(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')

    # Performing configured PTransform overrides.
    pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

    # Add setup_options for all the BeamPlugin imports
    setup_options = pipeline._options.view_as(SetupOptions)
    plugins = BeamPlugin.get_all_plugin_paths()
    if setup_options.beam_plugins is not None:
      plugins = list(set(plugins + setup_options.beam_plugins))
    setup_options.beam_plugins = plugins

    self.job = apiclient.Job(pipeline._options)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run(pipeline)

    test_options = pipeline._options.view_as(TestOptions)
    # If it is a dry run, return without submitting the job.
    if test_options.dry_run:
      return None

    standard_options = pipeline._options.view_as(StandardOptions)
    if standard_options.streaming:
      job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION
    else:
      job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options, job_version)

    # Create the job
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result
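
The option handling in these examples relies on PipelineOptions.view_as(): the same underlying options object is projected onto SetupOptions, StandardOptions, TestOptions, and so on, so a write through one view (such as beam_plugins above) is expected to be visible through every other view. A small illustrative sketch, using a made-up plugin path:

from apache_beam.options.pipeline_options import (
    PipelineOptions, SetupOptions, StandardOptions)

options = PipelineOptions(['--streaming'])
print(options.view_as(StandardOptions).streaming)  # True

setup = options.view_as(SetupOptions)
setup.beam_plugins = ['my.module.MyPlugin']        # hypothetical plugin path
# The write lands in the shared options and shows up through a fresh view.
print(options.view_as(SetupOptions).beam_plugins)
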
Example #3
    def run(self, pipeline):
        """Remotely executes entire pipeline or parts reachable from node."""
        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apache_beam.runners.dataflow.internal import apiclient
        except ImportError:
            raise ImportError('Google Cloud Dataflow runner not available, '
                              'please install apache_beam[gcp]')

        # Performing configured PTransform overrides.
        pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

        # Add setup_options for all the BeamPlugin imports
        setup_options = pipeline._options.view_as(SetupOptions)
        plugins = BeamPlugin.get_all_plugin_paths()
        if setup_options.beam_plugins is not None:
            plugins = list(set(plugins + setup_options.beam_plugins))
        setup_options.beam_plugins = plugins

        self.job = apiclient.Job(pipeline._options)

        # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
        # here.
        pipeline.visit(self.group_by_key_input_visitor())

        # Dataflow runner requires output type of the Flatten to be the same as the
        # inputs, hence we enforce that here.
        pipeline.visit(self.flatten_input_visitor())

        # The superclass's run will trigger a traversal of all reachable nodes.
        super(DataflowRunner, self).run(pipeline)

        test_options = pipeline._options.view_as(TestOptions)
        # If it is a dry run, return without submitting the job.
        if test_options.dry_run:
            return None

        # Get a Dataflow API client and set its options
        self.dataflow_client = apiclient.DataflowApplicationClient(
            pipeline._options)

        # Create the job
        result = DataflowPipelineResult(
            self.dataflow_client.create_job(self.job), self)

        self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
        result.metric_results = self._metrics
        return result
Example #4
    def run(self, pipeline):
        """Remotely executes entire pipeline or parts reachable from node."""
        # Import here to avoid adding the dependency for local running scenarios.
        try:
            # pylint: disable=wrong-import-order, wrong-import-position
            from apache_beam.runners.dataflow.internal import apiclient
        except ImportError:
            raise ImportError('Google Cloud Dataflow runner not available, '
                              'please install apache_beam[gcp]')

        # Add setup_options for all the BeamPlugin imports
        setup_options = pipeline._options.view_as(SetupOptions)
        plugins = BeamPlugin.get_all_plugin_paths()
        if setup_options.beam_plugins is not None:
            plugins = list(set(plugins +
                               setup_options.beam_plugins.split(',')))
        setup_options.beam_plugins = plugins

        self.job = apiclient.Job(pipeline._options)

        # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
        # here.
        pipeline.visit(self.group_by_key_input_visitor())

        # Dataflow runner requires output type of the Flatten to be the same as the
        # inputs, hence we enforce that here.
        pipeline.visit(self.flatten_input_visitor())

        # The superclass's run will trigger a traversal of all reachable nodes.
        super(DataflowRunner, self).run(pipeline)

        standard_options = pipeline._options.view_as(StandardOptions)
        if standard_options.streaming:
            job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION
        else:
            job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION

        # Get a Dataflow API client and set its options
        self.dataflow_client = apiclient.DataflowApplicationClient(
            pipeline._options, job_version)

        # Create the job
        result = DataflowPipelineResult(
            self.dataflow_client.create_job(self.job), self)

        self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
        result.metric_results = self._metrics
        return result
  def test_import_beam_plugins(self):
    sdk_worker_main._import_beam_plugins(BeamPlugin.get_all_plugin_paths())
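
For context on the test above: BeamPlugin.get_all_plugin_paths() returns the fully qualified names of every BeamPlugin subclass that has been imported so far, and sdk_worker_main._import_beam_plugins re-imports those names on the worker. A hedged sketch with a made-up plugin class:

from apache_beam.utils.plugin import BeamPlugin

class MyCustomPlugin(BeamPlugin):  # hypothetical plugin; subclassing is what registers it
  pass

# Expected to include a dotted path such as '__main__.MyCustomPlugin',
# alongside any BeamPlugin subclasses Beam itself has already imported.
print(BeamPlugin.get_all_plugin_paths())
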
Example #6
  def run_pipeline(self, pipeline):
    """Remotely executes entire pipeline or parts reachable from node."""
    # Import here to avoid adding the dependency for local running scenarios.
    try:
      # pylint: disable=wrong-import-order, wrong-import-position
      from apache_beam.runners.dataflow.internal import apiclient
    except ImportError:
      raise ImportError(
          'Google Cloud Dataflow runner not available, '
          'please install apache_beam[gcp]')

    # Convert all side inputs into a form acceptable to Dataflow.
    if apiclient._use_fnapi(pipeline._options):
      pipeline.visit(self.side_input_visitor())

    # Snapshot the pipeline in a portable proto before mutating it
    proto_pipeline, self.proto_context = pipeline.to_runner_api(
        return_context=True)

    # TODO(BEAM-2717): Remove once Coders are already in proto.
    for pcoll in proto_pipeline.components.pcollections.values():
      if pcoll.coder_id not in self.proto_context.coders:
        coder = coders.registry.get_coder(pickler.loads(pcoll.coder_id))
        pcoll.coder_id = self.proto_context.coders.get_id(coder)
    self.proto_context.coders.populate_map(proto_pipeline.components.coders)

    # Performing configured PTransform overrides.
    pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)

    # Add setup_options for all the BeamPlugin imports
    setup_options = pipeline._options.view_as(SetupOptions)
    plugins = BeamPlugin.get_all_plugin_paths()
    if setup_options.beam_plugins is not None:
      plugins = list(set(plugins + setup_options.beam_plugins))
    setup_options.beam_plugins = plugins

    self.job = apiclient.Job(pipeline._options, proto_pipeline)

    # Dataflow runner requires a KV type for GBK inputs, hence we enforce that
    # here.
    pipeline.visit(self.group_by_key_input_visitor())

    # Dataflow runner requires output type of the Flatten to be the same as the
    # inputs, hence we enforce that here.
    pipeline.visit(self.flatten_input_visitor())

    # The superclass's run will trigger a traversal of all reachable nodes.
    super(DataflowRunner, self).run_pipeline(pipeline)

    test_options = pipeline._options.view_as(TestOptions)
    # If it is a dry run, return without submitting the job.
    if test_options.dry_run:
      return None

    # Get a Dataflow API client and set its options
    self.dataflow_client = apiclient.DataflowApplicationClient(
        pipeline._options)

    # Create the job description and send a request to the service. The result
    # can be None if there is no need to send a request to the service (e.g.
    # template creation). If a request was sent and failed then the call will
    # raise an exception.
    result = DataflowPipelineResult(
        self.dataflow_client.create_job(self.job), self)

    self._metrics = DataflowMetrics(self.dataflow_client, result, self.job)
    result.metric_results = self._metrics
    return result
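
The run_pipeline variants on this page snapshot the graph with Pipeline.to_runner_api(return_context=True) before the PTransform overrides mutate it, so the Dataflow Job is built from the untouched portable proto. A minimal, hedged sketch of producing and inspecting such a snapshot outside the runner:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())
_ = p | beam.Create(['a', 'bb']) | beam.Map(len)

# return_context=True also yields the pipeline context that the runner uses
# above to translate between coder objects and their ids in the proto.
proto_pipeline, context = p.to_runner_api(return_context=True)

print(type(proto_pipeline).__name__)                 # the runner API Pipeline proto
print(sorted(proto_pipeline.components.transforms))  # transform ids in the snapshot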