def run_pipeline(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError( 'Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Snapshot the pipeline in a portable proto before mutating it proto_pipeline, self.proto_context = pipeline.to_runner_api( return_context=True) # Performing configured PTransform overrides. pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins)) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options, proto_pipeline) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run_pipeline(pipeline) test_options = pipeline._options.view_as(TestOptions) # If it is a dry run, return without submitting the job. if test_options.dry_run: return None # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options) # Create the job description and send a request to the service. The result # can be None if there is no need to send a request to the service (e.g. # template creation). If a request was sent and failed then the call will # raise an exception. result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError( 'Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Performing configured PTransform overrides. pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins)) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) test_options = pipeline._options.view_as(TestOptions) # If it is a dry run, return without submitting the job. if test_options.dry_run: return None standard_options = pipeline._options.view_as(StandardOptions) if standard_options.streaming: job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION else: job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options, job_version) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError('Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Performing configured PTransform overrides. pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins)) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) test_options = pipeline._options.view_as(TestOptions) # If it is a dry run, return without submitting the job. if test_options.dry_run: return None # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result
def run(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError('Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins.split(','))) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run(pipeline) standard_options = pipeline._options.view_as(StandardOptions) if standard_options.streaming: job_version = DataflowRunner.STREAMING_ENVIRONMENT_MAJOR_VERSION else: job_version = DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options, job_version) # Create the job result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result
def test_import_beam_plugins(self):
  sdk_worker_main._import_beam_plugins(BeamPlugin.get_all_plugin_paths())
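# Minimal sketch of the plugin discovery the test above exercises, assuming
# only that subclassing BeamPlugin registers the class and that
# get_all_plugin_paths() returns dotted import paths. MyPlugin and the final
# check are hypothetical illustrations, not part of the test module.
from apache_beam.utils.plugin import BeamPlugin


class MyPlugin(BeamPlugin):
  """Hypothetical plugin; registration happens simply by subclassing."""


plugin_paths = BeamPlugin.get_all_plugin_paths()
assert any(path.endswith('MyPlugin') for path in plugin_paths)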
def run_pipeline(self, pipeline): """Remotely executes entire pipeline or parts reachable from node.""" # Import here to avoid adding the dependency for local running scenarios. try: # pylint: disable=wrong-import-order, wrong-import-position from apache_beam.runners.dataflow.internal import apiclient except ImportError: raise ImportError( 'Google Cloud Dataflow runner not available, ' 'please install apache_beam[gcp]') # Convert all side inputs into a form acceptable to Dataflow. if apiclient._use_fnapi(pipeline._options): pipeline.visit(self.side_input_visitor()) # Snapshot the pipeline in a portable proto before mutating it proto_pipeline, self.proto_context = pipeline.to_runner_api( return_context=True) # TODO(BEAM-2717): Remove once Coders are already in proto. for pcoll in proto_pipeline.components.pcollections.values(): if pcoll.coder_id not in self.proto_context.coders: coder = coders.registry.get_coder(pickler.loads(pcoll.coder_id)) pcoll.coder_id = self.proto_context.coders.get_id(coder) self.proto_context.coders.populate_map(proto_pipeline.components.coders) # Performing configured PTransform overrides. pipeline.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES) # Add setup_options for all the BeamPlugin imports setup_options = pipeline._options.view_as(SetupOptions) plugins = BeamPlugin.get_all_plugin_paths() if setup_options.beam_plugins is not None: plugins = list(set(plugins + setup_options.beam_plugins)) setup_options.beam_plugins = plugins self.job = apiclient.Job(pipeline._options, proto_pipeline) # Dataflow runner requires a KV type for GBK inputs, hence we enforce that # here. pipeline.visit(self.group_by_key_input_visitor()) # Dataflow runner requires output type of the Flatten to be the same as the # inputs, hence we enforce that here. pipeline.visit(self.flatten_input_visitor()) # The superclass's run will trigger a traversal of all reachable nodes. super(DataflowRunner, self).run_pipeline(pipeline) test_options = pipeline._options.view_as(TestOptions) # If it is a dry run, return without submitting the job. if test_options.dry_run: return None # Get a Dataflow API client and set its options self.dataflow_client = apiclient.DataflowApplicationClient( pipeline._options) # Create the job description and send a request to the service. The result # can be None if there is no need to send a request to the service (e.g. # template creation). If a request was sent and failed then the call will # raise an exception. result = DataflowPipelineResult( self.dataflow_client.create_job(self.job), self) self._metrics = DataflowMetrics(self.dataflow_client, result, self.job) result.metric_results = self._metrics return result