def execute(self, context: 'Context'): """Execute the Apache Beam Pipeline.""" ( is_dataflow, dataflow_job_name, snake_case_pipeline_options, process_line_callback, ) = self._init_pipeline_options(format_pipeline_options=True, job_name_variable_key="job_name") if not self.beam_hook: raise AirflowException("Beam hook is not defined.") with ExitStack() as exit_stack: if self.go_file.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) with tempfile.TemporaryDirectory( prefix="apache-beam-go") as tmp_dir: tmp_gcs_file = exit_stack.enter_context( gcs_hook.provide_file(object_url=self.go_file, dir=tmp_dir)) self.go_file = tmp_gcs_file.name self.should_init_go_module = True if is_dataflow and self.dataflow_hook: with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_go_pipeline( variables=snake_case_pipeline_options, go_file=self.go_file, process_line_callback=process_line_callback, should_init_module=self.should_init_go_module, ) DataflowJobLink.persist( self, context, self.dataflow_config.project_id, self.dataflow_config.location, self.dataflow_job_id, ) if dataflow_job_name and self.dataflow_config.location: self.dataflow_hook.wait_for_done( job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=False, project_id=self.dataflow_config.project_id, ) return {"dataflow_job_id": self.dataflow_job_id} else: self.beam_hook.start_go_pipeline( variables=snake_case_pipeline_options, go_file=self.go_file, process_line_callback=process_line_callback, should_init_module=self.should_init_go_module, )
def execute(self, context: 'Context'): """Execute the Apache Beam Pipeline.""" ( is_dataflow, dataflow_job_name, snake_case_pipeline_options, process_line_callback, ) = self._init_pipeline_options(format_pipeline_options=True, job_name_variable_key="job_name") if not self.beam_hook: raise AirflowException("Beam hook is not defined.") with ExitStack() as exit_stack: if self.py_file.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( gcs_hook.provide_file(object_url=self.py_file)) self.py_file = tmp_gcs_file.name if is_dataflow and self.dataflow_hook: with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_python_pipeline( variables=snake_case_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, ) DataflowJobLink.persist( self, context, self.dataflow_config.project_id, self.dataflow_config.location, self.dataflow_job_id, ) if dataflow_job_name and self.dataflow_config.location: self.dataflow_hook.wait_for_done( job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=False, project_id=self.dataflow_config.project_id, ) return {"dataflow_job_id": self.dataflow_job_id} else: self.beam_hook.start_python_pipeline( variables=snake_case_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, )
        # Nested callback inside execute(): record the running job and persist the
        # Dataflow console link as soon as the job id is known.
        def set_current_job(current_job):
            self.job = current_job
            DataflowJobLink.persist(self, context, self.project_id, self.location, self.job.get("id"))
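        # Sketch of how such a callback is typically wired up (an assumption, not from the source):
        # the operator hands it to DataflowHook.start_flex_template so the link above is persisted
        # the moment Dataflow reports the job. The on_new_job_callback parameter name is assumed
        # from recent google provider versions; verify it against the installed provider.
        job = self.hook.start_flex_template(
            body=self.body,
            location=self.location,
            project_id=self.project_id,
            on_new_job_callback=set_current_job,  # assumed keyword; invoked with the current job dict
        )
        self.job = job
        return job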
def execute(self, context: 'Context'): """Execute the Apache Beam Pipeline.""" ( is_dataflow, dataflow_job_name, pipeline_options, process_line_callback, ) = self._init_pipeline_options() if not self.beam_hook: raise AirflowException("Beam hook is not defined.") with ExitStack() as exit_stack: if self.jar.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( gcs_hook.provide_file(object_url=self.jar)) self.jar = tmp_gcs_file.name if is_dataflow and self.dataflow_hook: is_running = False if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob: is_running = ( # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. # This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) self.dataflow_hook.is_job_dataflow_running( name=self.dataflow_config.job_name, variables=pipeline_options, )) while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun: # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. # This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) is_running = self.dataflow_hook.is_job_dataflow_running( name=self.dataflow_config.job_name, variables=pipeline_options, ) if not is_running: pipeline_options["jobName"] = dataflow_job_name with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) if dataflow_job_name and self.dataflow_config.location: multiple_jobs = (self.dataflow_config.multiple_jobs if self.dataflow_config.multiple_jobs else False) DataflowJobLink.persist( self, context, self.dataflow_config.project_id, self.dataflow_config.location, self.dataflow_job_id, ) self.dataflow_hook.wait_for_done( job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=multiple_jobs, project_id=self.dataflow_config.project_id, ) return {"dataflow_job_id": self.dataflow_job_id} else: self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, )