def test_start_python_pipeline_with_non_empty_py_requirements_and_without_system_packages(
    self, current_py_requirements, current_py_system_site_packages, mock_runner, mock_virtualenv
):
    hook = BeamHook(runner=DEFAULT_RUNNER)
    wait_for_done = mock_runner.return_value.wait_for_done
    mock_virtualenv.return_value = '/dummy_dir/bin/python'
    process_line_callback = MagicMock()

    hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
        variables=copy.deepcopy(BEAM_VARIABLES_PY),
        py_file=PY_FILE,
        py_options=PY_OPTIONS,
        py_requirements=current_py_requirements,
        py_system_site_packages=current_py_system_site_packages,
        process_line_callback=process_line_callback,
    )

    expected_cmd = [
        '/dummy_dir/bin/python',
        '-m',
        PY_FILE,
        f'--runner={DEFAULT_RUNNER}',
        '--output=gs://test/output',
        '--labels=foo=bar',
    ]
    mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
    wait_for_done.assert_called_once_with()
    mock_virtualenv.assert_called_once_with(
        venv_directory=mock.ANY,
        python_bin="python3",
        system_site_packages=current_py_system_site_packages,
        requirements=current_py_requirements,
    )
def execute(self, context): """Execute the Apache Beam Pipeline.""" self.beam_hook = BeamHook(runner=self.runner) pipeline_options = self.default_pipeline_options.copy() process_line_callback: Optional[Callable] = None is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower() dataflow_job_name: Optional[str] = None if isinstance(self.dataflow_config, dict): self.dataflow_config = DataflowConfiguration(**self.dataflow_config) if is_dataflow: dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow( pipeline_options=pipeline_options, job_name_variable_key="job_name" ) pipeline_options.update(self.pipeline_options) # Convert argument names from lowerCamelCase to snake case. formatted_pipeline_options = { convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options } with ExitStack() as exit_stack: if self.py_file.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context(gcs_hook.provide_file(object_url=self.py_file)) self.py_file = tmp_gcs_file.name if is_dataflow: with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_python_pipeline( variables=formatted_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, ) self.dataflow_hook.wait_for_done( job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=False, ) else: self.beam_hook.start_python_pipeline( variables=formatted_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, ) return {"dataflow_job_id": self.dataflow_job_id}
def _init_pipeline_options(
    self,
    format_pipeline_options: bool = False,
    job_name_variable_key: Optional[str] = None,
) -> Tuple[bool, Optional[str], dict, Optional[Callable[[str], None]]]:
    self.beam_hook = BeamHook(runner=self.runner)
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable[[str], None]] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
    dataflow_job_name: Optional[str] = None
    if is_dataflow:
        dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
            pipeline_options=pipeline_options,
            job_name_variable_key=job_name_variable_key,
        )
        self.log.info(pipeline_options)

    pipeline_options.update(self.pipeline_options)

    if format_pipeline_options:
        snake_case_pipeline_options = {
            convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
        }
        return is_dataflow, dataflow_job_name, snake_case_pipeline_options, process_line_callback

    return is_dataflow, dataflow_job_name, pipeline_options, process_line_callback
def execute(self, context: 'Context'): """Execute the python dataflow job.""" self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner) self.dataflow_hook = DataflowHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, poll_sleep=self.poll_sleep, impersonation_chain=None, drain_pipeline=self.drain_pipeline, cancel_timeout=self.cancel_timeout, wait_until_finished=self.wait_until_finished, ) job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name) pipeline_options = self.dataflow_default_options.copy() pipeline_options["job_name"] = job_name pipeline_options["project"] = self.project_id or self.dataflow_hook.project_id pipeline_options["region"] = self.location pipeline_options.update(self.options) # Convert argument names from lowerCamelCase to snake case. camel_to_snake = lambda name: re.sub(r"[A-Z]", lambda x: "_" + x.group(0).lower(), name) formatted_pipeline_options = {camel_to_snake(key): pipeline_options[key] for key in pipeline_options} def set_current_job_id(job_id): self.job_id = job_id process_line_callback = process_line_and_extract_dataflow_job_id_callback( on_new_job_id_callback=set_current_job_id ) with ExitStack() as exit_stack: if self.py_file.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context(gcs_hook.provide_file(object_url=self.py_file)) self.py_file = tmp_gcs_file.name with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_python_pipeline( variables=formatted_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, ) self.dataflow_hook.wait_for_done( job_name=job_name, location=self.location, job_id=self.job_id, multiple_jobs=False, ) return {"job_id": self.job_id}
def test_start_python_pipeline_with_empty_py_requirements_and_without_system_packages(self, mock_runner):
    hook = BeamHook(runner=DEFAULT_RUNNER)
    wait_for_done = mock_runner.return_value.wait_for_done
    process_line_callback = MagicMock()

    with self.assertRaisesRegex(AirflowException, "Invalid method invocation."):
        hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
            variables=copy.deepcopy(BEAM_VARIABLES_PY),
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            py_requirements=[],
            process_line_callback=process_line_callback,
        )

    mock_runner.assert_not_called()
    wait_for_done.assert_not_called()
def __init__(
    self,
    gcp_conn_id: str = "google_cloud_default",
    delegate_to: Optional[str] = None,
    poll_sleep: int = 10,
    impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
    drain_pipeline: bool = False,
    cancel_timeout: Optional[int] = 5 * 60,
    wait_until_finished: Optional[bool] = None,
) -> None:
    self.poll_sleep = poll_sleep
    self.drain_pipeline = drain_pipeline
    self.cancel_timeout = cancel_timeout
    self.wait_until_finished = wait_until_finished
    self.job_id: Optional[str] = None
    self.beam_hook = BeamHook(BeamRunnerType.DataflowRunner)
    super().__init__(
        gcp_conn_id=gcp_conn_id,
        delegate_to=delegate_to,
        impersonation_chain=impersonation_chain,
    )
def test_start_java_pipeline(self, mock_runner):
    hook = BeamHook(runner=DEFAULT_RUNNER)
    wait_for_done = mock_runner.return_value.wait_for_done
    process_line_callback = MagicMock()

    hook.start_java_pipeline(  # pylint: disable=no-value-for-parameter
        jar=JAR_FILE,
        variables=copy.deepcopy(BEAM_VARIABLES_JAVA),
        process_line_callback=process_line_callback,
    )

    expected_cmd = [
        'java',
        '-jar',
        JAR_FILE,
        f'--runner={DEFAULT_RUNNER}',
        '--output=gs://test/output',
        '--labels={"foo":"bar"}',
    ]
    mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
    wait_for_done.assert_called_once_with()
def test_start_python_pipeline(self, mock_runner):
    hook = BeamHook(runner=DEFAULT_RUNNER)
    wait_for_done = mock_runner.return_value.wait_for_done
    process_line_callback = MagicMock()

    hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
        variables=copy.deepcopy(BEAM_VARIABLES_PY),
        py_file=PY_FILE,
        py_options=PY_OPTIONS,
        process_line_callback=process_line_callback,
    )

    expected_cmd = [
        "python3",
        '-m',
        PY_FILE,
        f'--runner={DEFAULT_RUNNER}',
        '--output=gs://test/output',
        '--labels=foo=bar',
    ]
    mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
    wait_for_done.assert_called_once_with()
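# Hedged sketch (not part of the original source): the same start_python_pipeline
# call as in the mocked test above, invoked directly against a local DirectRunner.
# The file path and pipeline options are placeholders.
from airflow.providers.apache.beam.hooks.beam import BeamHook

hook = BeamHook(runner="DirectRunner")
hook.start_python_pipeline(
    variables={"output": "/tmp/beam-output"},  # placeholder pipeline options
    py_file="/files/wordcount.py",  # placeholder local path
    py_options=[],
    py_interpreter="python3",
    py_requirements=None,
    py_system_site_packages=False,
    process_line_callback=None,
)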
def execute(self, context: 'Context'): """Execute the Apache Beam Pipeline.""" self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner) self.dataflow_hook = DataflowHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, poll_sleep=self.poll_sleep, cancel_timeout=self.cancel_timeout, wait_until_finished=self.wait_until_finished, ) job_name = self.dataflow_hook.build_dataflow_job_name( job_name=self.job_name) pipeline_options = copy.deepcopy(self.dataflow_default_options) pipeline_options["jobName"] = self.job_name pipeline_options[ "project"] = self.project_id or self.dataflow_hook.project_id pipeline_options["region"] = self.location pipeline_options.update(self.options) pipeline_options.setdefault("labels", {}).update({ "airflow-version": "v" + version.replace(".", "-").replace("+", "-") }) pipeline_options.update(self.options) def set_current_job_id(job_id): self.job_id = job_id process_line_callback = process_line_and_extract_dataflow_job_id_callback( on_new_job_id_callback=set_current_job_id) with ExitStack() as exit_stack: if self.jar.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( gcs_hook.provide_file(object_url=self.jar)) self.jar = tmp_gcs_file.name is_running = False if self.check_if_running != CheckJobRunning.IgnoreJob: is_running = self.dataflow_hook.is_job_dataflow_running( name=self.job_name, variables=pipeline_options, ) while is_running and self.check_if_running == CheckJobRunning.WaitForRun: is_running = self.dataflow_hook.is_job_dataflow_running( name=self.job_name, variables=pipeline_options, ) if not is_running: pipeline_options["jobName"] = job_name with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) self.dataflow_hook.wait_for_done( job_name=job_name, location=self.location, job_id=self.job_id, multiple_jobs=self.multiple_jobs, ) return {"job_id": self.job_id}
def execute(self, context: 'Context'): """Execute the Apache Beam Pipeline.""" self.beam_hook = BeamHook(runner=self.runner) pipeline_options = self.default_pipeline_options.copy() process_line_callback: Optional[Callable] = None is_dataflow = self.runner.lower( ) == BeamRunnerType.DataflowRunner.lower() dataflow_job_name: Optional[str] = None if is_dataflow: dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow( pipeline_options=pipeline_options, job_name_variable_key=None) pipeline_options.update(self.pipeline_options) with ExitStack() as exit_stack: if self.jar.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( gcs_hook.provide_file(object_url=self.jar)) self.jar = tmp_gcs_file.name if is_dataflow and self.dataflow_hook: is_running = False if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob: is_running = ( # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. # This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) self.dataflow_hook.is_job_dataflow_running( name=self.dataflow_config.job_name, variables=pipeline_options, )) while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun: # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. # This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) is_running = self.dataflow_hook.is_job_dataflow_running( name=self.dataflow_config.job_name, variables=pipeline_options, ) if not is_running: pipeline_options["jobName"] = dataflow_job_name with self.dataflow_hook.provide_authorized_gcloud(): self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) if dataflow_job_name and self.dataflow_config.location: multiple_jobs = (self.dataflow_config.multiple_jobs if self.dataflow_config.multiple_jobs else False) self.dataflow_hook.wait_for_done( job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=multiple_jobs, project_id=self.dataflow_config.project_id, ) else: self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) return {"dataflow_job_id": self.dataflow_job_id}
def execute(self, context): """Execute the Apache Beam Pipeline.""" self.beam_hook = BeamHook(runner=self.runner) pipeline_options = self.default_pipeline_options.copy() process_line_callback: Optional[Callable] = None is_dataflow = self.runner.lower( ) == BeamRunnerType.DataflowRunner.lower() if isinstance(self.dataflow_config, dict): self.dataflow_config = DataflowConfiguration( **self.dataflow_config) if is_dataflow: self.dataflow_hook = DataflowHook( gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id, delegate_to=self.dataflow_config.delegate_to or self.delegate_to, poll_sleep=self.dataflow_config.poll_sleep, impersonation_chain=self.dataflow_config.impersonation_chain, drain_pipeline=self.dataflow_config.drain_pipeline, cancel_timeout=self.dataflow_config.cancel_timeout, wait_until_finished=self.dataflow_config.wait_until_finished, ) self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id self._dataflow_job_name = DataflowHook.build_dataflow_job_name( self.dataflow_config.job_name, self.dataflow_config.append_job_name) pipeline_options["jobName"] = self.dataflow_config.job_name pipeline_options["project"] = self.dataflow_config.project_id pipeline_options["region"] = self.dataflow_config.location pipeline_options.setdefault("labels", {}).update({ "airflow-version": "v" + version.replace(".", "-").replace("+", "-") }) def set_current_dataflow_job_id(job_id): self.dataflow_job_id = job_id process_line_callback = process_line_and_extract_dataflow_job_id_callback( on_new_job_id_callback=set_current_dataflow_job_id) pipeline_options.update(self.pipeline_options) with ExitStack() as exit_stack: if self.jar.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( # pylint: disable=no-member gcs_hook.provide_file(object_url=self.jar)) self.jar = tmp_gcs_file.name if is_dataflow: is_running = False if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob: is_running = ( # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. # This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) self.dataflow_hook.is_job_dataflow_running( # pylint: disable=no-value-for-parameter name=self.dataflow_config.job_name, variables=pipeline_options, )) while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun: # The reason for disable=no-value-for-parameter is that project_id parameter is # required but here is not passed, moreover it cannot be passed here. 
# This method is wrapped by @_fallback_to_project_id_from_variables decorator which # fallback project_id value from variables and raise error if project_id is # defined both in variables and as parameter (here is already defined in variables) # pylint: disable=no-value-for-parameter is_running = self.dataflow_hook.is_job_dataflow_running( name=self.dataflow_config.job_name, variables=pipeline_options, ) if not is_running: pipeline_options["jobName"] = self._dataflow_job_name self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) self.dataflow_hook.wait_for_done( job_name=self._dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=self.dataflow_config.multiple_jobs, project_id=self.dataflow_config.project_id, ) else: self.beam_hook.start_java_pipeline( variables=pipeline_options, jar=self.jar, job_class=self.job_class, process_line_callback=process_line_callback, ) return {"dataflow_job_id": self.dataflow_job_id}
def execute(self, context): """Execute the Apache Beam Pipeline.""" self.beam_hook = BeamHook(runner=self.runner) pipeline_options = self.default_pipeline_options.copy() process_line_callback: Optional[Callable] = None is_dataflow = self.runner.lower( ) == BeamRunnerType.DataflowRunner.lower() if isinstance(self.dataflow_config, dict): self.dataflow_config = DataflowConfiguration( **self.dataflow_config) if is_dataflow: self.dataflow_hook = DataflowHook( gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id, delegate_to=self.dataflow_config.delegate_to or self.delegate_to, poll_sleep=self.dataflow_config.poll_sleep, impersonation_chain=self.dataflow_config.impersonation_chain, drain_pipeline=self.dataflow_config.drain_pipeline, cancel_timeout=self.dataflow_config.cancel_timeout, wait_until_finished=self.dataflow_config.wait_until_finished, ) self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id dataflow_job_name = DataflowHook.build_dataflow_job_name( self.dataflow_config.job_name, self.dataflow_config.append_job_name) pipeline_options["job_name"] = dataflow_job_name pipeline_options["project"] = self.dataflow_config.project_id pipeline_options["region"] = self.dataflow_config.location pipeline_options.setdefault("labels", {}).update({ "airflow-version": "v" + version.replace(".", "-").replace("+", "-") }) def set_current_dataflow_job_id(job_id): self.dataflow_job_id = job_id process_line_callback = process_line_and_extract_dataflow_job_id_callback( on_new_job_id_callback=set_current_dataflow_job_id) pipeline_options.update(self.pipeline_options) # Convert argument names from lowerCamelCase to snake case. formatted_pipeline_options = { convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options } with ExitStack() as exit_stack: if self.py_file.lower().startswith("gs://"): gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to) tmp_gcs_file = exit_stack.enter_context( # pylint: disable=no-member gcs_hook.provide_file(object_url=self.py_file)) self.py_file = tmp_gcs_file.name self.beam_hook.start_python_pipeline( variables=formatted_pipeline_options, py_file=self.py_file, py_options=self.py_options, py_interpreter=self.py_interpreter, py_requirements=self.py_requirements, py_system_site_packages=self.py_system_site_packages, process_line_callback=process_line_callback, ) if is_dataflow: self.dataflow_hook.wait_for_done( # pylint: disable=no-value-for-parameter job_name=dataflow_job_name, location=self.dataflow_config.location, job_id=self.dataflow_job_id, multiple_jobs=False, ) return {"dataflow_job_id": self.dataflow_job_id}