Example #1
    def test_start_python_pipeline_with_non_empty_py_requirements_and_without_system_packages(
        self, current_py_requirements, current_py_system_site_packages, mock_runner, mock_virtualenv
    ):
        hook = BeamHook(runner=DEFAULT_RUNNER)
        wait_for_done = mock_runner.return_value.wait_for_done
        mock_virtualenv.return_value = '/dummy_dir/bin/python'
        process_line_callback = MagicMock()

        hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
            variables=copy.deepcopy(BEAM_VARIABLES_PY),
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            py_requirements=current_py_requirements,
            py_system_site_packages=current_py_system_site_packages,
            process_line_callback=process_line_callback,
        )

        expected_cmd = [
            '/dummy_dir/bin/python',
            '-m',
            PY_FILE,
            f'--runner={DEFAULT_RUNNER}',
            '--output=gs://test/output',
            '--labels=foo=bar',
        ]
        mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
        wait_for_done.assert_called_once_with()
        mock_virtualenv.assert_called_once_with(
            venv_directory=mock.ANY,
            python_bin="python3",
            system_site_packages=current_py_system_site_packages,
            requirements=current_py_requirements,
        )
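The decorators for this test are not shown in the snippet; the signature implies a parameterized set of (py_requirements, py_system_site_packages) values plus two stacked mock.patch decorators, whose mocks are injected in bottom-up order after the parameterized values. A minimal, self-contained sketch of that ordering, using hypothetical patch targets (os.listdir, os.getcwd) in place of the real runner and virtualenv helpers:

from unittest import TestCase, mock

from parameterized import parameterized


class DecoratorOrderSketch(TestCase):
    # Hypothetical stand-ins: the ordering rule, not the patch targets, is the point here.
    @parameterized.expand([(["foo-bar"], False)])
    @mock.patch("os.getcwd")   # applied second -> injected second (the mock_virtualenv slot)
    @mock.patch("os.listdir")  # applied first  -> injected first  (the mock_runner slot)
    def test_argument_order(
        self, current_py_requirements, current_py_system_site_packages, mock_runner, mock_virtualenv
    ):
        assert current_py_requirements == ["foo-bar"]
        assert current_py_system_site_packages is False
        assert mock_runner is not mock_virtualenv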
Example #2
    def execute(self, context):
        """Execute the Apache Beam Pipeline."""
        self.beam_hook = BeamHook(runner=self.runner)
        pipeline_options = self.default_pipeline_options.copy()
        process_line_callback: Optional[Callable] = None
        is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
        dataflow_job_name: Optional[str] = None

        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if is_dataflow:
            dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
                pipeline_options=pipeline_options, job_name_variable_key="job_name"
            )

        pipeline_options.update(self.pipeline_options)

        # Convert argument names from lowerCamelCase to snake case.
        formatted_pipeline_options = {
            convert_camel_to_snake(key): pipeline_options[key] for key in pipeline_options
        }

        with ExitStack() as exit_stack:
            if self.py_file.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(gcs_hook.provide_file(object_url=self.py_file))
                self.py_file = tmp_gcs_file.name

            if is_dataflow:
                with self.dataflow_hook.provide_authorized_gcloud():
                    self.beam_hook.start_python_pipeline(
                        variables=formatted_pipeline_options,
                        py_file=self.py_file,
                        py_options=self.py_options,
                        py_interpreter=self.py_interpreter,
                        py_requirements=self.py_requirements,
                        py_system_site_packages=self.py_system_site_packages,
                        process_line_callback=process_line_callback,
                    )

                self.dataflow_hook.wait_for_done(
                    job_name=dataflow_job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=False,
                )

            else:
                self.beam_hook.start_python_pipeline(
                    variables=formatted_pipeline_options,
                    py_file=self.py_file,
                    py_options=self.py_options,
                    py_interpreter=self.py_interpreter,
                    py_requirements=self.py_requirements,
                    py_system_site_packages=self.py_system_site_packages,
                    process_line_callback=process_line_callback,
                )

        return {"dataflow_job_id": self.dataflow_job_id}
Example #3
    def _init_pipeline_options(
        self,
        format_pipeline_options: bool = False,
        job_name_variable_key: Optional[str] = None,
    ) -> Tuple[bool, Optional[str], dict, Optional[Callable[[str], None]]]:
        self.beam_hook = BeamHook(runner=self.runner)
        pipeline_options = self.default_pipeline_options.copy()
        process_line_callback: Optional[Callable[[str], None]] = None
        is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
        dataflow_job_name: Optional[str] = None
        if is_dataflow:
            dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
                pipeline_options=pipeline_options,
                job_name_variable_key=job_name_variable_key,
            )
            self.log.info(pipeline_options)

        pipeline_options.update(self.pipeline_options)

        if format_pipeline_options:
            snake_case_pipeline_options = {
                convert_camel_to_snake(key): pipeline_options[key]
                for key in pipeline_options
            }
            return is_dataflow, dataflow_job_name, snake_case_pipeline_options, process_line_callback

        return is_dataflow, dataflow_job_name, pipeline_options, process_line_callback
Example #4
    def execute(self, context: 'Context'):
        """Execute the python dataflow job."""
        self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
        self.dataflow_hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            poll_sleep=self.poll_sleep,
            impersonation_chain=None,
            drain_pipeline=self.drain_pipeline,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )

        job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)
        pipeline_options = self.dataflow_default_options.copy()
        pipeline_options["job_name"] = job_name
        pipeline_options["project"] = self.project_id or self.dataflow_hook.project_id
        pipeline_options["region"] = self.location
        pipeline_options.update(self.options)

        # Convert argument names from lowerCamelCase to snake case.
        camel_to_snake = lambda name: re.sub(r"[A-Z]", lambda x: "_" + x.group(0).lower(), name)
        formatted_pipeline_options = {camel_to_snake(key): pipeline_options[key] for key in pipeline_options}

        def set_current_job_id(job_id):
            self.job_id = job_id

        process_line_callback = process_line_and_extract_dataflow_job_id_callback(
            on_new_job_id_callback=set_current_job_id
        )

        with ExitStack() as exit_stack:
            if self.py_file.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(gcs_hook.provide_file(object_url=self.py_file))
                self.py_file = tmp_gcs_file.name

            with self.dataflow_hook.provide_authorized_gcloud():
                self.beam_hook.start_python_pipeline(
                    variables=formatted_pipeline_options,
                    py_file=self.py_file,
                    py_options=self.py_options,
                    py_interpreter=self.py_interpreter,
                    py_requirements=self.py_requirements,
                    py_system_site_packages=self.py_system_site_packages,
                    process_line_callback=process_line_callback,
                )

            self.dataflow_hook.wait_for_done(
                job_name=job_name,
                location=self.location,
                job_id=self.job_id,
                multiple_jobs=False,
            )

        return {"job_id": self.job_id}
Example #5
    def test_start_python_pipeline_with_empty_py_requirements_and_without_system_packages(self, mock_runner):
        hook = BeamHook(runner=DEFAULT_RUNNER)
        wait_for_done = mock_runner.return_value.wait_for_done
        process_line_callback = MagicMock()

        with self.assertRaisesRegex(AirflowException, "Invalid method invocation."):
            hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
                variables=copy.deepcopy(BEAM_VARIABLES_PY),
                py_file=PY_FILE,
                py_options=PY_OPTIONS,
                py_requirements=[],
                process_line_callback=process_line_callback,
            )

        mock_runner.assert_not_called()
        wait_for_done.assert_not_called()
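The exception is expected because an empty py_requirements list combined with py_system_site_packages=False (the default) leaves nothing from which to build a usable virtualenv. A rough sketch of that guard as a hypothetical helper; only the "Invalid method invocation." prefix of the message is confirmed by the test:

from airflow.exceptions import AirflowException


def _validate_py_requirements(py_requirements, py_system_site_packages):
    # Hypothetical helper mirroring the hook's check: an explicitly empty requirements
    # list with system site-packages disabled cannot produce a usable virtual environment.
    if py_requirements is not None and not py_requirements and not py_system_site_packages:
        raise AirflowException(
            "Invalid method invocation. Empty py_requirements with "
            "py_system_site_packages=False cannot create a valid virtual environment."
        )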
Example #6
 def __init__(
     self,
     gcp_conn_id: str = "google_cloud_default",
     delegate_to: Optional[str] = None,
     poll_sleep: int = 10,
     impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
     drain_pipeline: bool = False,
     cancel_timeout: Optional[int] = 5 * 60,
     wait_until_finished: Optional[bool] = None,
 ) -> None:
     self.poll_sleep = poll_sleep
     self.drain_pipeline = drain_pipeline
     self.cancel_timeout = cancel_timeout
     self.wait_until_finished = wait_until_finished
     self.job_id: Optional[str] = None
     self.beam_hook = BeamHook(BeamRunnerType.DataflowRunner)
     super().__init__(
         gcp_conn_id=gcp_conn_id,
         delegate_to=delegate_to,
         impersonation_chain=impersonation_chain,
     )
Example #7
    def test_start_java_pipeline(self, mock_runner):
        hook = BeamHook(runner=DEFAULT_RUNNER)
        wait_for_done = mock_runner.return_value.wait_for_done
        process_line_callback = MagicMock()

        hook.start_java_pipeline(  # pylint: disable=no-value-for-parameter
            jar=JAR_FILE,
            variables=copy.deepcopy(BEAM_VARIABLES_JAVA),
            process_line_callback=process_line_callback,
        )

        expected_cmd = [
            'java',
            '-jar',
            JAR_FILE,
            f'--runner={DEFAULT_RUNNER}',
            '--output=gs://test/output',
            '--labels={"foo":"bar"}',
        ]
        mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
        wait_for_done.assert_called_once_with()
Example #8
    def test_start_python_pipeline(self, mock_runner):
        hook = BeamHook(runner=DEFAULT_RUNNER)
        wait_for_done = mock_runner.return_value.wait_for_done
        process_line_callback = MagicMock()

        hook.start_python_pipeline(  # pylint: disable=no-value-for-parameter
            variables=copy.deepcopy(BEAM_VARIABLES_PY),
            py_file=PY_FILE,
            py_options=PY_OPTIONS,
            process_line_callback=process_line_callback,
        )

        expected_cmd = [
            "python3",
            '-m',
            PY_FILE,
            f'--runner={DEFAULT_RUNNER}',
            '--output=gs://test/output',
            '--labels=foo=bar',
        ]
        mock_runner.assert_called_once_with(cmd=expected_cmd, process_line_callback=process_line_callback)
        wait_for_done.assert_called_once_with()
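The fixtures referenced by the two tests above are not part of the snippets; the asserted commands suggest values roughly like the ones below. Note how the same labels dict is rendered as --labels=foo=bar for the Python pipeline but as JSON (--labels={"foo":"bar"}) for the Java pipeline.

# Reconstructed fixtures (an assumption based on the asserted commands, not copied from the test module).
DEFAULT_RUNNER = "DirectRunner"
PY_FILE = "apache_beam.examples.wordcount"  # invoked via `python -m`, so presumably a module path
PY_OPTIONS = ["-m"]
JAR_FILE = "example.jar"                    # placeholder name
BEAM_VARIABLES_PY = {"output": "gs://test/output", "labels": {"foo": "bar"}}
BEAM_VARIABLES_JAVA = {"output": "gs://test/output", "labels": {"foo": "bar"}}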
Example #9
    def execute(self, context: 'Context'):
        """Execute the Apache Beam Pipeline."""
        self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
        self.dataflow_hook = DataflowHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            poll_sleep=self.poll_sleep,
            cancel_timeout=self.cancel_timeout,
            wait_until_finished=self.wait_until_finished,
        )
        job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)
        pipeline_options = copy.deepcopy(self.dataflow_default_options)

        pipeline_options["jobName"] = self.job_name
        pipeline_options["project"] = self.project_id or self.dataflow_hook.project_id
        pipeline_options["region"] = self.location
        pipeline_options.update(self.options)
        pipeline_options.setdefault("labels", {}).update(
            {"airflow-version": "v" + version.replace(".", "-").replace("+", "-")}
        )
        pipeline_options.update(self.options)

        def set_current_job_id(job_id):
            self.job_id = job_id

        process_line_callback = process_line_and_extract_dataflow_job_id_callback(
            on_new_job_id_callback=set_current_job_id)

        with ExitStack() as exit_stack:
            if self.jar.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(
                    gcs_hook.provide_file(object_url=self.jar))
                self.jar = tmp_gcs_file.name

            is_running = False
            if self.check_if_running != CheckJobRunning.IgnoreJob:
                is_running = self.dataflow_hook.is_job_dataflow_running(
                    name=self.job_name,
                    variables=pipeline_options,
                )
                while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
                    is_running = self.dataflow_hook.is_job_dataflow_running(
                        name=self.job_name,
                        variables=pipeline_options,
                    )
            if not is_running:
                pipeline_options["jobName"] = job_name
                with self.dataflow_hook.provide_authorized_gcloud():
                    self.beam_hook.start_java_pipeline(
                        variables=pipeline_options,
                        jar=self.jar,
                        job_class=self.job_class,
                        process_line_callback=process_line_callback,
                    )
                self.dataflow_hook.wait_for_done(
                    job_name=job_name,
                    location=self.location,
                    job_id=self.job_id,
                    multiple_jobs=self.multiple_jobs,
                )

        return {"job_id": self.job_id}
Example #10
    def execute(self, context: 'Context'):
        """Execute the Apache Beam Pipeline."""
        self.beam_hook = BeamHook(runner=self.runner)
        pipeline_options = self.default_pipeline_options.copy()
        process_line_callback: Optional[Callable] = None
        is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
        dataflow_job_name: Optional[str] = None

        if is_dataflow:
            dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
                pipeline_options=pipeline_options, job_name_variable_key=None)

        pipeline_options.update(self.pipeline_options)

        with ExitStack() as exit_stack:
            if self.jar.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(
                    gcs_hook.provide_file(object_url=self.jar))
                self.jar = tmp_gcs_file.name

            if is_dataflow and self.dataflow_hook:
                is_running = False
                if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                    is_running = (
                        # The project_id parameter is required but deliberately not passed here; it cannot
                        # be: is_job_dataflow_running is wrapped by the @_fallback_to_project_id_from_variables
                        # decorator, which falls back to the project_id value from variables and raises an
                        # error if project_id is defined both in variables and as a parameter (here it is
                        # already defined in variables).
                        self.dataflow_hook.is_job_dataflow_running(
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        )
                    )
                    while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                        # Same as above: project_id cannot be passed here; it is resolved from variables by
                        # the @_fallback_to_project_id_from_variables decorator.
                        is_running = self.dataflow_hook.is_job_dataflow_running(
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        )
                if not is_running:
                    pipeline_options["jobName"] = dataflow_job_name
                    with self.dataflow_hook.provide_authorized_gcloud():
                        self.beam_hook.start_java_pipeline(
                            variables=pipeline_options,
                            jar=self.jar,
                            job_class=self.job_class,
                            process_line_callback=process_line_callback,
                        )
                    if dataflow_job_name and self.dataflow_config.location:
                        multiple_jobs = self.dataflow_config.multiple_jobs or False
                        self.dataflow_hook.wait_for_done(
                            job_name=dataflow_job_name,
                            location=self.dataflow_config.location,
                            job_id=self.dataflow_job_id,
                            multiple_jobs=multiple_jobs,
                            project_id=self.dataflow_config.project_id,
                        )
            else:
                self.beam_hook.start_java_pipeline(
                    variables=pipeline_options,
                    jar=self.jar,
                    job_class=self.job_class,
                    process_line_callback=process_line_callback,
                )

        return {"dataflow_job_id": self.dataflow_job_id}
Example #11
    def execute(self, context):
        """Execute the Apache Beam Pipeline."""
        self.beam_hook = BeamHook(runner=self.runner)
        pipeline_options = self.default_pipeline_options.copy()
        process_line_callback: Optional[Callable] = None
        is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()

        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if is_dataflow:
            self.dataflow_hook = DataflowHook(
                gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id,
                delegate_to=self.dataflow_config.delegate_to or self.delegate_to,
                poll_sleep=self.dataflow_config.poll_sleep,
                impersonation_chain=self.dataflow_config.impersonation_chain,
                drain_pipeline=self.dataflow_config.drain_pipeline,
                cancel_timeout=self.dataflow_config.cancel_timeout,
                wait_until_finished=self.dataflow_config.wait_until_finished,
            )
            self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id

            self._dataflow_job_name = DataflowHook.build_dataflow_job_name(
                self.dataflow_config.job_name, self.dataflow_config.append_job_name
            )
            pipeline_options["jobName"] = self.dataflow_config.job_name
            pipeline_options["project"] = self.dataflow_config.project_id
            pipeline_options["region"] = self.dataflow_config.location
            pipeline_options.setdefault("labels", {}).update(
                {"airflow-version": "v" + version.replace(".", "-").replace("+", "-")}
            )

            def set_current_dataflow_job_id(job_id):
                self.dataflow_job_id = job_id

            process_line_callback = process_line_and_extract_dataflow_job_id_callback(
                on_new_job_id_callback=set_current_dataflow_job_id)

        pipeline_options.update(self.pipeline_options)

        with ExitStack() as exit_stack:
            if self.jar.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(  # pylint: disable=no-member
                    gcs_hook.provide_file(object_url=self.jar))
                self.jar = tmp_gcs_file.name

            if is_dataflow:
                is_running = False
                if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                    is_running = (
                        # The project_id parameter is required but deliberately not passed here; it cannot
                        # be: is_job_dataflow_running is wrapped by the @_fallback_to_project_id_from_variables
                        # decorator, which falls back to the project_id value from variables and raises an
                        # error if project_id is defined both in variables and as a parameter (here it is
                        # already defined in variables).
                        self.dataflow_hook.is_job_dataflow_running(  # pylint: disable=no-value-for-parameter
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        )
                    )
                    while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                        # Same as above: project_id cannot be passed here; it is resolved from variables by
                        # the @_fallback_to_project_id_from_variables decorator.
                        # pylint: disable=no-value-for-parameter
                        is_running = self.dataflow_hook.is_job_dataflow_running(
                            name=self.dataflow_config.job_name,
                            variables=pipeline_options,
                        )
                if not is_running:
                    pipeline_options["jobName"] = self._dataflow_job_name
                    self.beam_hook.start_java_pipeline(
                        variables=pipeline_options,
                        jar=self.jar,
                        job_class=self.job_class,
                        process_line_callback=process_line_callback,
                    )
                    self.dataflow_hook.wait_for_done(
                        job_name=self._dataflow_job_name,
                        location=self.dataflow_config.location,
                        job_id=self.dataflow_job_id,
                        multiple_jobs=self.dataflow_config.multiple_jobs,
                        project_id=self.dataflow_config.project_id,
                    )

            else:
                self.beam_hook.start_java_pipeline(
                    variables=pipeline_options,
                    jar=self.jar,
                    job_class=self.job_class,
                    process_line_callback=process_line_callback,
                )

        return {"dataflow_job_id": self.dataflow_job_id}
Example #12
    def execute(self, context):
        """Execute the Apache Beam Pipeline."""
        self.beam_hook = BeamHook(runner=self.runner)
        pipeline_options = self.default_pipeline_options.copy()
        process_line_callback: Optional[Callable] = None
        is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()

        if isinstance(self.dataflow_config, dict):
            self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

        if is_dataflow:
            self.dataflow_hook = DataflowHook(
                gcp_conn_id=self.dataflow_config.gcp_conn_id or self.gcp_conn_id,
                delegate_to=self.dataflow_config.delegate_to or self.delegate_to,
                poll_sleep=self.dataflow_config.poll_sleep,
                impersonation_chain=self.dataflow_config.impersonation_chain,
                drain_pipeline=self.dataflow_config.drain_pipeline,
                cancel_timeout=self.dataflow_config.cancel_timeout,
                wait_until_finished=self.dataflow_config.wait_until_finished,
            )
            self.dataflow_config.project_id = self.dataflow_config.project_id or self.dataflow_hook.project_id

            dataflow_job_name = DataflowHook.build_dataflow_job_name(
                self.dataflow_config.job_name, self.dataflow_config.append_job_name
            )
            pipeline_options["job_name"] = dataflow_job_name
            pipeline_options["project"] = self.dataflow_config.project_id
            pipeline_options["region"] = self.dataflow_config.location
            pipeline_options.setdefault("labels", {}).update(
                {"airflow-version": "v" + version.replace(".", "-").replace("+", "-")}
            )

            def set_current_dataflow_job_id(job_id):
                self.dataflow_job_id = job_id

            process_line_callback = process_line_and_extract_dataflow_job_id_callback(
                on_new_job_id_callback=set_current_dataflow_job_id)

        pipeline_options.update(self.pipeline_options)

        # Convert argument names from lowerCamelCase to snake case.
        formatted_pipeline_options = {
            convert_camel_to_snake(key): pipeline_options[key]
            for key in pipeline_options
        }

        with ExitStack() as exit_stack:
            if self.py_file.lower().startswith("gs://"):
                gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
                tmp_gcs_file = exit_stack.enter_context(  # pylint: disable=no-member
                    gcs_hook.provide_file(object_url=self.py_file))
                self.py_file = tmp_gcs_file.name

            self.beam_hook.start_python_pipeline(
                variables=formatted_pipeline_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=process_line_callback,
            )

            if is_dataflow:
                self.dataflow_hook.wait_for_done(  # pylint: disable=no-value-for-parameter
                    job_name=dataflow_job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=False,
                )

        return {"dataflow_job_id": self.dataflow_job_id}