Example #1
0
    def execute_callable(self):
        """Run the wrapped callable inside a freshly prepared virtualenv.

        Serializes arguments to scratch files, renders a runner script,
        executes it with the venv's interpreter and reads back the result.
        """
        with TemporaryDirectory(prefix='venv') as tmp_dir:
            if self.templates_dict:
                self.op_kwargs['templates_dict'] = self.templates_dict

            # Scratch files exchanged with the subprocess live inside the venv dir.
            arg_file = os.path.join(tmp_dir, 'script.in')
            result_file = os.path.join(tmp_dir, 'script.out')
            string_args_file = os.path.join(tmp_dir, 'string_args.txt')
            script_file = os.path.join(tmp_dir, 'script.py')

            # Honour an explicit python version if one was requested.
            python_bin = f'python{self.python_version}' if self.python_version else None
            prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=python_bin,
                system_site_packages=self.system_site_packages,
                requirements=self.requirements,
            )

            self._write_args(arg_file)
            self._write_string_args(string_args_file)
            jinja_context = dict(
                op_args=self.op_args,
                op_kwargs=self.op_kwargs,
                pickling_library=self.pickling_library.__name__,
                python_callable=self.python_callable.__name__,
                python_callable_source=dedent(inspect.getsource(self.python_callable)),
            )
            write_python_script(jinja_context=jinja_context, filename=script_file)

            execute_in_subprocess(cmd=[
                f'{tmp_dir}/bin/python',
                script_file,
                arg_file,
                result_file,
                string_args_file,
            ])

            return self._read_result(result_file)
Example #2
0
    def execute_callable(self):
        """Execute the python callable inside an isolated virtualenv subprocess."""
        with TemporaryDirectory(prefix='venv') as tmp_dir:
            if self.templates_dict:
                self.op_kwargs['templates_dict'] = self.templates_dict

            # Files used to pass data to and from the subprocess.
            in_path = os.path.join(tmp_dir, 'script.in')
            out_path = os.path.join(tmp_dir, 'script.out')
            string_args_path = os.path.join(tmp_dir, 'string_args.txt')
            script_path = os.path.join(tmp_dir, 'script.py')

            # Build the venv, honouring an explicit python version when given.
            if self.python_version:
                python_bin = 'python' + str(self.python_version)
            else:
                python_bin = None
            prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=python_bin,
                system_site_packages=self.system_site_packages,
                requirements=self.requirements,
            )

            self._write_args(in_path)
            self._write_script(script_path)
            self._write_string_args(string_args_path)

            # Run the generated script with the virtualenv's interpreter.
            cmd = self._generate_python_cmd(
                tmp_dir, script_path, in_path, out_path, string_args_path)
            execute_in_subprocess(cmd)
            return self._read_result(out_path)
Example #3
0
    def execute_callable(self):
        """Run the python callable inside a throw-away virtualenv.

        Builds a requirements file (adding ``dill`` when needed), prepares the
        virtualenv, serializes the call arguments, renders the runner script,
        executes it in a subprocess and finally reads back the result.
        """
        with TemporaryDirectory(prefix='venv') as tmp_dir:
            requirements_file_name = f'{tmp_dir}/requirements.txt'

            if isinstance(self.requirements, str):
                requirements_file_contents = self.requirements
            else:
                requirements_file_contents = "\n".join(
                    str(dependency) for dependency in self.requirements)

            # dill must be importable inside the venv to (un)pickle arguments.
            if not self.system_site_packages and self.use_dill:
                requirements_file_contents += '\ndill'

            # Explicit encoding: requirement specifiers may contain non-ASCII
            # text, and the platform default encoding is not guaranteed.
            with open(requirements_file_name, 'w', encoding='utf-8') as file:
                file.write(requirements_file_contents)

            if self.templates_dict:
                self.op_kwargs['templates_dict'] = self.templates_dict

            input_filename = os.path.join(tmp_dir, 'script.in')
            output_filename = os.path.join(tmp_dir, 'script.out')
            string_args_filename = os.path.join(tmp_dir, 'string_args.txt')
            script_filename = os.path.join(tmp_dir, 'script.py')

            prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=f'python{self.python_version}'
                if self.python_version else None,
                system_site_packages=self.system_site_packages,
                requirements_file_path=requirements_file_name,
                pip_install_options=self.pip_install_options,
            )

            self._write_args(input_filename)
            self._write_string_args(string_args_filename)
            write_python_script(
                jinja_context=dict(
                    op_args=self.op_args,
                    op_kwargs=self.op_kwargs,
                    pickling_library=self.pickling_library.__name__,
                    python_callable=self.python_callable.__name__,
                    python_callable_source=self.get_python_source(),
                ),
                filename=script_filename,
                render_template_as_native_obj=self.dag.render_template_as_native_obj,
            )

            execute_in_subprocess(cmd=[
                f'{tmp_dir}/bin/python',
                script_filename,
                input_filename,
                output_filename,
                string_args_filename,
            ])

            return self._read_result(output_filename)
Example #4
0
 def test_should_create_virtualenv(self, mock_execute_in_subprocess):
     """Without system packages, exactly one virtualenv call is issued."""
     venv_python = prepare_virtualenv(
         venv_directory="/VENV",
         python_bin="pythonVER",
         system_site_packages=False,
         requirements=[],
     )
     self.assertEqual("/VENV/bin/python", venv_python)
     mock_execute_in_subprocess.assert_called_once_with(
         ['virtualenv', '/VENV', '--python=pythonVER'])
 def test_should_create_virtualenv_with_system_packages(
         self, mock_execute_in_subprocess):
     """System-site-packages mode adds the matching virtualenv flag."""
     venv_python = prepare_virtualenv(
         venv_directory="/VENV",
         python_bin="pythonVER",
         system_site_packages=True,
         requirements=[],
     )
     assert "/VENV/bin/python" == venv_python
     expected_cmd = [
         'virtualenv', '/VENV', '--system-site-packages',
         '--python=pythonVER'
     ]
     mock_execute_in_subprocess.assert_called_once_with(expected_cmd)
Example #6
0
    def test_should_create_virtualenv_with_extra_packages(
            self, mock_execute_in_subprocess):
        """Requirements are pip-installed into the venv after creation."""
        venv_python = prepare_virtualenv(
            venv_directory="/VENV",
            python_bin="pythonVER",
            system_site_packages=False,
            requirements=['apache-beam[gcp]'],
        )
        self.assertEqual("/VENV/bin/python", venv_python)

        # First the venv is created ...
        mock_execute_in_subprocess.assert_any_call(
            ['virtualenv', '/VENV', '--python=pythonVER'])

        # ... then the extra package is installed with the venv's pip.
        mock_execute_in_subprocess.assert_called_with(
            ['/VENV/bin/pip', 'install', 'apache-beam[gcp]'])
Example #7
0
    def start_python_dataflow(  # pylint: disable=too-many-arguments
        self,
        job_name: str,
        variables: dict,
        dataflow: str,
        py_options: List[str],
        project_id: str,
        py_interpreter: str = "python3",
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        append_job_name: bool = True,
        on_new_job_id_callback: Optional[Callable[[str], None]] = None,
        location: str = DEFAULT_DATAFLOW_LOCATION,
    ):
        """
        Starts Dataflow job.

        :param job_name: The name of the job.
        :type job_name: str
        :param variables: Variables passed to the job.
        :type variables: Dict
        :param dataflow: Name of the Dataflow process.
        :type dataflow: str
        :param py_options: Additional options.
        :type py_options: List[str]
        :param project_id: The ID of the GCP project that owns the job.
            If set to ``None`` or missing, the default project_id from the GCP connection is used.
        :type project_id: Optional[str]
        :param py_interpreter: Python version of the beam pipeline.
            If None, this defaults to the python3.
            To track python versions supported by beam and related
            issues check: https://issues.apache.org/jira/browse/BEAM-1251
        :type py_interpreter: str
        :param py_requirements: Additional python package(s) to install.
            If a value is passed to this parameter, a new virtual environment has been created with
            additional packages installed.

            You could also install the apache-beam package if it is not installed on your system or you want
            to use a different version.
        :type py_requirements: List[str]
        :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
            See virtualenv documentation for more information.

            This option is only relevant if the ``py_requirements`` parameter is not None.
        :type py_system_site_packages: bool
        :param append_job_name: True if unique suffix has to be appended to job name.
        :type append_job_name: bool
        :param on_new_job_id_callback: Callback called when the job ID is known.
        :type on_new_job_id_callback: callable
        :param location: Job location.
        :type location: str
        """
        name = self._build_dataflow_job_name(job_name, append_job_name)
        variables['job_name'] = name
        variables['region'] = location

        def label_formatter(labels_dict):
            # Render labels as ``--labels=key=value`` CLI arguments.
            return [f'--labels={key}={value}' for key, value in labels_dict.items()]

        def submit_with_interpreter(interpreter):
            # Single submission path so both branches below stay in sync.
            self._start_dataflow(
                variables=variables,
                name=name,
                command_prefix=[interpreter] + py_options + [dataflow],
                label_formatter=label_formatter,
                project_id=project_id,
                on_new_job_id_callback=on_new_job_id_callback,
                location=location,
            )

        if py_requirements is not None:
            if not py_requirements and not py_system_site_packages:
                warning_invalid_environment = textwrap.dedent("""\
                    Invalid method invocation. You have disabled inclusion of system packages and empty list
                    required for installation, so it is not possible to create a valid virtual environment.
                    In the virtual environment, apache-beam package must be installed for your job to be \
                    executed. To fix this problem:
                    * install apache-beam on the system, then set parameter py_system_site_packages to True,
                    * add apache-beam to the list of required packages in parameter py_requirements.
                    """)
                raise AirflowException(warning_invalid_environment)

            # The venv (and its temporary directory) must outlive job submission,
            # so submit while the TemporaryDirectory is still open.
            with TemporaryDirectory(prefix='dataflow-venv') as tmp_dir:
                venv_interpreter = prepare_virtualenv(
                    venv_directory=tmp_dir,
                    python_bin=py_interpreter,
                    system_site_packages=py_system_site_packages,
                    requirements=py_requirements,
                )
                submit_with_interpreter(venv_interpreter)
        else:
            submit_with_interpreter(py_interpreter)
Example #8
0
    def start_python_pipeline(
        self,
        variables: dict,
        py_file: str,
        py_options: List[str],
        py_interpreter: str = "python3",
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        process_line_callback: Optional[Callable[[str], None]] = None,
    ):
        """
        Starts Apache Beam python pipeline.

        :param variables: Variables passed to the pipeline.
        :param py_file: Path to the python file to execute.
        :param py_options: Additional options.
        :param py_interpreter: Python version of the Apache Beam pipeline.
            If None, this defaults to the python3.
            To track python versions supported by beam and related
            issues check: https://issues.apache.org/jira/browse/BEAM-1251
        :param py_requirements: Additional python package(s) to install.
            If a value is passed to this parameter, a new virtual environment has been created with
            additional packages installed.

            You could also install the apache-beam package if it is not installed on your system or you want
            to use a different version.
        :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
            See virtualenv documentation for more information.

            This option is only relevant if the ``py_requirements`` parameter is not None.
        :param process_line_callback: (optional) Callback that can be used to process each line of
            the stdout and stderr file descriptors.
        """
        # Dataflow expects labels as a list of ``key=value`` strings.
        if "labels" in variables:
            variables["labels"] = [f"{key}={value}" for key, value in variables["labels"].items()]

        def run_with_interpreter(interpreter):
            # Single launch path so both branches below stay in sync.
            self._start_pipeline(
                variables=variables,
                command_prefix=[interpreter] + py_options + [py_file],
                process_line_callback=process_line_callback,
            )

        if py_requirements is not None:
            if not py_requirements and not py_system_site_packages:
                warning_invalid_environment = textwrap.dedent(
                    """\
                    Invalid method invocation. You have disabled inclusion of system packages and empty list
                    required for installation, so it is not possible to create a valid virtual environment.
                    In the virtual environment, apache-beam package must be installed for your job to be \
                    executed. To fix this problem:
                    * install apache-beam on the system, then set parameter py_system_site_packages to True,
                    * add apache-beam to the list of required packages in parameter py_requirements.
                    """
                )
                raise AirflowException(warning_invalid_environment)

            # The venv must still exist while the pipeline process starts,
            # so launch inside the TemporaryDirectory context.
            with TemporaryDirectory(prefix="apache-beam-venv") as tmp_dir:
                venv_interpreter = prepare_virtualenv(
                    venv_directory=tmp_dir,
                    python_bin=py_interpreter,
                    system_site_packages=py_system_site_packages,
                    requirements=py_requirements,
                )
                run_with_interpreter(venv_interpreter)
        else:
            run_with_interpreter(py_interpreter)
Example #9
0
    def start_python_dataflow(  # pylint: disable=too-many-arguments
            self,
            job_name: str,
            variables: Dict,
            dataflow: str,
            py_options: List[str],
            py_interpreter: str = "python3",
            py_requirements: Optional[List[str]] = None,
            py_system_site_packages: bool = False,
            project_id: Optional[str] = None,
            append_job_name: bool = True,
            on_new_job_id_callback: Optional[Callable[[str], None]] = None):
        """
        Starts Dataflow job.

        :param job_name: The name of the job.
        :type job_name: str
        :param variables: Variables passed to the job.
        :type variables: Dict
        :param dataflow: Name of the Dataflow process.
        :type dataflow: str
        :param py_options: Additional options.
        :type py_options: List[str]
        :param py_interpreter: Python version of the beam pipeline.
            If None, this defaults to the python3.
            To track python versions supported by beam and related
            issues check: https://issues.apache.org/jira/browse/BEAM-1251
        :type py_interpreter: str
        :param py_requirements: Additional python package(s) to install.
            If a value is passed to this parameter, a new virtual environment has been created with
            additional packages installed.

            You could also install the apache-beam package if it is not installed on your system or you want
            to use a different version.
        :type py_requirements: List[str]
        :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
            See virtualenv documentation for more information.

            This option is only relevant if the ``py_requirements`` parameter is passed.
        :type py_system_site_packages: bool
        :param append_job_name: True if unique suffix has to be appended to job name.
        :type append_job_name: bool
        :param project_id: Optional, the GCP project ID in which to start a job.
            If set to None or missing, the default project_id from the GCP connection is used.
        :param on_new_job_id_callback: Callback called when the job ID is known.
        :type on_new_job_id_callback: callable
        :raises ValueError: If ``project_id`` is not provided.
        """
        if not project_id:
            raise ValueError("The project_id should be set")

        name = self._build_dataflow_job_name(job_name, append_job_name)
        variables['job_name'] = name

        def label_formatter(labels_dict):
            # Render labels as ``--labels=key=value`` CLI arguments.
            return [f'--labels={key}={value}' for key, value in labels_dict.items()]

        def submit_with_interpreter(interpreter):
            # Single submission path so both branches below stay in sync.
            self._start_dataflow(
                variables=variables,
                name=name,
                command_prefix=[interpreter] + py_options + [dataflow],
                label_formatter=label_formatter,
                project_id=project_id,
                on_new_job_id_callback=on_new_job_id_callback)

        if py_requirements is not None:
            # The venv (and its temporary directory) must outlive job submission,
            # so submit while the TemporaryDirectory is still open.
            with TemporaryDirectory(prefix='dataflow-venv') as tmp_dir:
                venv_interpreter = prepare_virtualenv(
                    venv_directory=tmp_dir,
                    python_bin=py_interpreter,
                    system_site_packages=py_system_site_packages,
                    requirements=py_requirements,
                )
                submit_with_interpreter(venv_interpreter)
        else:
            submit_with_interpreter(py_interpreter)