def execute_callable(self):
    """Run the python callable inside a freshly prepared virtualenv.

    A temporary directory hosts both the virtualenv and the serialized
    argument/result exchange files; everything is removed on exit.
    """
    with TemporaryDirectory(prefix='venv') as tmp_dir:
        if self.templates_dict:
            # Expose rendered templates to the callable via its kwargs.
            self.op_kwargs['templates_dict'] = self.templates_dict

        # Exchange-file and generated-script locations inside the temp dir.
        arg_file = os.path.join(tmp_dir, 'script.in')
        result_file = os.path.join(tmp_dir, 'script.out')
        string_args_file = os.path.join(tmp_dir, 'string_args.txt')
        script_file = os.path.join(tmp_dir, 'script.py')

        # Optionally pin the interpreter version for the new environment.
        if self.python_version:
            requested_python = f'python{self.python_version}'
        else:
            requested_python = None
        prepare_virtualenv(
            venv_directory=tmp_dir,
            python_bin=requested_python,
            system_site_packages=self.system_site_packages,
            requirements=self.requirements,
        )

        self._write_args(arg_file)
        self._write_string_args(string_args_file)

        callable_source = dedent(inspect.getsource(self.python_callable))
        write_python_script(
            jinja_context=dict(
                op_args=self.op_args,
                op_kwargs=self.op_kwargs,
                pickling_library=self.pickling_library.__name__,
                python_callable=self.python_callable.__name__,
                python_callable_source=callable_source,
            ),
            filename=script_file,
        )

        # Run the generated script with the venv's own interpreter.
        execute_in_subprocess(
            cmd=[
                f'{tmp_dir}/bin/python',
                script_file,
                arg_file,
                result_file,
                string_args_file,
            ]
        )
        return self._read_result(result_file)
def execute_callable(self):
    """Execute the python callable in an isolated, temporary virtualenv."""
    with TemporaryDirectory(prefix='venv') as tmp_dir:
        if self.templates_dict:
            self.op_kwargs['templates_dict'] = self.templates_dict

        # Paths for the generated script and its exchange files.
        script_in = os.path.join(tmp_dir, 'script.in')
        script_out = os.path.join(tmp_dir, 'script.out')
        string_args_path = os.path.join(tmp_dir, 'string_args.txt')
        script_path = os.path.join(tmp_dir, 'script.py')

        # Build the virtualenv, optionally pinning the interpreter version.
        interpreter = 'python' + str(self.python_version) if self.python_version else None
        prepare_virtualenv(
            venv_directory=tmp_dir,
            python_bin=interpreter,
            system_site_packages=self.system_site_packages,
            requirements=self.requirements,
        )

        self._write_args(script_in)
        self._write_script(script_path)
        self._write_string_args(string_args_path)

        # Run the generated script with the venv interpreter and collect the result.
        run_cmd = self._generate_python_cmd(
            tmp_dir, script_path, script_in, script_out, string_args_path
        )
        execute_in_subprocess(run_cmd)
        return self._read_result(script_out)
def execute_callable(self):
    """Materialize a requirements file, build a virtualenv from it, and run the callable."""
    with TemporaryDirectory(prefix='venv') as tmp_dir:
        # Write the requirements file the venv will be installed from.
        requirements_path = f'{tmp_dir}/requirements.txt'
        if isinstance(self.requirements, str):
            requirements_text = self.requirements
        else:
            requirements_text = "\n".join(str(dependency) for dependency in self.requirements)
        if not self.system_site_packages and self.use_dill:
            # dill must be importable inside the venv for (de)serialization.
            requirements_text += '\ndill'
        with open(requirements_path, 'w') as requirements_file:
            requirements_file.write(requirements_text)

        if self.templates_dict:
            # Expose rendered templates to the callable via its kwargs.
            self.op_kwargs['templates_dict'] = self.templates_dict

        # Exchange-file and generated-script locations inside the temp dir.
        input_path = os.path.join(tmp_dir, 'script.in')
        output_path = os.path.join(tmp_dir, 'script.out')
        string_args_path = os.path.join(tmp_dir, 'string_args.txt')
        script_path = os.path.join(tmp_dir, 'script.py')

        prepare_virtualenv(
            venv_directory=tmp_dir,
            python_bin=f'python{self.python_version}' if self.python_version else None,
            system_site_packages=self.system_site_packages,
            requirements_file_path=requirements_path,
            pip_install_options=self.pip_install_options,
        )

        self._write_args(input_path)
        self._write_string_args(string_args_path)
        write_python_script(
            jinja_context=dict(
                op_args=self.op_args,
                op_kwargs=self.op_kwargs,
                pickling_library=self.pickling_library.__name__,
                python_callable=self.python_callable.__name__,
                python_callable_source=self.get_python_source(),
            ),
            filename=script_path,
            render_template_as_native_obj=self.dag.render_template_as_native_obj,
        )

        # Run the generated script with the venv's interpreter.
        execute_in_subprocess(
            cmd=[
                f'{tmp_dir}/bin/python',
                script_path,
                input_path,
                output_path,
                string_args_path,
            ]
        )
        return self._read_result(output_path)
def test_should_create_virtualenv(self, mock_execute_in_subprocess):
    """Plain venv: returns the venv python path and invokes virtualenv once."""
    venv_python = prepare_virtualenv(
        venv_directory="/VENV",
        python_bin="pythonVER",
        system_site_packages=False,
        requirements=[],
    )
    self.assertEqual("/VENV/bin/python", venv_python)
    mock_execute_in_subprocess.assert_called_once_with(
        ['virtualenv', '/VENV', '--python=pythonVER']
    )
def test_should_create_virtualenv_with_system_packages(self, mock_execute_in_subprocess):
    """The system-site-packages flag must be forwarded to the virtualenv command."""
    venv_python = prepare_virtualenv(
        venv_directory="/VENV",
        python_bin="pythonVER",
        system_site_packages=True,
        requirements=[],
    )
    assert venv_python == "/VENV/bin/python"
    mock_execute_in_subprocess.assert_called_once_with(
        ['virtualenv', '/VENV', '--system-site-packages', '--python=pythonVER']
    )
def test_should_create_virtualenv_with_extra_packages(self, mock_execute_in_subprocess):
    """Requirements should trigger a pip install step after the venv is created."""
    venv_python = prepare_virtualenv(
        venv_directory="/VENV",
        python_bin="pythonVER",
        system_site_packages=False,
        requirements=['apache-beam[gcp]'],
    )
    self.assertEqual("/VENV/bin/python", venv_python)
    # First the venv itself is created ...
    mock_execute_in_subprocess.assert_any_call(
        ['virtualenv', '/VENV', '--python=pythonVER']
    )
    # ... then the requirements are installed with the venv's pip.
    mock_execute_in_subprocess.assert_called_with(
        ['/VENV/bin/pip', 'install', 'apache-beam[gcp]']
    )
def start_python_dataflow(  # pylint: disable=too-many-arguments
    self,
    job_name: str,
    variables: dict,
    dataflow: str,
    py_options: List[str],
    project_id: str,
    py_interpreter: str = "python3",
    py_requirements: Optional[List[str]] = None,
    py_system_site_packages: bool = False,
    append_job_name: bool = True,
    on_new_job_id_callback: Optional[Callable[[str], None]] = None,
    location: str = DEFAULT_DATAFLOW_LOCATION,
):
    """
    Starts Dataflow job.

    :param job_name: The name of the job.
    :type job_name: str
    :param variables: Variables passed to the job.
    :type variables: Dict
    :param dataflow: Name of the Dataflow process.
    :type dataflow: str
    :param py_options: Additional options.
    :type py_options: List[str]
    :param project_id: The ID of the GCP project that owns the job.
        If set to ``None`` or missing, the default project_id from the GCP connection is used.
    :type project_id: Optional[str]
    :param py_interpreter: Python version of the beam pipeline.
        If None, this defaults to the python3.
        To track python versions supported by beam and related
        issues check: https://issues.apache.org/jira/browse/BEAM-1251
    :type py_interpreter: str
    :param py_requirements: Additional python package(s) to install.
        If a value is passed to this parameter, a new virtual environment has been created with
        additional packages installed.

        You could also install the apache-beam package if it is not installed on your system or you want
        to use a different version.
    :type py_requirements: List[str]
    :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
        See virtualenv documentation for more information.

        This option is only relevant if the ``py_requirements`` parameter is not None.
    :type py_system_site_packages: bool
    :param append_job_name: True if unique suffix has to be appended to job name.
    :type append_job_name: bool
    :param on_new_job_id_callback: Callback called when the job ID is known.
    :type on_new_job_id_callback: callable
    :param location: Job location.
    :type location: str
    """
    name = self._build_dataflow_job_name(job_name, append_job_name)
    variables['job_name'] = name
    variables['region'] = location

    def label_formatter(labels_dict):
        # Render job labels as repeated ``--labels=key=value`` CLI arguments.
        return ['--labels={}={}'.format(key, value) for key, value in labels_dict.items()]

    def _submit(interpreter: str) -> None:
        # Single submission path shared by the venv and system-python branches;
        # previously this call was duplicated verbatim in both branches.
        self._start_dataflow(
            variables=variables,
            name=name,
            command_prefix=[interpreter] + py_options + [dataflow],
            label_formatter=label_formatter,
            project_id=project_id,
            on_new_job_id_callback=on_new_job_id_callback,
            location=location,
        )

    if py_requirements is not None:
        if not py_requirements and not py_system_site_packages:
            # No packages to install and no access to system packages: the
            # resulting venv could never run a Beam pipeline — fail fast.
            warning_invalid_environment = textwrap.dedent(
                """\
                Invalid method invocation. You have disabled inclusion of system packages and empty list
                required for installation, so it is not possible to create a valid virtual environment.
                In the virtual environment, apache-beam package must be installed for your job to be \
                executed. To fix this problem:
                * install apache-beam on the system, then set parameter py_system_site_packages to True,
                * add apache-beam to the list of required packages in parameter py_requirements.
                """
            )
            raise AirflowException(warning_invalid_environment)
        # Keep the TemporaryDirectory alive around the submission: the venv's
        # interpreter must still exist while the subprocess runs.
        with TemporaryDirectory(prefix='dataflow-venv') as tmp_dir:
            venv_interpreter = prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=py_interpreter,
                system_site_packages=py_system_site_packages,
                requirements=py_requirements,
            )
            _submit(venv_interpreter)
    else:
        _submit(py_interpreter)
def start_python_pipeline(
    self,
    variables: dict,
    py_file: str,
    py_options: List[str],
    py_interpreter: str = "python3",
    py_requirements: Optional[List[str]] = None,
    py_system_site_packages: bool = False,
    process_line_callback: Optional[Callable[[str], None]] = None,
):
    """
    Starts Apache Beam python pipeline.

    :param variables: Variables passed to the pipeline.
    :param py_file: Path to the python file to execute.
    :param py_options: Additional options.
    :param py_interpreter: Python version of the Apache Beam pipeline.
        If None, this defaults to the python3.
        To track python versions supported by beam and related
        issues check: https://issues.apache.org/jira/browse/BEAM-1251
    :param py_requirements: Additional python package(s) to install.
        If a value is passed to this parameter, a new virtual environment has been created with
        additional packages installed.

        You could also install the apache-beam package if it is not installed on your system or you want
        to use a different version.
    :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
        See virtualenv documentation for more information.

        This option is only relevant if the ``py_requirements`` parameter is not None.
    :param process_line_callback: (optional) Callback that can be used to process each line of
        the stdout and stderr file descriptors.
    """
    if "labels" in variables:
        # Beam expects labels as repeated "key=value" strings, not a dict.
        variables["labels"] = [f"{key}={value}" for key, value in variables["labels"].items()]

    def _launch(interpreter: str) -> None:
        # Single launch path shared by the venv and system-python branches;
        # previously this call was duplicated verbatim in both branches.
        self._start_pipeline(
            variables=variables,
            command_prefix=[interpreter] + py_options + [py_file],
            process_line_callback=process_line_callback,
        )

    if py_requirements is not None:
        if not py_requirements and not py_system_site_packages:
            # No packages to install and no access to system packages: the
            # resulting venv could never run a Beam pipeline — fail fast.
            warning_invalid_environment = textwrap.dedent(
                """\
                Invalid method invocation. You have disabled inclusion of system packages and empty list
                required for installation, so it is not possible to create a valid virtual environment.
                In the virtual environment, apache-beam package must be installed for your job to be \
                executed.

                To fix this problem:
                * install apache-beam on the system, then set parameter py_system_site_packages to True,
                * add apache-beam to the list of required packages in parameter py_requirements.
                """
            )
            raise AirflowException(warning_invalid_environment)
        # Keep the TemporaryDirectory alive around the launch: the venv's
        # interpreter must still exist while the subprocess runs.
        with TemporaryDirectory(prefix="apache-beam-venv") as tmp_dir:
            venv_interpreter = prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=py_interpreter,
                system_site_packages=py_system_site_packages,
                requirements=py_requirements,
            )
            _launch(venv_interpreter)
    else:
        _launch(py_interpreter)
def start_python_dataflow(  # pylint: disable=too-many-arguments
        self,
        job_name: str,
        variables: Dict,
        dataflow: str,
        py_options: List[str],
        py_interpreter: str = "python3",
        py_requirements: Optional[List[str]] = None,
        py_system_site_packages: bool = False,
        project_id: Optional[str] = None,
        append_job_name: bool = True,
        on_new_job_id_callback: Optional[Callable[[str], None]] = None):
    """
    Starts Dataflow job.

    :param job_name: The name of the job.
    :type job_name: str
    :param variables: Variables passed to the job.
    :type variables: Dict
    :param dataflow: Name of the Dataflow process.
    :type dataflow: str
    :param py_options: Additional options.
    :type py_options: List[str]
    :param py_interpreter: Python version of the beam pipeline.
        If None, this defaults to the python3.
        To track python versions supported by beam and related
        issues check: https://issues.apache.org/jira/browse/BEAM-1251
    :type py_interpreter: str
    :param py_requirements: Additional python package(s) to install.
        If a value is passed to this parameter, a new virtual environment has been created with
        additional packages installed.

        You could also install the apache-beam package if it is not installed on your system or you want
        to use a different version.
    :type py_requirements: List[str]
    :param py_system_site_packages: Whether to include system_site_packages in your virtualenv.
        See virtualenv documentation for more information.

        This option is only relevant if the ``py_requirements`` parameter is passed.
    :type py_system_site_packages: bool
    :param append_job_name: True if unique suffix has to be appended to job name.
    :type append_job_name: bool
    :param project_id: Optional, the GCP project ID in which to start a job.
        If set to None or missing, the default project_id from the GCP connection is used.
    :param on_new_job_id_callback: Callback called when the job ID is known.
    :type on_new_job_id_callback: callable
    :raises ValueError: If ``project_id`` is not provided.
    """
    if not project_id:
        raise ValueError("The project_id should be set")

    name = self._build_dataflow_job_name(job_name, append_job_name)
    variables['job_name'] = name

    def label_formatter(labels_dict):
        # Render job labels as repeated ``--labels=key=value`` CLI arguments.
        return ['--labels={}={}'.format(key, value) for key, value in labels_dict.items()]

    def _submit(interpreter: str) -> None:
        # Single submission path shared by the venv and system-python branches;
        # previously this call was duplicated verbatim in both branches.
        self._start_dataflow(
            variables=variables,
            name=name,
            command_prefix=[interpreter] + py_options + [dataflow],
            label_formatter=label_formatter,
            project_id=project_id,
            on_new_job_id_callback=on_new_job_id_callback)

    if py_requirements is not None:
        # Keep the TemporaryDirectory alive around the submission: the venv's
        # interpreter must still exist while the subprocess runs.
        with TemporaryDirectory(prefix='dataflow-venv') as tmp_dir:
            venv_interpreter = prepare_virtualenv(
                venv_directory=tmp_dir,
                python_bin=py_interpreter,
                system_site_packages=py_system_site_packages,
                requirements=py_requirements,
            )
            _submit(venv_interpreter)
    else:
        _submit(py_interpreter)