def test_injected_hook(self):
    """A hook assigned to ``_livy_hook`` is returned verbatim by ``get_hook``."""
    injected = LivyHook(livy_conn_id='livyunittest')
    operator = LivyOperator(file='sparkapp', dag=self.dag, task_id='livy_example')
    operator._livy_hook = injected
    assert operator.get_hook() == injected
def test_execution_with_extra_options(self, mock_post):
    """``extra_options`` given to the operator propagate to its hook."""
    opts = {'check_response': True}
    operator = LivyOperator(
        file='sparkapp',
        dag=self.dag,
        task_id='livy_example',
        extra_options=opts,
    )
    operator.execute(context={})
    assert operator.get_hook().extra_options == opts
def test_execution(self, mock_post, mock_get):
    """Executing the operator submits the batch file and polls it by id."""
    operator = LivyOperator(
        livy_conn_id='livyunittest',
        file='sparkapp',
        polling_interval=1,
        dag=self.dag,
        task_id='livy_example',
    )
    operator.execute(context={})
    # Only the kwargs that were actually set (truthy values) matter here.
    submitted = {key: value for key, value in mock_post.call_args[1].items() if value}
    self.assertEqual(submitted, {'file': 'sparkapp'})
    mock_get.assert_called_once_with(BATCH_ID)
def test_poll_for_termination_fail(self, mock_livy):
    """Polling raises on a terminal ERROR state and stops querying at once."""
    pending = 2 * [BatchState.RUNNING] + [BatchState.ERROR]

    def fake_state(_):
        if pending:
            return pending.pop(0)
        # fail if does not stop right before
        raise AssertionError()

    mock_livy.side_effect = fake_state

    operator = LivyOperator(
        file='sparkapp', polling_interval=1, dag=self.dag, task_id='livy_example'
    )
    operator._livy_hook = operator.get_hook()

    with self.assertRaises(AirflowException):
        operator.poll_for_termination(BATCH_ID)

    mock_livy.assert_called_with(BATCH_ID)
    self.assertEqual(mock_livy.call_count, 3)
def test_deletion(self, mock_post, mock_delete):
    """Killing an executed operator deletes the submitted Livy batch."""
    operator = LivyOperator(
        livy_conn_id='livyunittest',
        file='sparkapp',
        dag=self.dag,
        task_id='livy_example',
    )
    operator.execute(context={})
    operator.kill()
    mock_delete.assert_called_once_with(BATCH_ID)
def _build_livy_operator(
    task: str, spark_conf_extra: Optional[Dict[Any, Any]] = None
) -> LivyOperator:
    """Build a LivyOperator that runs ``main.py`` for the given *task*.

    The packaged virtualenv is shipped as a Spark archive and the base Spark
    conf points PYSPARK_PYTHON at it; *spark_conf_extra* entries are layered
    on top and win over the base conf on key collisions.
    """
    conf = {
        "spark.yarn.appMasterEnv.PYSPARK_PYTHON": "./env/bin/python",
    }
    if spark_conf_extra:
        conf.update(spark_conf_extra)
    return LivyOperator(
        task_id=task,
        file=f"{ETL_CODE_LOCATION}/main.py",
        args=["--task", task, "--execution-date", "{{ ds }}"],
        archives=[f"{ETL_CODE_LOCATION}/venv_build.tar.gz#env"],
        conf=conf,
        proxy_user=LIVY_PROXY_USER,
        livy_conn_id=LIVY_CONN_ID,
    )
# NOTE(review): this chunk opens mid-definition — the line below closes a
# default-args dict whose opening (presumably ``args = {``) precedes this view.
    'depends_on_past': False
}

with DAG(
    dag_id='example_livy_operator',
    default_args=args,
    schedule_interval='@daily',
    start_date=days_ago(5),
) as dag:
    # Submit the Spark Pi example jar as a Java batch through Livy.
    livy_java_task = LivyOperator(
        task_id='pi_java_task',
        dag=dag,
        livy_conn_id='livy_conn_default',
        file='/spark-examples.jar',
        args=[10],
        num_executors=1,
        conf={
            'spark.shuffle.compress': 'false',
        },
        class_name='org.apache.spark.examples.SparkPi',
    )
    # Same computation as a Python batch, polled every 60 seconds.
    livy_python_task = LivyOperator(
        task_id='pi_python_task',
        dag=dag,
        livy_conn_id='livy_conn_default',
        file='/pi.py',
        args=[10],
        polling_interval=60,
    )
"""Minimal DAG that submits the Spark Pi example jar through Apache Livy."""
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.providers.apache.livy.operators.livy import LivyOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'livydag',
    default_args=default_args,
    description='Livy DAG',
    schedule_interval=timedelta(days=1),
)

# Run SparkPi once per schedule interval; poll the batch state every second.
livy_batch = LivyOperator(
    dag=dag,
    task_id="livy_batch",
    file="/opt/jars/spark-examples_2.11-2.4.6.jar",
    class_name="org.apache.spark.examples.SparkPi",
    polling_interval=1,
)
# NOTE(review): chunk opens mid-call — ``with DAG(`` and its leading arguments
# precede this view.
    default_args=default_args,
    description='A simple test livy DAG',
    schedule_interval=None,
    start_date=days_ago(0),
    tags=['test'],
) as dag:
    # Runtime doc shown in the Airflow UI. Kept verbatim (Russian for
    # "Test WF invoking a simple transformation") — it is a runtime string.
    dag.doc_md = dedent("""\
    Тестовый WF с вызовом простой трансформации
    """)

    aws_test = LivyOperator(
        task_id='aws_test2',
        dag=dag,
        livy_conn_id='livy_default',
        # Jar path is split across two adjacent string literals that Python
        # concatenates at compile time.
        file='s3a://datagram/user/root/deployments/autogenerated_tr_aws_test_2/'
        'ru.neoflex.meta.etl2.spark.aws_test_2-1.0-SNAPSHOT.jar',
        proxy_user='******',
        # Job configuration is passed as KEY=VALUE positional arguments.
        args=[
            'HOME=/user',
            'USER=root',
            'WF_HOME=/user/root',
            'ROOT_WORKFLOW_ID=aws_test_2_729508210',
            'CURRENT_WORKFLOW_ID=aws_test_2_729508210',
            'SLIDE_SIZE=400',
            'FETCH_SIZE=1000',
            'PARTITION_NUM=1',
            'FAIL_THRESHOLD=1000',
            'DEBUG=true',
            'MASTER='
        ],
        num_executors=1,
        conf={
            'spark.shuffle.compress': 'false',
        },
        class_name='ru.neoflex.meta.etl2.spark.aws_test_2Job',
        polling_interval=5,
    )