def test_python_callable_keyword_arguments_are_templatized(self):
    """Test PythonSensor op_kwargs are templatized"""
    recorded_calls = []

    task = PythonSensor(
        task_id='python_sensor',
        timeout=0.01,
        poke_interval=0.3,
        # a Mock instance cannot be used as a callable function or test fails with a
        # TypeError: Object of type Mock is not JSON serializable
        python_callable=build_recording_function(recorded_calls),
        op_kwargs={
            'an_int': 4,
            'a_date': date(2019, 1, 1),
            'a_templated_string': "dag {{dag.dag_id}} ran on {{ds}}.",
        },
        dag=self.dag,
    )

    self.dag.create_dagrun(
        run_type=DagRunType.MANUAL,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        state=State.RUNNING,
    )
    with self.assertRaises(AirflowSensorTimeout):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # 2 calls: first: at start, second: before timeout
    self.assertEqual(2, len(recorded_calls))
    self._assert_calls_equal(
        recorded_calls[0],
        Call(
            an_int=4,
            a_date=date(2019, 1, 1),
            a_templated_string="dag {} ran on {}.".format(
                self.dag.dag_id, DEFAULT_DATE.date().isoformat()
            ),
        ),
    )
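# The templatization tests above and below reference helpers defined elsewhere
# in the test module (build_recording_function, Call, _assert_calls_equal).
# A minimal sketch of what the recording helper might look like -- hypothetical,
# assuming only that it records every invocation and returns a falsy value so
# the sensor keeps poking until it times out:
class Call:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs


def build_recording_function(calls_collection):
    """Return a callable that appends each invocation to calls_collection."""

    def recording_function(*args, **kwargs):
        calls_collection.append(Call(*args, **kwargs))
        return False  # falsy, so the PythonSensor never succeeds

    return recording_function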
def test_python_sensor_raise(self):
    op = PythonSensor(
        task_id='python_sensor_check_raise',
        python_callable=lambda: 1 / 0,
        dag=self.dag,
    )
    with self.assertRaises(ZeroDivisionError):
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_python_sensor_true(self):
    op = PythonSensor(
        task_id='python_sensor_check_true',
        python_callable=lambda: True,
        dag=self.dag,
    )
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_python_sensor_false(self):
    op = PythonSensor(
        task_id='python_sensor_check_false',
        timeout=0.01,
        poke_interval=0.01,
        python_callable=lambda: False,
        dag=self.dag,
    )
    with self.assertRaises(AirflowSensorTimeout):
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_python_callable_arguments_are_templatized(self):
    """Test PythonSensor op_args are templatized"""
    recorded_calls = []

    # Create a named tuple and ensure it is still preserved
    # after the rendering is done
    Named = namedtuple('Named', ['var1', 'var2'])
    named_tuple = Named('{{ ds }}', 'unchanged')

    task = PythonSensor(
        task_id='python_sensor',
        timeout=0.01,
        poke_interval=0.3,
        # a Mock instance cannot be used as a callable function or test fails with a
        # TypeError: Object of type Mock is not JSON serializable
        python_callable=build_recording_function(recorded_calls),
        op_args=[4, date(2019, 1, 1), "dag {{dag.dag_id}} ran on {{ds}}.", named_tuple],
        dag=self.dag,
    )

    self.dag.create_dagrun(
        run_type=DagRunType.MANUAL,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        state=State.RUNNING,
    )
    with pytest.raises(AirflowSensorTimeout):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    ds_templated = DEFAULT_DATE.date().isoformat()

    # 2 calls: first: at start, second: before timeout
    assert 2 == len(recorded_calls)
    self._assert_calls_equal(
        recorded_calls[0],
        Call(
            4,
            date(2019, 1, 1),
            f"dag {self.dag.dag_id} ran on {ds_templated}.",
            Named(ds_templated, 'unchanged'),
        ),
    )
def test_clear_task_instances_with_task_reschedule(self):
    """Test that TaskReschedules are deleted correctly when TaskInstances are cleared"""
    with DAG(
        'test_clear_task_instances_with_task_reschedule',
        start_date=DEFAULT_DATE,
        end_date=DEFAULT_DATE + datetime.timedelta(days=10),
    ) as dag:
        task0 = PythonSensor(task_id='0', python_callable=lambda: False, mode="reschedule")
        task1 = PythonSensor(task_id='1', python_callable=lambda: False, mode="reschedule")

    ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
    ti1 = TI(task=task1, execution_date=DEFAULT_DATE)

    dag.create_dagrun(
        execution_date=ti0.execution_date,
        state=State.RUNNING,
        run_type=DagRunType.SCHEDULED,
    )

    ti0.run()
    ti1.run()

    with create_session() as session:

        def count_task_reschedule(task_id):
            return (
                session.query(TaskReschedule)
                .filter(
                    TaskReschedule.dag_id == dag.dag_id,
                    TaskReschedule.task_id == task_id,
                    TaskReschedule.execution_date == DEFAULT_DATE,
                    TaskReschedule.try_number == 1,
                )
                .count()
            )

        assert count_task_reschedule(ti0.task_id) == 1
        assert count_task_reschedule(ti1.task_id) == 1

        qry = session.query(TI).filter(TI.dag_id == dag.dag_id, TI.task_id == ti0.task_id).all()
        clear_task_instances(qry, session, dag=dag)

        assert count_task_reschedule(ti0.task_id) == 0
        assert count_task_reschedule(ti1.task_id) == 1
def test_get_classpath(self):
    # Test the classpath in/out airflow
    obj1 = NamedHivePartitionSensor(
        partition_names=['test_partition'], task_id='meta_partition_test_1'
    )
    obj1_classpath = SensorInstance.get_classpath(obj1)
    obj1_importpath = (
        "airflow.providers.apache.hive.sensors.named_hive_partition.NamedHivePartitionSensor"
    )

    assert obj1_classpath == obj1_importpath

    def test_callable():
        return

    obj3 = PythonSensor(python_callable=test_callable, task_id='python_sensor_test')
    obj3_classpath = SensorInstance.get_classpath(obj3)
    obj3_importpath = "airflow.sensors.python.PythonSensor"

    assert obj3_classpath == obj3_importpath
    json.dump(random_forest_metrics, f)

    print('RandomForestClassifier model showed better results')
    print(f'Save model and metrics into {output_model_path}')


with DAG(
    dag_id='train_validate',
    start_date=airflow.utils.dates.days_ago(1),
    schedule_interval='@weekly',
    max_active_runs=1,
) as dag:
    data_sensor = PythonSensor(
        task_id='data_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/data.csv'},
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode='poke',
    )

    target_sensor = PythonSensor(
        task_id='target_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/target.csv'},
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode='poke',
    )
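# Several DAGs in this section pass a templated path to a _wait_for_file
# callable defined elsewhere in their module. A minimal sketch of what such a
# callable might look like -- hypothetical, assuming it only checks whether the
# file exists (the exact signature varies between the DAG files shown here):
import os


def _wait_for_file(path: str) -> bool:
    """Return True once the given file exists, letting the PythonSensor succeed."""
    return os.path.exists(path)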
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval=None,
)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    return data_files and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        dag=dag1,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}", dag=dag1)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}", dag=dag1)
    trigger_create_metrics_dag = TriggerDagRunOperator(
        task_id=f"trigger_create_metrics_dag_supermarket_{supermarket_id}",
        trigger_dag_id="listing_6_04_dag02",
        dag=dag1,
    )
    wait >> copy >> process >> trigger_create_metrics_dag

compute_differences = DummyOperator(task_id="compute_differences", dag=dag2)
update_dashboard = DummyOperator(task_id="update_dashboard", dag=dag2)
    pd.DataFrame(np.array(preds).T, columns=['target']).to_csv(output_path, index=False)
    print(f'Predict test data and save into {output_path}')


with DAG(
    dag_id='predict',
    start_date=airflow.utils.dates.days_ago(1),
    schedule_interval='@daily',
    max_active_runs=1,
) as dag:
    data_sensor = PythonSensor(
        task_id='data_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/test.csv'},
        timeout=60,
        poke_interval=10,
        retries=100,
        mode='poke',
    )

    model_sensor = PythonSensor(
        task_id='model_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '{{ var.value.model_path }}'},  # Variable.get('model_path')
        timeout=60,
        poke_interval=10,
        retries=100,
        mode='poke',
    )
    return os.path.exists("/opt/airflow/data/wait.txt")


with DAG(
    "08_sensor",
    default_args=default_args,
    description="A simple tutorial DAG",
    schedule_interval=timedelta(days=1),
) as dag:
    t1 = BashOperator(
        task_id="touch_file_1",
        bash_command="touch /opt/airflow/data/1.txt",
    )

    wait = PythonSensor(
        task_id="wait_for_file",
        python_callable=_wait_for_file,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    t3 = BashOperator(
        task_id="touch_file_3",
        depends_on_past=True,
        bash_command="touch /opt/airflow/data/2.txt",
    )

    t1 >> wait >> t3
}

with DAG(
    dag_id="3_dag_inference",
    default_args=default_args,
    schedule_interval="@daily",
    start_date=days_ago(5),
) as dag:
    data_sensor = PythonSensor(
        task_id="data_sensor",
        python_callable=_wait_for_file,
        op_kwargs={
            "pre_folder_name": "raw",
            "folder_name": "{{ ds }}",
            "file_name": "data.csv",
        },
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    preprocess = DockerOperator(
        task_id="docker-airflow-inference-preprocess",
        image="airflow-train-preprocess",
        command="--input_dir /data/raw/{{ ds }} --output_dir /data/preprocessed/{{ ds }} --mode=inference",
        network_mode="bridge",
        do_xcom_push=False,
        volumes=[f"{Variable.get('data_folder_path')}:/data"],
    )
from pathlib import Path

import airflow.utils.dates
from airflow import DAG
from airflow.sensors.python import PythonSensor

dag = DAG(
    dag_id="listing_6_02",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 16 * * *",
    description="A batch workflow for ingesting supermarket promotions data.",
    default_args={"depends_on_past": True},
)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    return data_files and success_file.exists()


wait_for_supermarket_1 = PythonSensor(
    task_id="wait_for_supermarket_1",
    python_callable=_wait_for_supermarket,
    op_kwargs={"supermarket_id_": "supermarket1"},  # key must match the callable's parameter name
    dag=dag,
)
def _data_ready_for_predict():
    return os.path.exists("/opt/airflow/data/raw/{{ ds }}/data.csv")


with DAG(
    "data_ready_sensor",
    default_args=default_args,
    description="This DAG checks that data is ready",
    schedule_interval=timedelta(days=1),
) as dag:
    wait_data_ready_for_train = PythonSensor(
        task_id="data_ready_for_train",
        python_callable=_data_ready_for_train,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    wait_data_ready_for_predict = PythonSensor(
        task_id="data_ready_for_predict",
        python_callable=_data_ready_for_predict,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    t = BashOperator(
        task_id="touch_file",
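# Note: Jinja expressions like "{{ ds }}" are only rendered in templated
# operator fields (op_args/op_kwargs/templates_dict), not inside the body of a
# plain Python callable, so _data_ready_for_predict above checks the literal
# string. A minimal sketch of passing the templated path through op_kwargs
# instead -- hypothetical, following the pattern used by the other DAGs in this
# section:
wait_data_ready_for_predict_templated = PythonSensor(
    task_id="data_ready_for_predict_templated",
    python_callable=lambda path: os.path.exists(path),
    op_kwargs={"path": "/opt/airflow/data/raw/{{ ds }}/data.csv"},
    timeout=6000,
    poke_interval=10,
    mode="poke",
)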
    start_date=airflow.utils.dates.days_ago(14),
    schedule_interval="0 16 * * *",
    description="A batch workflow for ingesting supermarket promotions data, demonstrating the PythonSensor.",
)

create_metrics = DummyOperator(task_id="create_metrics", dag=dag)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    return data_files and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        timeout=600,
        mode="reschedule",
        dag=dag,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}", dag=dag)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}", dag=dag)
    wait >> copy >> process >> create_metrics
"retries": 1, "retry_delay": timedelta(minutes=1), } with DAG( dag_id="2_dag_train_model", default_args=default_args, schedule_interval="@daily", start_date=days_ago(5), ) as dag: file_sensor = PythonSensor( task_id="file_sensor", python_callable=_wait_for_file, op_kwargs={"folder_name": "{{ ds }}"}, timeout=6000, poke_interval=10, retries=100, mode="poke", ) preprocess = DockerOperator( task_id="docker-airflow-train-preprocess", image="airflow-train-preprocess", command= "--input_dir /data/raw/{{ ds }} --output_dir /data/preprocessed/{{ ds }} --mode=train", network_mode="bridge", do_xcom_push=False, volumes=[f"{Variable.get('data_folder_path')}:/data"]) split = DockerOperator(