Example #1
    def test_python_callable_keyword_arguments_are_templatized(self):
        """Test PythonSensor op_kwargs are templatized"""
        recorded_calls = []

        task = PythonSensor(
            task_id='python_sensor',
            timeout=0.01,
            poke_interval=0.3,
            # a Mock instance cannot be used as the callable, or the test fails with a
            # TypeError: Object of type Mock is not JSON serializable
            python_callable=build_recording_function(recorded_calls),
            op_kwargs={
                'an_int': 4,
                'a_date': date(2019, 1, 1),
                'a_templated_string': "dag {{dag.dag_id}} ran on {{ds}}."
            },
            dag=self.dag)

        self.dag.create_dagrun(run_type=DagRunType.MANUAL,
                               execution_date=DEFAULT_DATE,
                               start_date=DEFAULT_DATE,
                               state=State.RUNNING)
        with self.assertRaises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        # 2 calls: first: at start, second: before timeout
        self.assertEqual(2, len(recorded_calls))
        self._assert_calls_equal(
            recorded_calls[0],
            Call(an_int=4,
                 a_date=date(2019, 1, 1),
                 a_templated_string="dag {} ran on {}.".format(
                     self.dag.dag_id,
                     DEFAULT_DATE.date().isoformat())))
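This test (and Example #5 below) relies on helpers defined elsewhere in Airflow's test module: a Call record, build_recording_function, and the test-class method _assert_calls_equal. A minimal sketch of what they could look like, assuming they simply record and compare call arguments (the exact upstream implementations may differ):

class Call:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs


def build_recording_function(calls_collection):
    # A Mock cannot be used as the callable (it is not JSON serializable),
    # so this factory returns a plain function that records each invocation.
    def recording_function(*args, **kwargs):
        calls_collection.append(Call(*args, **kwargs))

    return recording_function


def _assert_calls_equal(self, first, second):
    # Compare only the explicitly passed arguments; Airflow injects the
    # context (ds, dag_run, task_instance, ...) into kwargs as well.
    assert first.args == second.args
    test_args = ["an_int", "a_date", "a_templated_string"]
    first.kwargs = {k: v for k, v in first.kwargs.items() if k in test_args}
    second.kwargs = {k: v for k, v in second.kwargs.items() if k in test_args}
    assert first.kwargs == second.kwargs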
Example #2
 def test_python_sensor_raise(self):
     op = PythonSensor(
         task_id='python_sensor_check_raise',
         python_callable=lambda: 1 / 0,
         dag=self.dag)
     with self.assertRaises(ZeroDivisionError):
         op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #3
 def test_python_sensor_true(self):
     op = PythonSensor(task_id='python_sensor_check_true',
                       python_callable=lambda: True,
                       dag=self.dag)
     op.run(start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)
Example #4
 def test_python_sensor_false(self):
     op = PythonSensor(
         task_id='python_sensor_check_false',
         timeout=0.01,
         poke_interval=0.01,
         python_callable=lambda: False,
         dag=self.dag)
     with self.assertRaises(AirflowSensorTimeout):
         op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #5
    def test_python_callable_arguments_are_templatized(self):
        """Test PythonSensor op_args are templatized"""
        recorded_calls = []

        # Create a named tuple and ensure it is still preserved
        # after the rendering is done
        Named = namedtuple('Named', ['var1', 'var2'])
        named_tuple = Named('{{ ds }}', 'unchanged')

        task = PythonSensor(
            task_id='python_sensor',
            timeout=0.01,
            poke_interval=0.3,
            # a Mock instance cannot be used as the callable, or the test fails with a
            # TypeError: Object of type Mock is not JSON serializable
            python_callable=build_recording_function(recorded_calls),
            op_args=[
                4,
                date(2019, 1, 1), "dag {{dag.dag_id}} ran on {{ds}}.",
                named_tuple
            ],
            dag=self.dag,
        )

        self.dag.create_dagrun(
            run_type=DagRunType.MANUAL,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            state=State.RUNNING,
        )
        with pytest.raises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        ds_templated = DEFAULT_DATE.date().isoformat()
        # 2 calls: first: at start, second: before timeout
        assert 2 == len(recorded_calls)
        self._assert_calls_equal(
            recorded_calls[0],
            Call(
                4,
                date(2019, 1, 1),
                f"dag {self.dag.dag_id} ran on {ds_templated}.",
                Named(ds_templated, 'unchanged'),
            ),
        )
Example #6
    def test_clear_task_instances_with_task_reschedule(self):
        """Test that TaskReschedules are deleted correctly when TaskInstances are cleared"""

        with DAG(
                'test_clear_task_instances_with_task_reschedule',
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE + datetime.timedelta(days=10),
        ) as dag:
            task0 = PythonSensor(task_id='0',
                                 python_callable=lambda: False,
                                 mode="reschedule")
            task1 = PythonSensor(task_id='1',
                                 python_callable=lambda: False,
                                 mode="reschedule")

        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)

        dag.create_dagrun(
            execution_date=ti0.execution_date,
            state=State.RUNNING,
            run_type=DagRunType.SCHEDULED,
        )

        ti0.run()
        ti1.run()

        with create_session() as session:

            def count_task_reschedule(task_id):
                return (session.query(TaskReschedule).filter(
                    TaskReschedule.dag_id == dag.dag_id,
                    TaskReschedule.task_id == task_id,
                    TaskReschedule.execution_date == DEFAULT_DATE,
                    TaskReschedule.try_number == 1,
                ).count())

            assert count_task_reschedule(ti0.task_id) == 1
            assert count_task_reschedule(ti1.task_id) == 1
            qry = session.query(TI).filter(TI.dag_id == dag.dag_id,
                                           TI.task_id == ti0.task_id).all()
            clear_task_instances(qry, session, dag=dag)
            assert count_task_reschedule(ti0.task_id) == 0
            assert count_task_reschedule(ti1.task_id) == 1
Example #7
    def test_get_classpath(self):
        # Test the classpath in/out airflow
        obj1 = NamedHivePartitionSensor(partition_names=['test_partition'], task_id='meta_partition_test_1')
        obj1_classpath = SensorInstance.get_classpath(obj1)
        obj1_importpath = (
            "airflow.providers.apache.hive.sensors.named_hive_partition.NamedHivePartitionSensor"
        )

        assert obj1_classpath == obj1_importpath

        def test_callable():
            return

        obj3 = PythonSensor(python_callable=test_callable, task_id='python_sensor_test')
        obj3_classpath = SensorInstance.get_classpath(obj3)
        obj3_importpath = "airflow.sensors.python.PythonSensor"

        assert obj3_classpath == obj3_importpath
Example #8
            json.dump(random_forest_metrics, f)
        print('RandomForestClassifier model showed better results')
    print(f'Save model and metrics into {output_model_path}')


with DAG(
    dag_id='train_validate',
    start_date=airflow.utils.dates.days_ago(1),
    schedule_interval='@weekly',
    max_active_runs=1,
) as dag:
    data_sensor = PythonSensor(
        task_id='data_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/data.csv'},
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode='poke',
    )

    target_sensor = PythonSensor(
        task_id='target_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/target.csv'},
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode='poke',
    )
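Both sensors poke _wait_for_file, which is not shown in this excerpt; a minimal sketch, assuming it only checks that the rendered path exists:

import os


def _wait_for_file(path: str) -> bool:
    # Return True once the file rendered from the "{{ ds }}" template exists,
    # so the PythonSensor stops poking and downstream tasks can start.
    return os.path.exists(path)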
Example #9
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval=None,
)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    # glob() returns a generator, which is always truthy, so check it actually yields files
    return any(data_files) and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        dag=dag1,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}",
                         dag=dag1)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}",
                            dag=dag1)
    trigger_create_metrics_dag = TriggerDagRunOperator(
        task_id=f"trigger_create_metrics_dag_supermarket_{supermarket_id}",
        trigger_dag_id="listing_6_04_dag02",
        dag=dag1,
    )
    wait >> copy >> process >> trigger_create_metrics_dag

compute_differences = DummyOperator(task_id="compute_differences", dag=dag2)
update_dashboard = DummyOperator(task_id="update_dashboard", dag=dag2)
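The excerpt is truncated: dag1 and dag2 are defined above the shown lines. A rough reconstruction under stated assumptions; only the id "listing_6_04_dag02" is confirmed by the TriggerDagRunOperator above, the other id and dag1's schedule are hypothetical:

import airflow.utils.dates
from airflow import DAG

dag1 = DAG(
    dag_id="listing_6_04_dag01",  # hypothetical id for the ingest DAG
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 16 * * *",  # assumed batch schedule
)
dag2 = DAG(
    dag_id="listing_6_04_dag02",  # must match trigger_dag_id above
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval=None,  # the truncated constructor tail above shows schedule_interval=None
)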
Example #10
    pd.DataFrame(np.array(preds).T, columns=['target']).to_csv(output_path,
                                                               index=False)
    print(f'Predict test data and save into {output_path}')


with DAG(
        dag_id='predict',
        start_date=airflow.utils.dates.days_ago(1),
        schedule_interval='@daily',
        max_active_runs=1,
) as dag:
    data_sensor = PythonSensor(
        task_id='data_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '/opt/airflow/data/raw/{{ ds }}/test.csv'},
        timeout=60,
        poke_interval=10,
        retries=100,
        mode='poke',
    )

    model_sensor = PythonSensor(
        task_id='model_sensor',
        python_callable=_wait_for_file,
        op_kwargs={'path': '{{ var.value.model_path }}'},  # Variable.get('model_path')
        timeout=60,
        poke_interval=10,
        retries=100,
        mode='poke',
    )
Example #11
def _wait_for_file():  # assumed zero-argument signature; only the return line appears in the excerpt
    return os.path.exists("/opt/airflow/data/wait.txt")


with DAG(
        "08_sensor",
        default_args=default_args,
        description="A simple tutorial DAG",
        schedule_interval=timedelta(days=1),
) as dag:
    t1 = BashOperator(
        task_id="touch_file_1",
        bash_command="touch /opt/airflow/data/1.txt",
    )

    wait = PythonSensor(
        task_id="wait_for_file",
        python_callable=_wait_for_file,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    t3 = BashOperator(
        task_id="touch_file_3",
        depends_on_past=True,
        bash_command="touch /opt/airflow/data/2.txt",
    )

    t1 >> wait >> t3
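Note that nothing in this DAG creates /opt/airflow/data/wait.txt (t1 only touches 1.txt), so the sensor succeeds only once some external process writes that file; otherwise it keeps poking every 10 seconds until the 6000-second timeout.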
Example #12
}

with DAG(
        dag_id="3_dag_inference",
        default_args=default_args,
        schedule_interval="@daily",
        start_date=days_ago(5),
) as dag:

    data_sensor = PythonSensor(
        task_id="data_sensor",
        python_callable=_wait_for_file,
        op_kwargs={
            "pre_folder_name": "raw",
            "folder_name": "{{ ds }}",
            "file_name": "data.csv"
        },
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    preprocess = DockerOperator(
        task_id="docker-airflow-inference-preprocess",
        image="airflow-train-preprocess",
        command=
        "--input_dir /data/raw/{{ ds }} --output_dir /data/preprocessed/{{ ds }} --mode=inference",
        network_mode="bridge",
        do_xcom_push=False,
        volumes=[f"{Variable.get('data_folder_path')}:/data"])
Example #13
from pathlib import Path

import airflow.utils.dates
from airflow import DAG
from airflow.sensors.python import PythonSensor

dag = DAG(
    dag_id="listing_6_02",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 16 * * *",
    description="A batch workflow for ingesting supermarket promotions data.",
    default_args={"depends_on_past": True},
)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    # glob() returns a generator, which is always truthy, so check it actually yields files
    return any(data_files) and success_file.exists()


wait_for_supermarket_1 = PythonSensor(
    task_id="wait_for_supermarket_1",
    python_callable=_wait_for_supermarket,
    op_kwargs={"supermarket_id": "supermarket1"},
    dag=dag,
)

Example #14
def _data_ready_for_predict():
    # NOTE: "{{ ds }}" is a plain string here; Jinja templates are only rendered in
    # templated operator fields such as op_kwargs, not inside the callable's body.
    return os.path.exists("/opt/airflow/data/raw/{{ ds }}/data.csv")


with DAG(
        "data_ready_sensor",
        default_args=default_args,
        description="This DAG checks that data is ready",
        schedule_interval=timedelta(days=1),
) as dag:
    wait_data_ready_for_train = PythonSensor(
        task_id="data_ready_for_train",
        python_callable=_data_ready_for_train,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    wait_data_ready_for_predict = PythonSensor(
        task_id="data_ready_for_predict",
        python_callable=_data_ready_for_predict,
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    t = BashOperator(
        task_id="touch_file",
Example #15
    start_date=airflow.utils.dates.days_ago(14),
    schedule_interval="0 16 * * *",
    description=
    "A batch workflow for ingesting supermarket promotions data, demonstrating the PythonSensor.",
)

create_metrics = DummyOperator(task_id="create_metrics", dag=dag)


def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    # glob() returns a generator, which is always truthy, so check it actually yields files
    return any(data_files) and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        timeout=600,
        mode="reschedule",
        dag=dag,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}",
                         dag=dag)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}",
                            dag=dag)
    wait >> copy >> process >> create_metrics
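Unlike the poke-mode sensors in the other examples, these sensors use mode="reschedule": between pokes the task releases its worker slot and is rescheduled, so long waits do not block a worker, at the cost of some scheduling latency. With mode="poke" the slot stays occupied for the entire wait.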
Example #16
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
}

with DAG(
        dag_id="2_dag_train_model",
        default_args=default_args,
        schedule_interval="@daily",
        start_date=days_ago(5),
) as dag:

    file_sensor = PythonSensor(
        task_id="file_sensor",
        python_callable=_wait_for_file,
        op_kwargs={"folder_name": "{{ ds }}"},
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    preprocess = DockerOperator(
        task_id="docker-airflow-train-preprocess",
        image="airflow-train-preprocess",
        command=
        "--input_dir /data/raw/{{ ds }} --output_dir /data/preprocessed/{{ ds }} --mode=train",
        network_mode="bridge",
        do_xcom_push=False,
        volumes=[f"{Variable.get('data_folder_path')}:/data"])

    split = DockerOperator(