예제 #1
0
    def test_externally_triggered_dagrun(self):
        TI = TaskInstance

        # Create the dagrun between two "scheduled" execution dates of the DAG
        execution_date = DEFAULT_DATE + timedelta(days=2)
        execution_ds = execution_date.strftime('%Y-%m-%d')
        execution_ds_nodash = execution_ds.replace('-', '')

        dag = DAG(
            TEST_DAG_ID,
            default_args=self.args,
            schedule_interval=timedelta(weeks=1),
            start_date=DEFAULT_DATE)
        task = DummyOperator(task_id='test_externally_triggered_dag_context',
                             dag=dag)
        dag.create_dagrun(run_id=DagRun.id_for_date(execution_date),
                          execution_date=execution_date,
                          state=State.RUNNING,
                          external_trigger=True)
        task.run(
            start_date=execution_date, end_date=execution_date)

        ti = TI(task=task, execution_date=execution_date)
        context = ti.get_template_context()

        # next_ds/prev_ds should be the execution date for manually triggered runs
        self.assertEqual(context['next_ds'], execution_ds)
        self.assertEqual(context['next_ds_nodash'], execution_ds_nodash)

        self.assertEqual(context['prev_ds'], execution_ds)
        self.assertEqual(context['prev_ds_nodash'], execution_ds_nodash)
예제 #2
0
    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        assert exec_date_to_latest_state == {
            datetime.datetime(2016, 1, 1): 'success',
            datetime.datetime(2016, 1, 1, 12): 'success',
            datetime.datetime(2016, 1, 2): 'success',
        }

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        assert exec_date_to_downstream_state == {
            datetime.datetime(2016, 1, 1): 'skipped',
            datetime.datetime(2016, 1, 1, 12): 'skipped',
            datetime.datetime(2016, 1, 2): 'success',
        }
    def test_depends_on_past(self):
        dag = DAG(
            dag_id='test_depends_on_past',
            start_date=DEFAULT_DATE
        )

        task = DummyOperator(
            task_id='test_dop_task',
            dag=dag,
            depends_on_past=True,
        )
        dag.clear()

        run_date = task.start_date + datetime.timedelta(days=5)

        ti = TI(task, run_date)

        # depends_on_past prevents the run
        task.run(start_date=run_date, end_date=run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        # ignore first depends_on_past to allow the run
        task.run(
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            datetime.datetime(2016, 1, 1): 'success',
            datetime.datetime(2016, 1, 1, 12): 'success',
            datetime.datetime(2016, 1, 2): 'success', },
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            datetime.datetime(2016, 1, 1): 'skipped',
            datetime.datetime(2016, 1, 1, 12): 'skipped',
            datetime.datetime(2016, 1, 2): 'success',},
            exec_date_to_downstream_state)
    def test_with_dag_run(self):
        value = False
        dag = DAG('shortcircuit_operator_test_with_dag_run',
                  default_args={
                      'owner': 'airflow',
                      'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        logging.error("Tasks {}".format(dag.tasks))
        dr = dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise Exception

        value = True
        dag.clear()
        dr.verify_integrity()
        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEqual(ti.state, State.NONE)
            else:
                raise Exception
예제 #6
0
    def test_with_dag_run(self):
        value = False
        dag = DAG('shortcircuit_operator_test_with_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        logging.error("Tasks {}".format(dag.tasks))
        dr = dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        value = True
        dag.clear()
        dr.verify_integrity()
        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise
예제 #7
0
    def test_clear_skipped_downstream_task(self):
        """
        After a downstream task is skipped by ShortCircuitOperator, clearing the skipped task
        should not cause it to be executed.
        """
        dag = DAG('shortcircuit_clear_skipped_downstream_task',
                  default_args={
                      'owner': 'airflow',
                      'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: False)
        downstream = DummyOperator(task_id='downstream', dag=dag)

        short_op >> downstream

        dag.clear()

        dr = dag.create_dagrun(
            run_type=DagRunType.MANUAL,
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        downstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'downstream':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')

        # Clear downstream
        with create_session() as session:
            clear_task_instances([t for t in tis if t.task_id == "downstream"],
                                 session=session,
                                 dag=dag)

        # Run downstream again
        downstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        # Check if the states are correct.
        for ti in dr.get_task_instances():
            if ti.task_id == 'make_choice':
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == 'downstream':
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f'Invalid task id {ti.task_id} found!')
    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=latest_instances)

        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=downstream_instances)

        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=downstream_instances)
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
예제 #9
0
    def test_skipping_non_latest(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)
        downstream_task3 = DummyOperator(
            task_id='downstream_3',
            trigger_rule=TriggerRule.NONE_FAILED,
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)
        downstream_task3.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="scheduled__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): None,
            timezone.datetime(2016, 1, 1, 12): None,
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_3')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
예제 #10
0
    def test_not_skipping_external(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
예제 #11
0
def test_marquez_dag(mock_get_or_create_marquez_client,
                     mock_uuid,
                     clear_db_airflow_dags,
                     session=None):

    dag = DAG(DAG_ID,
              schedule_interval='@daily',
              default_args=DAG_DEFAULT_ARGS,
              description=DAG_DESCRIPTION)
    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_marquez_client.return_value = mock_marquez_client
    run_id_completed = "my-test_marquez_dag-uuid-completed"
    run_id_failed = "my-test_marquez_dag-uuid-failed"
    mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(task_id=TASK_ID_FAILED, dag=dag)
    failed_task_location = get_location(task_will_complete.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(run_id=DAG_RUN_ID,
                               execution_date=DEFAULT_DATE,
                               state=State.RUNNING)

    # Assert namespace meta call
    mock_marquez_client.create_namespace.assert_called_once_with(
        DAG_NAMESPACE, DAG_OWNER)

    # Assert source and dataset meta calls
    mock_marquez_client.create_source.assert_not_called()
    mock_marquez_client.create_dataset.assert_not_called()

    # Assert job meta calls
    create_job_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  job_type=JobType.BATCH,
                  location=completed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  job_type=JobType.BATCH,
                  location=failed_task_location,
                  input_dataset=None,
                  output_dataset=None,
                  context=mock.ANY,
                  description=DAG_DESCRIPTION,
                  namespace_name=DAG_NAMESPACE,
                  run_id=None)
    ]
    log.info(
        f"{ [name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.create_job.assert_has_calls(create_job_calls)

    # Assert job run meta calls
    create_job_run_calls = [
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_COMPLETED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE),
        mock.call(job_name=f"{DAG_ID}.{TASK_ID_FAILED}",
                  run_id=mock.ANY,
                  run_args=DAG_RUN_ARGS,
                  nominal_start_time=mock.ANY,
                  nominal_end_time=mock.ANY,
                  namespace_name=DAG_NAMESPACE)
    ]
    mock_marquez_client.create_job_run.assert_has_calls(create_job_run_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    dag.handle_callback(dagrun, success=True, session=session)

    # Assert start run meta calls
    start_job_run_calls = [
        mock.call(run_id_completed, mock.ANY),
        mock.call(run_id_failed, mock.ANY)
    ]
    mock_marquez_client.mark_job_run_as_started.assert_has_calls(
        start_job_run_calls)

    mock_marquez_client.mark_job_run_as_completed.assert_called_once_with(
        run_id=run_id_completed)

    # When a task run completes, the task outputs are also updated in order
    # to link a job version (=task version) to a dataset version.
    # Using a DummyOperator, no outputs exists, so assert that the create
    # dataset call is not invoked.
    mock_marquez_client.create_dataset.assert_not_called()

    dag.handle_callback(dagrun, success=False, session=session)
    mock_marquez_client.mark_job_run_as_failed.assert_called_once_with(
        run_id=run_id_failed)

    # Assert an attempt to version the outputs of a task is not made when
    # a task fails
    mock_marquez_client.create_dataset.assert_not_called()
예제 #12
0
def test_marquez_dag(job_id_mapping, mock_get_or_create_openlineage_client,
                     clear_db_airflow_dags, session=None):

    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )
    # (1) Mock the marquez client method calls
    mock_marquez_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_marquez_client
    run_id_completed = f"{DAG_RUN_ID}.{TASK_ID_COMPLETED}"
    run_id_failed = f"{DAG_RUN_ID}.{TASK_ID_FAILED}"
    # mock_uuid.side_effect = [run_id_completed, run_id_failed]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(
        task_id=TASK_ID_FAILED,
        dag=dag
    )
    failed_task_location = get_location(task_will_complete.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Assert emit calls
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    emit_calls = [
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_completed, {"nominalTime": NominalTimeRunFacet(start_time, end_time)}),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id_failed, {"nominalTime": NominalTimeRunFacet(start_time, end_time)}),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}", {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", failed_task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    log.info(
        f"{ [name for name, args, kwargs in mock_marquez_client.mock_calls]}")
    mock_marquez_client.emit.assert_has_calls(emit_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    job_id_mapping.pop.side_effect = [run_id_completed, run_id_failed]

    dag.handle_callback(dagrun, success=False, session=session)

    emit_calls += [
        mock.call(RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=mock.ANY,
            run=Run(run_id_completed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_COMPLETED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        )),
        mock.call(RunEvent(
            eventType=RunState.FAIL,
            eventTime=mock.ANY,
            run=Run(run_id_failed),
            job=Job("default", f"{DAG_ID}.{TASK_ID_FAILED}"),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))
    ]
    mock_marquez_client.emit.assert_has_calls(emit_calls)
    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
예제 #14
0
class ShortCircuitOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise

        session.close()

    def test_with_dag_run(self):
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()
        dr.verify_integrity()
        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise
예제 #15
0
파일: sensors.py 프로젝트: fossabot/airflow
    def test_external_task_sensor_fn_multiple_execution_dates(self):
        bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
    then
        exit 1
fi
exit 0
"""
        dag_external_id = TEST_DAG_ID + '_external'
        dag_external = DAG(dag_external_id,
                           default_args=self.args,
                           schedule_interval=timedelta(seconds=1))
        task_external_with_failure = BashOperator(
            task_id="task_external_with_failure",
            bash_command=bash_command_code,
            retries=0,
            dag=dag_external)
        task_external_without_failure = DummyOperator(
            task_id="task_external_without_failure",
            retries=0,
            dag=dag_external)

        task_external_without_failure.run(start_date=DEFAULT_DATE,
                                          end_date=DEFAULT_DATE +
                                          timedelta(seconds=1),
                                          ignore_ti_state=True)

        session = settings.Session()
        TI = TaskInstance
        try:
            task_external_with_failure.run(start_date=DEFAULT_DATE,
                                           end_date=DEFAULT_DATE +
                                           timedelta(seconds=1),
                                           ignore_ti_state=True)
            # The test_with_failure task is excepted to fail
            # once per minute (the run on the first second of
            # each minute).
        except Exception as e:
            failed_tis = session.query(TI).filter(
                TI.dag_id == dag_external_id, TI.state == State.FAILED,
                TI.execution_date == DEFAULT_DATE +
                timedelta(seconds=1)).all()
            if (len(failed_tis) == 1
                    and failed_tis[0].task_id == 'task_external_with_failure'):
                pass
            else:
                raise e

        dag_id = TEST_DAG_ID
        dag = DAG(dag_id,
                  default_args=self.args,
                  schedule_interval=timedelta(minutes=1))
        task_without_failure = ExternalTaskSensor(
            task_id='task_without_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_without_failure',
            execution_date_fn=lambda dt:
            [dt + timedelta(seconds=i) for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)
        task_with_failure = ExternalTaskSensor(
            task_id='task_with_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_with_failure',
            execution_date_fn=lambda dt:
            [dt + timedelta(seconds=i) for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)

        task_without_failure.run(start_date=DEFAULT_DATE,
                                 end_date=DEFAULT_DATE,
                                 ignore_ti_state=True)

        with self.assertRaises(AirflowSensorTimeout):
            task_with_failure.run(start_date=DEFAULT_DATE,
                                  end_date=DEFAULT_DATE,
                                  ignore_ti_state=True)
예제 #16
0
class ShortCircuitOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE
                       },
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(
            task_id='make_choice',
            dag=self.dag,
            python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(TI.dag_id == self.dag.dag_id,
                                       TI.execution_date == DEFAULT_DATE)

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise

        session.close()

    def test_with_dag_run(self):
        self.value = False
        dr = self.dag.create_dagrun(run_id="manual__",
                                    start_date=datetime.datetime.now(),
                                    execution_date=DEFAULT_DATE,
                                    state=State.RUNNING)

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise
    def test_external_task_sensor_fn_multiple_execution_dates(self):
        bash_command_code = """
{% set s=execution_date.time().second %}
echo "second is {{ s }}"
if [[ $(( {{ s }} % 60 )) == 1 ]]
    then
        exit 1
fi
exit 0
"""
        dag_external_id = TEST_DAG_ID + '_external'
        dag_external = DAG(
            dag_external_id,
            default_args=self.args,
            schedule_interval=timedelta(seconds=1))
        task_external_with_failure = BashOperator(
            task_id="task_external_with_failure",
            bash_command=bash_command_code,
            retries=0,
            dag=dag_external)
        task_external_without_failure = DummyOperator(
            task_id="task_external_without_failure",
            retries=0,
            dag=dag_external)

        task_external_without_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + timedelta(seconds=1),
            ignore_ti_state=True)

        session = settings.Session()
        TI = TaskInstance
        try:
            task_external_with_failure.run(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE + timedelta(seconds=1),
                ignore_ti_state=True)
            # The test_with_failure task is excepted to fail
            # once per minute (the run on the first second of
            # each minute).
        except Exception as e:
            failed_tis = session.query(TI).filter(
                TI.dag_id == dag_external_id,
                TI.state == State.FAILED,
                TI.execution_date == DEFAULT_DATE + timedelta(seconds=1)).all()
            if len(failed_tis) == 1 and \
               failed_tis[0].task_id == 'task_external_with_failure':
                pass
            else:
                raise e

        dag_id = TEST_DAG_ID
        dag = DAG(
            dag_id,
            default_args=self.args,
            schedule_interval=timedelta(minutes=1))
        task_without_failure = ExternalTaskSensor(
            task_id='task_without_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_without_failure',
            execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                          for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)
        task_with_failure = ExternalTaskSensor(
            task_id='task_with_failure',
            external_dag_id=dag_external_id,
            external_task_id='task_external_with_failure',
            execution_date_fn=lambda dt: [dt + timedelta(seconds=i)
                                          for i in range(2)],
            allowed_states=['success'],
            retries=0,
            timeout=1,
            poke_interval=1,
            dag=dag)

        task_without_failure.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True)

        with self.assertRaises(AirflowSensorTimeout):
            task_with_failure.run(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_ti_state=True)
 def test_dummy_operator(self):
     d = DummyOperator(task_id=TEST_TASK_ID, dag=self.dag)
     d.run(start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           ignore_ti_state=True)