Exemple #1
0
    def test_dag_with_system_exit(self):
        """
        Test to check that a DAG with a sys.exit() doesn't break the scheduler.

        The scheduler is run once against the ``dags_with_system_exit``
        directory; afterwards exactly one task instance is expected to exist
        for ``exit_test_dag``.
        """

        dag_id = 'exit_test_dag'
        dag_ids = [dag_id]
        dag_directory = os.path.join(models.DAGS_FOLDER,
                                     "..",
                                     "dags_with_system_exit")
        dag_file = os.path.join(dag_directory,
                                'b_test_scheduler_dags.py')

        # Clear every DAG under test. A dedicated loop variable keeps
        # ``dag_id`` (used in the final query) from being rebound.
        dagbag = DagBag(dag_folder=dag_file)
        for current_dag_id in dag_ids:
            dagbag.get_dag(current_dag_id).clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 subdir=dag_directory,
                                 num_runs=1,
                                 **self.default_scheduler_args)
        scheduler.run()

        session = settings.Session()
        try:
            ti_count = len(
                session.query(TI).filter(TI.dag_id == dag_id).all())
        finally:
            # Release the session even if the query raises.
            session.close()
        self.assertEqual(ti_count, 1)
Exemple #2
0
 def do_schedule(function, function2):
     """Run a single scheduler loop with a zero heartrate.

     Both positional arguments are accepted but not used in the body.
     """
     # ``executor`` is not defined in this function; presumably it comes
     # from the enclosing scope -- verify against the surrounding test.
     job = SchedulerJob(num_runs=1, executor=executor)
     job.heartrate = 0
     job.run()
    def test_scheduler_do_not_schedule_removed_task(self):
        """
        Schedule a dag run for a DAG with one task, rebuild the DAG without
        that task, and check that processing puts nothing on the queue.
        """
        dag_name = 'test_scheduler_do_not_schedule_removed_task'
        dag = DAG(dag_id=dag_name, start_date=DEFAULT_DATE)
        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dag_run = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        # Same dag_id, but this time no operator is attached.
        dag = DAG(dag_id=dag_name, start_date=DEFAULT_DATE)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_not_called()
Exemple #4
0
    def test_scheduler_verify_max_active_runs(self):
        """
        Test that no further dagrun is created once max_active_runs is reached
        """
        dag = DAG(dag_id='test_scheduler_verify_max_active_runs',
                  start_date=DEFAULT_DATE)
        dag.max_active_runs = 1

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        # The first run is expected to be created ...
        first_run = scheduler.create_dag_run(dag)
        self.assertIsNotNone(first_run)

        # ... while a second attempt must yield nothing.
        second_run = scheduler.create_dag_run(dag)
        self.assertIsNone(second_run)
    def test_scheduler_process_execute_task(self):
        """
        Test that processing a dag puts its task on the queue
        """
        dag = DAG(dag_id='test_scheduler_process_execute_task',
                  start_date=DEFAULT_DATE)
        dummy_task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dag_run = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        # The queue receives a (task instance key, None) pair.
        expected_key = (dag.dag_id, dummy_task.task_id, DEFAULT_DATE)
        queue.put.assert_called_with((expected_key, None))

        # NOTE(review): this only checks for a non-None result; if
        # get_task_instances returns a (possibly empty) list the check can
        # never fail -- confirm whether a stricter assertion is wanted.
        scheduled = dag_run.get_task_instances(state=State.SCHEDULED)
        self.assertIsNotNone(scheduled)
Exemple #6
0
    def test_scheduler_process_task_instances(self):
        """
        Test that _process_task_instances appends the expected task instance
        key to the queue.
        """
        # NOTE(review): this dag_id does not match the test name -- confirm
        # whether that is intended.
        dag = DAG(dag_id='test_scheduler_process_execute_task',
                  start_date=DEFAULT_DATE)
        dummy_task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dag_run = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dag_run)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        expected_key = (dag.dag_id, dummy_task.task_id, DEFAULT_DATE)
        queue.append.assert_called_with(expected_key)
Exemple #7
0
    def test_scheduler_do_not_schedule_too_early(self):
        """
        Test that a DAG whose start_date is in the year 2200 gets no dag run
        and that nothing is appended to the queue for it.
        """
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_too_early',
            start_date=datetime.datetime(2200, 1, 1))
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        # _process_task_instances reports through ``queue.append``; asserting
        # on ``queue.put`` of a Mock could never fail.
        queue.append.assert_not_called()
Exemple #8
0
    def test_scheduler_does_not_run_excluded(self):
        """
        Test that task instances set to State.EXCLUDED are not appended to
        the queue.
        """
        dag = DAG(
            dag_id='test_scheduler_does_not_run_excluded',
            start_date=DEFAULT_DATE)
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        # Mark every task instance of the new run as excluded.
        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.EXCLUDED

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        # _process_task_instances reports through ``queue.append``; asserting
        # on ``queue.put`` of a Mock could never fail.
        queue.append.assert_not_called()
Exemple #9
0
    def test_scheduler_do_not_schedule_too_early(self):
        """
        Test that a DAG whose start_date is in the year 2200 gets no dag run
        and that nothing is appended to the queue for it.
        """
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_too_early',
            start_date=datetime.datetime(2200, 1, 1))
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        # _process_task_instances reports through ``queue.append``; asserting
        # on ``queue.put`` of a Mock could never fail.
        queue.append.assert_not_called()
Exemple #10
0
    def test_scheduler_process_execute_task(self):
        """
        Test that processing a dag puts its task on the queue
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE,
        )
        task = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
        )

        db = settings.Session()
        db.merge(DagModel(dag_id=dag.dag_id))
        db.commit()
        db.close()

        scheduler = SchedulerJob()
        dag.clear()
        dag_run = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        # The queue receives a (task instance key, None) pair.
        ti_key = (dag.dag_id, task.task_id, DEFAULT_DATE)
        queue.put.assert_called_with((ti_key, None))

        # NOTE(review): this only checks for a non-None result; if
        # get_task_instances returns a (possibly empty) list the check can
        # never fail -- confirm whether a stricter assertion is wanted.
        scheduled_tis = dag_run.get_task_instances(state=State.SCHEDULED)
        self.assertIsNotNone(scheduled_tis)
Exemple #11
0
    def test_scheduler_verify_max_active_runs(self):
        """
        Test that no further dagrun is created once max_active_runs is reached
        """
        dag = DAG(dag_id='test_scheduler_verify_max_active_runs',
                  start_date=DEFAULT_DATE)
        dag.max_active_runs = 1

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        db = settings.Session()
        db.merge(DagModel(dag_id=dag.dag_id))
        db.commit()
        db.close()

        scheduler = SchedulerJob()
        dag.clear()

        # The first run is expected to be created ...
        first_run = scheduler.create_dag_run(dag)
        self.assertIsNotNone(first_run)

        # ... while a second attempt must yield nothing.
        second_run = scheduler.create_dag_run(dag)
        self.assertIsNone(second_run)
Exemple #12
0
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        pool_name = 'test_queued_pool'

        # Create the pool only when no row with that name exists yet.
        session = settings.Session()
        existing_pool = (
            session.query(Pool).filter(Pool.pool == pool_name).first()
        )
        if not existing_pool:
            session.merge(Pool(pool=pool_name, slots=5))
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        job = SchedulerJob(dag_id, num_runs=10)
        job.run()

        # The first task of the DAG is expected to end up FAILED.
        first_task = dag.tasks[0]
        task_instance = TI(first_task, dag.start_date)
        task_instance.refresh_from_db()
        self.assertEqual(task_instance.state, State.FAILED)

        dag.clear()
Exemple #13
0
    def test_scheduler_fail_dagrun_timeout(self):
        """
        Test if a dagrun will be set to failed once it has timed out
        """
        dag = DAG(
            dag_id='test_scheduler_fail_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        try:
            orm_dag = DagModel(dag_id=dag.dag_id)
            session.merge(orm_dag)
            session.commit()

            scheduler = SchedulerJob()
            dag.clear()

            dr = scheduler.create_dag_run(dag)
            self.assertIsNotNone(dr)
            # Backdate the run by a day, far beyond the 60 second timeout.
            dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
            session.merge(dr)
            session.commit()

            dr2 = scheduler.create_dag_run(dag)
            self.assertIsNotNone(dr2)

            dr.refresh_from_db(session=session)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(dr.state, State.FAILED)
        finally:
            # Close the session even when an assertion fails.
            session.close()
    def test_scheduler_process_check_heartrate(self):
        """
        Test that process_dag honors the heartrate: with last_scheduler_run
        set to now and a heartrate of 1000, nothing may be put on the queue.
        """
        dag = DAG(dag_id='test_scheduler_process_check_heartrate',
                  start_date=DEFAULT_DATE)
        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        db = settings.Session()
        dag_model = DagModel(dag_id=dag.dag_id)
        dag_model.last_scheduler_run = datetime.datetime.now()
        db.merge(dag_model)
        db.commit()
        db.close()

        scheduler = SchedulerJob()
        scheduler.heartrate = 1000

        dag.clear()

        dag_run = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_not_called()
Exemple #15
0
    def test_dag_with_system_exit(self):
        """
        Test to check that a DAG with a sys.exit() doesn't break the scheduler.

        The scheduler is run once against the ``dags_with_system_exit``
        directory; afterwards exactly one task instance is expected to exist
        for ``exit_test_dag``.
        """

        dag_id = 'exit_test_dag'
        dag_ids = [dag_id]
        dag_directory = os.path.join(models.DAGS_FOLDER,
                                     "..",
                                     "dags_with_system_exit")
        dag_file = os.path.join(dag_directory,
                                'b_test_scheduler_dags.py')

        # Clear every DAG under test. A dedicated loop variable keeps
        # ``dag_id`` (used in the final query) from being rebound.
        dagbag = DagBag(dag_folder=dag_file)
        for current_dag_id in dag_ids:
            dagbag.get_dag(current_dag_id).clear()

        scheduler = SchedulerJob(dag_ids=dag_ids,
                                 subdir=dag_directory,
                                 num_runs=1,
                                 **self.default_scheduler_args)
        scheduler.run()

        session = settings.Session()
        try:
            ti_count = len(
                session.query(TI).filter(TI.dag_id == dag_id).all())
        finally:
            # Release the session even if the query raises.
            session.close()
        self.assertEqual(ti_count, 1)
Exemple #16
0
    def test_scheduler_do_not_run_finished(self):
        """
        Test that task instances already in State.SUCCESS are not appended
        to the queue again.
        """
        dag = DAG(dag_id='test_scheduler_do_not_run_finished',
                  start_date=DEFAULT_DATE)
        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        # Mark every task instance of the new run as succeeded.
        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.SUCCESS

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        # _process_task_instances reports through ``queue.append``; asserting
        # on ``queue.put`` of a Mock could never fail.
        queue.append.assert_not_called()
Exemple #17
0
    def test_scheduler_process_task_instances(self):
        """
        Test that _process_task_instances appends the expected task instance
        key to the queue.
        """
        # NOTE(review): this dag_id does not match the test name -- confirm
        # whether that is intended.
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE,
        )
        task = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
        )

        db = settings.Session()
        db.merge(DagModel(dag_id=dag.dag_id))
        db.commit()
        db.close()

        scheduler = SchedulerJob()
        dag.clear()
        dag_run = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dag_run)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        ti_key = (dag.dag_id, task.task_id, DEFAULT_DATE)
        queue.append.assert_called_with(ti_key)
Exemple #18
0
    def test_scheduler_add_new_task(self):
        """
        Test if a task instance will be added if the dag is updated
        """
        dag = DAG(dag_id='test_scheduler_add_new_task',
                  start_date=DEFAULT_DATE)

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        # One task in the DAG, so one task instance in the new run.
        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 1)

        # Attach a second task after the run has been created.
        DummyOperator(task_id='dummy2', dag=dag, owner='airflow')

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        # Processing is expected to have added an instance for the new task.
        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 2)
Exemple #19
0
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        pool_name = 'test_queued_pool'

        # Create the pool only when no row with that name exists yet.
        db = settings.Session()
        existing_pool = db.query(Pool).filter(Pool.pool == pool_name).first()
        if not existing_pool:
            db.merge(Pool(pool=pool_name, slots=5))
            db.commit()
        db.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        job = SchedulerJob(dag_id, num_runs=10)
        job.run()

        # The first task of the DAG is expected to end up FAILED.
        first_task = dag.tasks[0]
        task_instance = TI(first_task, dag.start_date)
        task_instance.refresh_from_db()
        self.assertEqual(task_instance.state, State.FAILED)

        dag.clear()
Exemple #20
0
    def test_scheduler_fail_dagrun_timeout(self):
        """
        Test if a dagrun will be set to failed once it has timed out
        """
        dag = DAG(dag_id='test_scheduler_fail_dagrun_timeout',
                  start_date=DEFAULT_DATE)
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        try:
            orm_dag = DagModel(dag_id=dag.dag_id)
            session.merge(orm_dag)
            session.commit()

            scheduler = SchedulerJob()
            dag.clear()

            dr = scheduler.create_dag_run(dag)
            self.assertIsNotNone(dr)
            # Backdate the run by a day, far beyond the 60 second timeout.
            dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
            session.merge(dr)
            session.commit()

            dr2 = scheduler.create_dag_run(dag)
            self.assertIsNotNone(dr2)

            dr.refresh_from_db(session=session)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(dr.state, State.FAILED)
        finally:
            # Close the session even when an assertion fails.
            session.close()
Exemple #21
0
    def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
        """
        Test if a dagrun will not be scheduled if max_active_runs has been reached and dagrun_timeout is not reached
        Test if a dagrun will be scheduled if max_active_runs has been reached but dagrun_timeout is also reached
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        # The session is needed again further down to backdate the run, so
        # it stays open for the whole test and is closed exactly once.
        session = settings.Session()
        try:
            orm_dag = DagModel(dag_id=dag.dag_id)
            session.merge(orm_dag)
            session.commit()

            scheduler = SchedulerJob()
            dag.clear()

            dr = scheduler.create_dag_run(dag)
            self.assertIsNotNone(dr)

            # Should not be scheduled as DagRun has not timed out and max_active_runs is reached
            new_dr = scheduler.create_dag_run(dag)
            self.assertIsNone(new_dr)

            # Should be scheduled as dagrun_timeout has passed
            dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
            session.merge(dr)
            session.commit()
            new_dr = scheduler.create_dag_run(dag)
            self.assertIsNotNone(new_dr)
        finally:
            # Close the session even when an assertion fails.
            session.close()
Exemple #22
0
    def _test_schedule(self,
                       interval,
                       schedule_criteria,
                       start_date=True,
                       default_start_date=-10):
        """
        Create one DAG per day offset in -2..2 and check whether a dag run
        gets created for it.

        When ``start_date`` is truthy the offset is applied to the DAG's
        start date (and catchup is enabled); otherwise it is applied to the
        end date while the start date stays ``default_start_date`` days from
        today. ``schedule_criteria(delta)`` decides whether a dag run is
        expected for a given offset.
        """
        boundary = "start" if start_date else "end"
        print("Test {} date of {} schedule job.".format(boundary, interval))
        today = datetime.today()
        today_utc = datetime.now(timezone.utc)
        _start_date = today + timedelta(days=default_start_date)
        _end_date = None
        catchup = start_date

        def fmt(moment):
            # Render a datetime for the log output; "-" stands for None.
            return "-" if moment is None else moment.strftime("%Y/%m/%d %H:%M:%S")

        deltas = (
            (-2, "day_before_yesterday"),
            (-1, "yesterday"),
            (0, "today"),
            (1, "tomorrow"),
            (2, "day_after_tomorrow"),
        )

        for delta, description in deltas:
            date = today + timedelta(days=delta)

            if start_date:
                _start_date = datetime.today() + timedelta(days=delta)
            else:
                _end_date = date

            dag = DAG(f"dag_starts_{description}",
                      start_date=_start_date,
                      end_date=_end_date,
                      schedule_interval=interval,
                      catchup=catchup)

            scheduler = SchedulerJob()
            dag.clear()
            dag_run = scheduler.create_dag_run(dag)
            execution_date = dag_run.execution_date if dag_run is not None else None

            # One tab-indented line per date for the summary printed below.
            report_parts = (
                f"today={fmt(today)} (UTC={fmt(today_utc)}), ",
                f"start_date={fmt(dag.start_date)}, ",
                f"end_date={fmt(dag.end_date)}, ",
                f"execution_date={fmt(execution_date)}",
            )
            report = "\n\t" + "\n\t".join(report_parts)

            if schedule_criteria(delta):
                self.assertIsNotNone(dag_run)
                print(f"{description} is scheduled {report}.")
            else:
                self.assertIsNone(dag_run)
                print(f"{description} is not scheduled {report}.")
Exemple #23
0
 def do_schedule(function, function2):
     """Run a single scheduler loop against one empty DAG file.

     Both positional arguments are accepted but not used in the body.
     """
     # An empty file is enough because the mock above returns the expected
     # DAGs; pointing at a single file also keeps the scheduler from trying
     # to schedule the above DAG repeatedly.
     no_dags_file = os.path.join(models.DAGS_FOLDER, "no_dags.py")
     job = SchedulerJob(num_runs=1, executor=executor, subdir=no_dags_file)
     job.heartrate = 0
     job.run()
Exemple #24
0
 def do_schedule(function, function2):
     """Run a single scheduler loop against one empty DAG file.

     Both positional arguments are accepted but not used in the body.
     """
     # An empty file is enough because the mock above returns the expected
     # DAGs; pointing at a single file also keeps the scheduler from trying
     # to schedule the above DAG repeatedly.
     no_dags_file = os.path.join(settings.DAGS_FOLDER, "no_dags.py")
     job = SchedulerJob(num_runs=1, executor=executor, subdir=no_dags_file)
     job.heartrate = 0
     job.run()
Exemple #25
0
    def evaluate_dagrun(
            self,
            dag_id,
            first_task_state,
            second_task_state,
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGS.

        Schedules a dag run, runs the DAG for that execution date, checks
        the state of both task instances, and finally checks the dag run
        state before and after ``dag.get_active_runs()``.
        """
        run_kwargs = {} if run_kwargs is None else run_kwargs

        scheduler = SchedulerJob()
        dag = get_dag(dag_id, TEST_DAGS_FOLDER)
        dag_run = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # schedule once more to get a dagrun after the start_date
            dag_run = scheduler.schedule_dag(dag)
        ex_date = dag_run.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            # deliberately ignored; the resulting states are checked below
            pass

        # the DAG must hold exactly two tasks; check each against its
        # expected state, in order
        first_task, second_task = dag.tasks
        for task, expected_state in ((first_task, first_task_state),
                                     (second_task, second_task_state)):
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load the dagrun for this execution date
        dag_run = (
            session.query(DagRun)
            .filter(DagRun.dag_id == dag.dag_id,
                    DagRun.execution_date == ex_date)
            .first()
        )

        # still running at this point
        self.assertEqual(dag_run.state, State.RUNNING)

        dag.get_active_runs()

        # now expected to be in its final state
        self.assertEqual(dag_run.state, dagrun_state)
Exemple #26
0
    def test_dag_get_active_runs(self):
        """
        Test to check that a DAG returns its active runs
        """

        now = datetime.datetime.now()
        six_hours_ago_to_the_hour = (
            now - datetime.timedelta(hours=6)
        ).replace(minute=0, second=0, microsecond=0)

        start_date = six_hours_ago_to_the_hour
        dag_name1 = 'get_active_runs_test'

        default_args = {
            'owner': 'airflow',
            'depends_on_past': False,
            'start_date': start_date
        }
        dag1 = DAG(dag_name1,
                   schedule_interval='* * * * *',
                   max_active_runs=1,
                   default_args=default_args)

        # Three tasks chained one after another: 1 -> 2 -> 3.
        run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
        run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
        run_this_2.set_upstream(run_this_1)
        run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
        run_this_3.set_upstream(run_this_2)

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag1.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag1.clear()

        dr = scheduler.create_dag_run(dag1)

        # We had better get a dag run
        self.assertIsNotNone(dr)

        execution_date = dr.execution_date

        running_dates = dag1.get_active_runs()

        # With no active runs, fall back to a sentinel so the comparison
        # below fails with its message. This replaces a bare ``except:``
        # that would also have swallowed unrelated errors.
        running_date = running_dates[0] if running_dates else 'Except'

        self.assertEqual(execution_date, running_date,
                         'Running Date must match Execution Date')
Exemple #27
0
    def test_dag_get_active_runs(self):
        """
        Test to check that a DAG returns its active runs
        """

        now = datetime.datetime.now()
        six_hours_ago_to_the_hour = (
            now - datetime.timedelta(hours=6)
        ).replace(minute=0, second=0, microsecond=0)

        start_date = six_hours_ago_to_the_hour
        dag_name1 = 'get_active_runs_test'

        default_args = {
            'owner': 'airflow',
            'depends_on_past': False,
            'start_date': start_date
        }
        dag1 = DAG(dag_name1,
                   schedule_interval='* * * * *',
                   max_active_runs=1,
                   default_args=default_args)

        # Three tasks chained one after another: 1 -> 2 -> 3.
        run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
        run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
        run_this_2.set_upstream(run_this_1)
        run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
        run_this_3.set_upstream(run_this_2)

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag1.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag1.clear()

        dr = scheduler.create_dag_run(dag1)

        # We had better get a dag run
        self.assertIsNotNone(dr)

        execution_date = dr.execution_date

        running_dates = dag1.get_active_runs()

        # With no active runs, fall back to a sentinel so the comparison
        # below fails with its message. This replaces a bare ``except:``
        # that would also have swallowed unrelated errors.
        running_date = running_dates[0] if running_dates else 'Except'

        self.assertEqual(execution_date, running_date,
                         'Running Date must match Execution Date')
Exemple #28
0
    def test_trigger_controller_dag(self):
        """
        Before the controller DAG is backfilled, processing the target DAG
        must append nothing to the queue; afterwards it must append something.
        """
        controller_dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        controller_dag.clear()
        target_dag.clear()

        # First pass: the controller has not run yet.
        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        backfill = BackfillJob(dag=controller_dag,
                               start_date=DEFAULT_DATE,
                               end_date=DEFAULT_DATE,
                               ignore_first_depends_on_past=True)
        backfill.run()

        # Second pass, with a fresh scheduler and a fresh queue mock.
        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        controller_dag.clear()
Exemple #29
0
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGS.

        The approach is hackish: the scheduler creates the dag run, but the
        tasks themselves are executed through ``dag.run``. Afterwards every
        task listed in ``expected_task_states`` is checked, followed by the
        dag run state before and after ``update_state()``.
        """
        run_kwargs = {} if run_kwargs is None else run_kwargs

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dag_run = scheduler.schedule_dag(dag)

        if advance_execution_date:
            # schedule once more to get a dagrun after the start_date
            dag_run = scheduler.schedule_dag(dag)
        ex_date = dag_run.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            # deliberately ignored; the resulting states are checked below
            pass

        # compare each listed task instance with its expected state
        for task_id, expected_state in expected_task_states.items():
            task_instance = TI(dag.get_task(task_id), ex_date)
            task_instance.refresh_from_db()
            self.assertEqual(task_instance.state, expected_state)

        # load the dagrun and attach the dag to it
        matching_runs = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dag_run = matching_runs[0]
        dag_run.dag = dag

        # still running at this point
        self.assertEqual(dag_run.state, State.RUNNING)

        dag_run.update_state()

        # now expected to be in its final state
        self.assertEqual(dag_run.state, dagrun_state)
Exemple #30
0
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGS.

        The approach is hackish: the scheduler creates the dag run, but the
        tasks themselves are executed through ``dag.run``. Afterwards every
        task listed in ``expected_task_states`` is checked, followed by the
        dag run state before and after ``update_state()``.
        """
        run_kwargs = {} if run_kwargs is None else run_kwargs

        scheduler = SchedulerJob(**self.default_scheduler_args)
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dag_run = scheduler.create_dag_run(dag)

        if advance_execution_date:
            # create one more run to get a dagrun after the start_date
            dag_run = scheduler.create_dag_run(dag)
        ex_date = dag_run.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            # deliberately ignored; the resulting states are checked below
            pass

        # compare each listed task instance with its expected state
        for task_id, expected_state in expected_task_states.items():
            task_instance = TI(dag.get_task(task_id), ex_date)
            task_instance.refresh_from_db()
            self.assertEqual(task_instance.state, expected_state)

        # load the dagrun and attach the dag to it
        matching_runs = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dag_run = matching_runs[0]
        dag_run.dag = dag

        # still running at this point
        self.assertEqual(dag_run.state, State.RUNNING)

        dag_run.update_state()

        # now expected to be in its final state
        self.assertEqual(dag_run.state, dagrun_state)
Exemple #31
0
    def test_scheduler_auto_align(self):
        """
        Test if the schedule_interval will be auto aligned with the start_date
        such that if the start_date coincides with the schedule the first
        execution_date will be start_date, otherwise it will be start_date +
        interval.
        """
        # Case 1: the start_date (10:10) does not coincide with the
        # "4 5 * * *" schedule.
        dag = DAG(
            dag_id='test_scheduler_auto_align_1',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="4 5 * * *"
        )
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        # The session is not used again for this DAG, so release it here.
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(dr.execution_date, datetime.datetime(2016, 1, 2, 5, 4))

        # Case 2: the start_date (10:10) coincides with the
        # "10 10 * * *" schedule.
        dag = DAG(
            dag_id='test_scheduler_auto_align_2',
            start_date=datetime.datetime(2016, 1, 1, 10, 10, 0),
            schedule_interval="10 10 * * *"
        )
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEqual(dr.execution_date, datetime.datetime(2016, 1, 1, 10, 10))
Exemple #32
0
    def evaluate_dagrun(self,
                        dag_id,
                        first_task_state,
                        second_task_state,
                        dagrun_state,
                        run_kwargs=None,
                        advance_execution_date=False,
                        session=None):
        """
        Helper for testing DagRun states with simple two-task DAGS.

        Schedules a dag run, runs the DAG for that execution date, checks
        the state of both task instances, and finally checks the dag run
        state before and after ``dag.get_active_runs()``.
        """
        run_kwargs = {} if run_kwargs is None else run_kwargs

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dag_run = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # schedule once more to get a dagrun after the start_date
            dag_run = scheduler.schedule_dag(dag)
        ex_date = dag_run.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            # deliberately ignored; the resulting states are checked below
            pass

        # the DAG must hold exactly two tasks; check each against its
        # expected state, in order
        first_task, second_task = dag.tasks
        for task, expected_state in ((first_task, first_task_state),
                                     (second_task, second_task_state)):
            task_instance = TI(task, ex_date)
            task_instance.refresh_from_db()
            self.assertEqual(task_instance.state, expected_state)

        # load the dagrun for this execution date
        dag_run = (
            session.query(DagRun)
            .filter(DagRun.dag_id == dag.dag_id,
                    DagRun.execution_date == ex_date)
            .first()
        )

        # still running at this point
        self.assertEqual(dag_run.state, State.RUNNING)

        dag.get_active_runs()

        # now expected to be in its final state
        self.assertEqual(dag_run.state, dagrun_state)
Exemple #33
0
    def test_scheduler_dagrun_once(self):
        """
        A DAG scheduled with @once and a start_date must only ever get a
        single dagrun: the first create_dag_run call yields a run, the
        second one yields None.
        """
        once_dag = DAG(
            'test_scheduler_dagrun_once',
            start_date=datetime.datetime(2015, 1, 1),
            schedule_interval="@once",
        )

        job = SchedulerJob()
        once_dag.clear()

        first_run = job.create_dag_run(once_dag)
        self.assertIsNotNone(first_run)

        second_run = job.create_dag_run(once_dag)
        self.assertIsNone(second_run)
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Schedule and run a DAG from ``self.dagbag``, then check the states of
        the listed task instances and of the DagRun.

        :param dag_id: id of the DAG to load from ``self.dagbag``
        :param expected_task_states: mapping of task_id to expected state
        :param dagrun_state: expected DagRun state after ``dag.get_active_runs()``
        :param run_kwargs: extra keyword arguments forwarded to ``dag.run``
        :param advance_execution_date: schedule a second dagrun and test that one
        :param session: session used to load the DagRun
        """
        # NOTE(review): ``session`` is dereferenced below even though it
        # defaults to None -- presumably a decorator outside this view
        # injects it; confirm.
        extra_run_args = {} if run_kwargs is None else run_kwargs

        job = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        dag_run = job.schedule_dag(dag)
        if advance_execution_date:
            # schedule once more to get a dagrun after the start_date
            dag_run = job.schedule_dag(dag)
        execution_date = dag_run.execution_date

        try:
            dag.run(start_date=execution_date,
                    end_date=execution_date,
                    **extra_run_args)
        except AirflowException:
            # tolerated: the resulting states are asserted below
            pass

        # every listed task must have reached its expected state
        for task_id, expected_state in expected_task_states.items():
            task_instance = TI(dag.get_task(task_id), execution_date)
            task_instance.refresh_from_db()
            self.assertEqual(task_instance.state, expected_state)

        run_query = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == execution_date,
        )
        dag_run = run_query.first()

        # the freshly loaded dagrun is still marked as running
        self.assertEqual(dag_run.state, State.RUNNING)

        # return value is unused; presumably this refreshes the dagrun
        # states -- confirm against DAG.get_active_runs
        dag.get_active_runs()

        # the dagrun must now be in the state the caller expects
        self.assertEqual(dag_run.state, dagrun_state)
Exemple #35
0
    def run_single_scheduler_loop_with_no_dags(dags_folder):
        """
        Run exactly one scheduler loop over ``dags_folder`` without
        scheduling any dag.

        The job is restricted to a dag id that does not exist, so nothing is
        actually run; only the other side effects of a scheduler loop happen,
        e.g. surfacing parse errors for the files in the dags_folder.

        :param dags_folder: the directory to traverse
        :type dags_folder: str
        """
        scan_dir = os.path.join(dags_folder)
        job = SchedulerJob(
            # We don't want to actually run anything
            dag_id='this_dag_doesnt_exist',
            num_runs=1,
            subdir=scan_dir,
        )
        job.heartrate = 0
        job.run()
    def setUpClass(cls):
        """
        Create the shared ``test_dag``, seed it with three RUNNING manual
        dagruns and build the SchedulerJob stored as
        ``cls.dag_file_processor``.
        """
        from tests.compat import MagicMock
        from airflow.jobs import SchedulerJob

        default_args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
        }
        cls.dag = DAG(
            'test_dag',
            default_args=default_args,
            schedule_interval=INTERVAL)

        # (run_id, execution_date) of each dagrun to create, in order
        manual_runs = (
            ("manual__1", DEFAULT_DATE),
            ("manual__2", timezone.datetime(2016, 1, 1, 12)),
            ("manual__3", END_DATE),
        )
        for run_id, execution_date in manual_runs:
            cls.dag.create_dagrun(
                run_id=run_id,
                execution_date=execution_date,
                state=State.RUNNING,
            )

        # the job gets an empty dag_ids list and a mocked logger
        cls.dag_file_processor = SchedulerJob(dag_ids=[], log=MagicMock())
Exemple #37
0
    def test_scheduler_dagrun_once(self):
        """
        Check that the scheduler creates one dagrun, and only one, for a dag
        scheduled with @once and a start_date.
        """
        start = datetime.datetime(2015, 1, 1)
        dag = DAG('test_scheduler_dagrun_once',
                  start_date=start,
                  schedule_interval="@once")

        job = SchedulerJob()
        dag.clear()

        # first attempt creates the run ...
        created = job.create_dag_run(dag)
        self.assertIsNotNone(created)

        # ... any further attempt must not
        repeated = job.create_dag_run(dag)
        self.assertIsNone(repeated)
    def test_scheduler_multiprocessing(self):
        """
        Test that the scheduler can successfully queue multiple dags in
        parallel: after two scheduler runs over both dags,
        ``test_start_date_scheduling`` must have no task instances.
        """
        dag_ids = ['test_start_date_scheduling', 'test_dagrun_states_success']
        for current_id in dag_ids:
            self.dagbag.get_dag(current_id).clear()

        job = SchedulerJob(dag_ids=dag_ids, num_runs=2)
        job.run()

        # zero tasks ran
        checked_dag_id = 'test_start_date_scheduling'
        session = settings.Session()
        task_instances = (
            session.query(TI).filter(TI.dag_id == checked_dag_id).all()
        )
        self.assertEqual(len(task_instances), 0)
Exemple #39
0
    def test_scheduler_multiprocessing(self):
        """
        Run one scheduler job over two dags at once and verify that no task
        instance was created for ``test_start_date_scheduling``.
        """
        untouched_dag_id = 'test_start_date_scheduling'
        dag_ids = [untouched_dag_id, 'test_dagrun_states_success']

        # start from a clean slate for every dag involved
        for each_id in dag_ids:
            loaded_dag = self.dagbag.get_dag(each_id)
            loaded_dag.clear()

        multi_dag_job = SchedulerJob(dag_ids=dag_ids, num_runs=2)
        multi_dag_job.run()

        # zero tasks ran
        session = settings.Session()
        ti_query = session.query(TI).filter(TI.dag_id == untouched_dag_id)
        self.assertEqual(len(ti_query.all()), 0)
Exemple #40
0
    def test_scheduler_verify_pool_full(self, mock_pool_full):
        """
        Test task instances not queued when pool is full

        Two dagruns are created for a dag whose single task uses a one-slot
        pool. Both task instances are marked SCHEDULED and handed to
        ``_execute_task_instances``; only one of them is expected to end up
        in the executor's queued tasks.

        :param mock_pool_full: mock whose return value is forced to False
            here (presumably injected by a patch decorator outside this
            block -- confirm)
        """
        mock_pool_full.return_value = False

        dag = DAG(
            dag_id='test_scheduler_verify_pool_full',
            start_date=DEFAULT_DATE)

        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
            pool='test_scheduler_verify_pool_full')

        session = settings.Session()
        pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
        session.add(pool)
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        # Create 2 dagruns, which will create 2 task instances.
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(dr.execution_date, DEFAULT_DATE)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        queue = []
        scheduler._process_task_instances(dag, queue=queue)
        self.assertEqual(len(queue), 2)
        dagbag = SimpleDagBag([dag])

        # Recreated part of the scheduler here, to kick off tasks -> executor
        for ti_key in queue:
            # ti_key[1] is used as the task id, ti_key[2] as the execution date
            task = dag.get_task(ti_key[1])
            ti = models.TaskInstance(task, ti_key[2])
            # Task starts out in the scheduled state. All tasks in the
            # scheduled state will be sent to the executor
            ti.state = State.SCHEDULED

            # Also save this task instance to the DB.
            session.merge(ti)
            session.commit()

        scheduler._execute_task_instances(dagbag,
                                          (State.SCHEDULED,
                                           State.UP_FOR_RETRY))

        # only one of the two task instances is expected to be queued
        self.assertEqual(len(scheduler.executor.queued_tasks), 1)
Exemple #41
0
    def test_retry_handling_job(self):
        """
        Integration test: one scheduler run must not reset the try_number
        of a task by accident.
        """
        dag = self.dagbag.get_dag('test_retry_handling_job')
        retry_task = dag.get_task("test_retry_handling_op")
        dag.clear()

        job = SchedulerJob(dag_id=dag.dag_id, num_runs=1)
        job.heartrate = 0
        job.run()

        session = settings.Session()
        ti_query = session.query(TI).filter(
            TI.dag_id == dag.dag_id,
            TI.task_id == retry_task.task_id,
        )
        task_instance = ti_query.first()

        # make sure the counter has increased
        self.assertEqual(task_instance.try_number, 2)
        self.assertEqual(task_instance.state, State.UP_FOR_RETRY)
Exemple #42
0
    def test_scheduler_run_duration(self):
        """
        Verify that a scheduler given a run_duration stops close to that
        limit: the wall-clock overrun must stay below five seconds.
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        run_limit = 5
        started_at = datetime.datetime.now()
        job = SchedulerJob(
            dag_id,
            run_duration=run_limit,
            **self.default_scheduler_args)
        job.run()
        finished_at = datetime.datetime.now()

        elapsed = (finished_at - started_at).total_seconds()
        logging.info("Test ran in %.2fs, expected %.2fs", elapsed, run_limit)

        overrun = elapsed - run_limit
        self.assertLess(overrun, 5.0)
Exemple #43
0
    def test_scheduler_max_active_runs_respected_after_clear(self):
        """
        Test if _process_task_instances only schedules ti's up to max_active_runs
        (related to issue AIRFLOW-137)
        """
        dag = DAG(dag_id='test_scheduler_max_active_runs_respected_after_clear',
                  start_date=DEFAULT_DATE)
        dag.max_active_runs = 3

        dummy_task = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        job = SchedulerJob()
        dag.clear()

        # a dagrun is created while max_active_runs is still 3 ...
        job.create_dag_run(dag)

        # ... then the limit is lowered to 1 before processing
        dag.max_active_runs = 1

        # process the task instances against a mock queue and check what was
        # appended to it (should be one, not 3)
        task_queue = mock.Mock()
        job._process_task_instances(dag, queue=task_queue)

        expected_key = (dag.dag_id, dummy_task.task_id, DEFAULT_DATE)
        task_queue.append.assert_called_with(expected_key)
Exemple #44
0
    def test_trigger_controller_dag(self):
        """
        Processing the target dag queues nothing before the controller dag
        has been backfilled, and queues something afterwards.
        """
        controller_dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        controller_dag.clear()
        target_dag.clear()

        # before the controller ran: nothing to queue for the target
        job_before = SchedulerJob()
        queue_before = mock.Mock()
        job_before._process_task_instances(target_dag, queue=queue_before)
        self.assertFalse(queue_before.append.called)

        backfill = BackfillJob(
            dag=controller_dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True,
        )
        backfill.run()

        # after the controller ran: the target has work to queue
        job_after = SchedulerJob()
        queue_after = mock.Mock()
        job_after._process_task_instances(target_dag, queue=queue_after)
        self.assertTrue(queue_after.append.called)

        # leave both dags clean for other tests
        target_dag.clear()
        controller_dag.clear()
Exemple #45
0
    def test_scheduler_verify_pool_full(self, mock_pool_full):
        """
        Test task instances not queued when pool is full

        The dag's only task is bound to a pool with a single slot. Two
        dagruns produce two task instances; after both are stored as
        SCHEDULED and passed through ``_execute_task_instances``, exactly one
        is expected in ``scheduler.executor.queued_tasks``.

        :param mock_pool_full: mock whose return value is forced to False
            here (presumably injected by a patch decorator outside this
            block -- confirm)
        """
        mock_pool_full.return_value = False

        dag = DAG(
            dag_id='test_scheduler_verify_pool_full',
            start_date=DEFAULT_DATE)

        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
            pool='test_scheduler_verify_pool_full')

        session = settings.Session()
        pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
        session.add(pool)
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        # Create 2 dagruns, which will create 2 task instances.
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(dr.execution_date, DEFAULT_DATE)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        queue = []
        scheduler._process_task_instances(dag, queue=queue)
        self.assertEqual(len(queue), 2)
        dagbag = SimpleDagBag([dag])

        # Recreated part of the scheduler here, to kick off tasks -> executor
        for ti_key in queue:
            # ti_key[1] is used as the task id, ti_key[2] as the execution date
            task = dag.get_task(ti_key[1])
            ti = models.TaskInstance(task, ti_key[2])
            # Task starts out in the scheduled state. All tasks in the
            # scheduled state will be sent to the executor
            ti.state = State.SCHEDULED

            # Also save this task instance to the DB.
            session.merge(ti)
            session.commit()

        scheduler._execute_task_instances(dagbag,
                                          (State.SCHEDULED,
                                           State.UP_FOR_RETRY))

        # only one of the two task instances is expected to be queued
        self.assertEqual(len(scheduler.executor.queued_tasks), 1)
Exemple #46
0
    def test_scheduler_process_check_heartrate(self):
        """
        Test if process dag honors the heartrate: with last_scheduler_run
        set to now and a heartrate of 1000, process_dag must not put
        anything on the queue.
        """
        dag = DAG(
            dag_id='test_scheduler_process_check_heartrate',
            start_date=DEFAULT_DATE)
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        dag_model = DagModel(dag_id=dag.dag_id)
        # pretend the scheduler has just visited this dag
        dag_model.last_scheduler_run = datetime.datetime.now()
        session.merge(dag_model)
        session.commit()
        session.close()

        job = SchedulerJob()
        job.heartrate = 1000

        dag.clear()

        dag_run = job.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        task_queue = mock.Mock()
        job.process_dag(dag, queue=task_queue)

        task_queue.put.assert_not_called()
Exemple #47
0
    def test_scheduler_run_duration(self):
        """
        Verifies that the scheduler run duration limit is followed.

        The scheduler is started with ``run_duration`` set and the elapsed
        wall-clock time is measured; the run may exceed the limit by less
        than five seconds.
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        expected_run_duration = 5
        start_time = datetime.datetime.now()
        scheduler = SchedulerJob(dag_id,
                                 run_duration=expected_run_duration,
                                 **self.default_scheduler_args)
        scheduler.run()
        end_time = datetime.datetime.now()

        run_duration = (end_time - start_time).total_seconds()
        _log.info("Test ran in %.2fs, expected %.2fs",
                  run_duration,
                  expected_run_duration)
        # Use a unittest assertion rather than a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both operands on failure.
        self.assertLess(run_duration - expected_run_duration, 5.0)
Exemple #48
0
    def test_scheduler_max_active_runs_respected_after_clear(self):
        """
        Check that _process_task_instances honours a max_active_runs value
        that was lowered after a dagrun had been created
        (related to issue AIRFLOW-137)
        """
        dag_id = 'test_scheduler_max_active_runs_respected_after_clear'
        dag = DAG(dag_id=dag_id, start_date=DEFAULT_DATE)
        dag.max_active_runs = 3

        only_task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        dag_model = DagModel(dag_id=dag.dag_id)
        session.merge(dag_model)
        session.commit()
        session.close()

        job = SchedulerJob()
        dag.clear()

        # create a dagrun under the original limit of 3
        job.create_dag_run(dag)

        # Reduce max_active_runs to 1
        dag.max_active_runs = 1

        # process against a mock queue so the appended keys can be inspected
        # (should be one, not 3)
        recorded_queue = mock.Mock()
        job._process_task_instances(dag, queue=recorded_queue)

        recorded_queue.append.assert_called_with(
            (dag.dag_id, only_task.task_id, DEFAULT_DATE)
        )
Exemple #49
0
    def test_scheduler_verify_max_active_runs_and_dagrun_timeout(self):
        """
        Test that a dagrun is not scheduled when max_active_runs has been
        reached and dagrun_timeout has not, and that a dagrun is scheduled
        when max_active_runs has been reached but dagrun_timeout has passed.
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs_and_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        DummyOperator(task_id='dummy', dag=dag, owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        job = SchedulerJob()
        dag.clear()

        active_run = job.create_dag_run(dag)
        self.assertIsNotNone(active_run)

        # The only slot is taken and the run has not timed out yet, so no
        # further dagrun may be created.
        blocked_run = job.create_dag_run(dag)
        self.assertIsNone(blocked_run)

        # Backdate the active run by a day so that dagrun_timeout has
        # passed; a new dagrun must now be created.
        one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
        active_run.start_date = one_day_ago
        session.merge(active_run)
        session.commit()
        replacement_run = job.create_dag_run(dag)
        self.assertIsNotNone(replacement_run)
Exemple #50
0
    def test_scheduler_add_new_task(self):
        """
        Test if a task instance will be added if the dag is updated

        A dagrun is created for a one-task dag, a second task is then added
        to the same dag, and after ``_process_task_instances`` the dagrun is
        expected to report two task instances.
        """
        dag = DAG(
            dag_id='test_scheduler_add_new_task',
            start_date=DEFAULT_DATE)

        # Operators attach themselves to the dag; no local reference needed.
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances()
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(tis), 1)

        # Update the dag after its dagrun already exists.
        DummyOperator(
            task_id='dummy2',
            dag=dag,
            owner='airflow')

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 2)
Exemple #51
0
    def test_dagrun_root_fail_unfinished(self):
        """
        A DagRun with one unfinished and one failed root task stays RUNNING.
        """
        # Run both the failed and successful tasks
        job = SchedulerJob(**self.default_scheduler_args)
        dag = self.dagbag.get_dag('test_dagrun_states_root_fail_unfinished')
        dag.clear()
        dag_run = job.create_dag_run(dag)
        try:
            dag.run(start_date=dag_run.execution_date,
                    end_date=dag_run.execution_date)
        except AirflowException:
            # Expected, since one of the tasks fails
            pass

        # Reset the successful task to "never ran" so that the dagrun has an
        # unfinished task, then check the dagrun is still reported as running.
        session = settings.Session()
        unfinished_ti = dag_run.get_task_instance(
            'test_dagrun_unfinished', session=session)
        unfinished_ti.state = State.NONE
        session.commit()

        resulting_state = dag_run.update_state()
        self.assertEqual(resulting_state, State.RUNNING)
Exemple #52
0
    def test_scheduler_do_not_schedule_removed_task(self):
        """
        A dagrun is created for a dag with one task; the dag is then
        re-created without that task, and processing it must not put
        anything on the queue.
        """
        dag_id = 'test_scheduler_do_not_schedule_removed_task'

        dag = DAG(dag_id=dag_id, start_date=DEFAULT_DATE)
        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        session.merge(DagModel(dag_id=dag.dag_id))
        session.commit()
        session.close()

        job = SchedulerJob()
        dag.clear()

        dag_run = job.schedule_dag(dag)
        self.assertIsNotNone(dag_run)

        # same dag id, but the task is gone
        dag_without_task = DAG(dag_id=dag_id, start_date=DEFAULT_DATE)

        task_queue = mock.Mock()
        job.process_dag(dag_without_task, queue=task_queue)

        task_queue.put.assert_not_called()
Exemple #53
0
    def test_scheduler_start_date(self):
        """
        Test that the scheduler respects start_dates, even when DAGS have run
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        def count_task_instances():
            # a session is requested for every count, as the inline code did
            session = settings.Session()
            return len(session.query(TI).filter(TI.dag_id == dag_id).all())

        def run_scheduler_twice():
            job = SchedulerJob(dag_id,
                               num_runs=2,
                               **self.default_scheduler_args)
            job.run()

        run_scheduler_twice()

        # zero tasks ran
        self.assertEqual(count_task_instances(), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE).run()

        # one task ran
        self.assertEqual(count_task_instances(), 1)

        run_scheduler_twice()

        # still one task
        self.assertEqual(count_task_instances(), 1)
Exemple #54
0
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299

        A first scheduler run with a ``TestExecutor`` is expected to leave
        the task QUEUED; a second scheduler with a different executor is
        then expected to move it to FAILED.
        """
        # Make sure the pool exists before the dag is scheduled.
        session = settings.Session()
        if not (session.query(Pool).filter(
                Pool.pool == 'test_queued_pool').first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id,
                                 num_runs=1,
                                 executor=TestExecutor(),
                                 **self.default_scheduler_args)
        scheduler.run()

        task_1 = dag.tasks[0]
        # Pass arguments to logging lazily instead of pre-formatting with
        # str.format; the emitted messages are unchanged.
        logging.info("Trying to find task %s", task_1)
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        logging.error("TI is: %s", ti)
        self.assertEqual(ti.state, State.QUEUED)

        # now we use a DIFFERENT scheduler and executor
        # to simulate the num-runs CLI arg
        scheduler2 = SchedulerJob(dag_id,
                                  num_runs=5,
                                  executor=DEFAULT_EXECUTOR.__class__(),
                                  **self.default_scheduler_args)
        scheduler2.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        dag.clear()
Exemple #55
0
    def test_scheduler_start_date(self):
        """
        The scheduler must respect a dag's start_date, even after the dag
        has been run by a backfill.
        """
        dag_id = 'test_start_date_scheduling'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        self.assertTrue(dag.start_date > DEFAULT_DATE)

        first_job = SchedulerJob(
            dag_id,
            num_runs=2,
            **self.default_scheduler_args)
        first_job.run()

        # zero tasks ran
        session = settings.Session()
        tis_after_first_job = session.query(TI).filter(
            TI.dag_id == dag_id).all()
        self.assertEqual(len(tis_after_first_job), 0)

        # previously, running this backfill would kick off the Scheduler
        # because it would take the most recent run and start from there
        # That behavior still exists, but now it will only do so if after the
        # start date
        backfill_job = BackfillJob(dag=dag,
                                   start_date=DEFAULT_DATE,
                                   end_date=DEFAULT_DATE)
        backfill_job.run()

        # one task ran
        session = settings.Session()
        tis_after_backfill = session.query(TI).filter(
            TI.dag_id == dag_id).all()
        self.assertEqual(len(tis_after_backfill), 1)

        second_job = SchedulerJob(
            dag_id,
            num_runs=2,
            **self.default_scheduler_args)
        second_job.run()

        # still one task
        session = settings.Session()
        tis_after_second_job = session.query(TI).filter(
            TI.dag_id == dag_id).all()
        self.assertEqual(len(tis_after_second_job), 1)
Exemple #56
0
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299

        A first scheduler run with a ``TestExecutor`` is expected to leave
        the task QUEUED; a second scheduler with a different executor is
        then expected to move it to FAILED.
        """
        # Make sure the pool exists before the dag is scheduled.
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id,
                                 num_runs=1,
                                 executor=TestExecutor(),
                                 **self.default_scheduler_args)
        scheduler.run()

        task_1 = dag.tasks[0]
        # Pass arguments to logging lazily instead of pre-formatting with
        # str.format; the emitted messages are unchanged.
        logging.info("Trying to find task %s", task_1)
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        logging.error("TI is: %s", ti)
        self.assertEqual(ti.state, State.QUEUED)

        # now we use a DIFFERENT scheduler and executor
        # to simulate the num-runs CLI arg
        scheduler2 = SchedulerJob(
            dag_id,
            num_runs=5,
            executor=DEFAULT_EXECUTOR.__class__(),
            **self.default_scheduler_args)
        scheduler2.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        dag.clear()
Exemple #57
0
 def do_schedule(function, function2):
     """
     Run a single scheduler loop (``num_runs=1``) using the ``executor``
     taken from the enclosing scope.

     ``function`` and ``function2`` are not used in the lines visible here.
     NOTE(review): presumably they are mocks injected by patch decorators
     outside this view -- confirm.
     """
     scheduler = SchedulerJob(num_runs=1, executor=executor,)
     scheduler.run()