Example #1
    def test_scheduler_process_check_heartrate(self):
        """
        Test that process_dag honors the heartrate
        """
        dag = DAG(
            dag_id='test_scheduler_process_check_heartrate',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.last_scheduler_run = datetime.datetime.now()
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        scheduler.heartrate = 1000

        dag.clear()

        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_not_called()
Example #2
 def setUp(self):
     configuration.load_test_config()
     from airflow.contrib.hooks.ssh_hook import SSHHook
     hook = SSHHook(ssh_conn_id='ssh_default')
     hook.no_host_key_check = True
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
     self.test_dir = "/tmp"
     self.test_local_dir = "/tmp/tmp2"
     self.test_remote_dir = "/tmp/tmp1"
     self.test_local_filename = 'test_local_file'
     self.test_remote_filename = 'test_remote_file'
     self.test_local_filepath = '{0}/{1}'.format(self.test_dir,
                                                 self.test_local_filename)
     # Local Filepath with Intermediate Directory
     self.test_local_filepath_int_dir = '{0}/{1}'.format(self.test_local_dir,
                                                         self.test_local_filename)
     self.test_remote_filepath = '{0}/{1}'.format(self.test_dir,
                                                  self.test_remote_filename)
     # Remote Filepath with Intermediate Directory
     self.test_remote_filepath_int_dir = '{0}/{1}'.format(self.test_remote_dir,
                                                          self.test_remote_filename)
Example #3
    def test_scheduler_do_not_schedule_removed_task(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)

        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_removed_task',
            start_date=DEFAULT_DATE)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_not_called()
Example #4
    def test_clear_task_instances_without_task(self):
        dag = DAG('test_clear_task_instances_without_task', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='task0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='task1', owner='test', dag=dag, retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        ti0.run()
        ti1.run()

        # Remove the task from dag.
        dag.task_dict = {}
        self.assertFalse(dag.has_task(task0.task_id))
        self.assertFalse(dag.has_task(task1.task_id))

        session = settings.Session()
        qry = session.query(TI).filter(
            TI.dag_id == dag.dag_id).all()
        clear_task_instances(qry, session)
        session.commit()
        # When dag is None, max_tries will be maximum of original max_tries or try_number.
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # Next try to run will be try 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
        self.assertEqual(ti1.try_number, 2)
        self.assertEqual(ti1.max_tries, 2)
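For reference, the max_tries recomputation these assertions exercise can be sketched as follows. This is a minimal sketch under the semantics of this Airflow version, where try_number already points at the next attempt; recompute_max_tries is an illustrative helper, not an Airflow API:

def recompute_max_tries(max_tries, try_number):
    # When clear_task_instances cannot resolve the task (it was removed
    # from the DAG), keep headroom for one more attempt by raising
    # max_tries to at least the number of attempts already made.
    return max(max_tries, try_number - 1)

assert recompute_max_tries(0, 2) == 1  # ti0: retries=0, ran once
assert recompute_max_tries(2, 2) == 2  # ti1: retries=2, ran once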
Example #5
    def test_following_previous_schedule_daily_dag_CET_to_CEST(self):
        """
        Make sure DST transitions are properly observed
        """
        local_tz = pendulum.timezone('Europe/Zurich')
        start = local_tz.convert(datetime.datetime(2018, 3, 25, 2),
                                 dst_rule=pendulum.PRE_TRANSITION)

        utc = timezone.convert_to_utc(start)

        dag = DAG('tz_dag', start_date=start, schedule_interval='0 3 * * *')

        prev = dag.previous_schedule(utc)
        prev_local = local_tz.convert(prev)

        self.assertEqual(prev_local.isoformat(), "2018-03-24T03:00:00+01:00")
        self.assertEqual(prev.isoformat(), "2018-03-24T02:00:00+00:00")

        _next = dag.following_schedule(utc)
        next_local = local_tz.convert(_next)

        self.assertEqual(next_local.isoformat(), "2018-03-25T03:00:00+02:00")
        self.assertEqual(_next.isoformat(), "2018-03-25T01:00:00+00:00")

        prev = dag.previous_schedule(_next)
        prev_local = local_tz.convert(prev)

        self.assertEqual(prev_local.isoformat(), "2018-03-24T03:00:00+01:00")
        self.assertEqual(prev.isoformat(), "2018-03-24T02:00:00+00:00")
Example #6
    def test_sync_to_db(self, mock_now):
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
        )
        with dag:
            DummyOperator(task_id='task', owner='owner1')
            SubDagOperator(
                task_id='subtask',
                owner='owner2',
                subdag=DAG(
                    'dag.subtask',
                    start_date=DEFAULT_DATE,
                )
            )
        now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC'))
        mock_now.return_value = now
        session = settings.Session()
        dag.sync_to_db(session=session)

        orm_dag = session.query(DagModel).filter(DagModel.dag_id == 'dag').one()
        self.assertEqual(set(orm_dag.owners.split(', ')), {'owner1', 'owner2'})
        self.assertEqual(orm_dag.last_scheduler_run, now)
        self.assertTrue(orm_dag.is_active)
        self.assertIsNone(orm_dag.default_view)
        self.assertEqual(orm_dag.get_default_view(),
                         configuration.conf.get('webserver', 'dag_default_view').lower())
        self.assertEqual(orm_dag.safe_dag_id, 'dag')

        orm_subdag = session.query(DagModel).filter(
            DagModel.dag_id == 'dag.subtask').one()
        self.assertEqual(set(orm_subdag.owners.split(', ')), {'owner1', 'owner2'})
        self.assertEqual(orm_subdag.last_scheduler_run, now)
        self.assertTrue(orm_subdag.is_active)
        self.assertEqual(orm_subdag.safe_dag_id, 'dag__dot__subtask')
Example #7
    def test_skip(self, mock_now):
        session = settings.Session()
        now = datetime.datetime.utcnow().replace(tzinfo=pendulum.timezone('UTC'))
        mock_now.return_value = now
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
        )
        with dag:
            tasks = [DummyOperator(task_id='task')]
        dag_run = dag.create_dagrun(
            run_id='manual__' + now.isoformat(),
            state=State.FAILED,
        )
        SkipMixin().skip(
            dag_run=dag_run,
            execution_date=now,
            tasks=tasks,
            session=session)

        session.query(TI).filter(
            TI.dag_id == 'dag',
            TI.task_id == 'task',
            TI.state == State.SKIPPED,
            TI.start_date == now,
            TI.end_date == now,
        ).one()
Example #8
    def test_following_previous_schedule(self):
        """
        Make sure DST transitions are properly observed
        """
        local_tz = pendulum.timezone('Europe/Zurich')
        start = local_tz.convert(datetime.datetime(2018, 10, 28, 2, 55),
                                 dst_rule=pendulum.PRE_TRANSITION)
        self.assertEqual(start.isoformat(), "2018-10-28T02:55:00+02:00",
                         "Pre-condition: start date is in DST")

        utc = timezone.convert_to_utc(start)

        dag = DAG('tz_dag', start_date=start, schedule_interval='*/5 * * * *')
        _next = dag.following_schedule(utc)
        next_local = local_tz.convert(_next)

        self.assertEqual(_next.isoformat(), "2018-10-28T01:00:00+00:00")
        self.assertEqual(next_local.isoformat(), "2018-10-28T02:00:00+01:00")

        prev = dag.previous_schedule(utc)
        prev_local = local_tz.convert(prev)

        self.assertEqual(prev_local.isoformat(), "2018-10-28T02:50:00+02:00")

        prev = dag.previous_schedule(_next)
        prev_local = local_tz.convert(prev)

        self.assertEqual(prev_local.isoformat(), "2018-10-28T02:55:00+02:00")
        self.assertEqual(prev, utc)
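The dst_rule argument used above is what disambiguates wall-clock times that occur twice on the fall-back day. A minimal standalone sketch with the pendulum 1.x API these tests rely on:

import datetime
import pendulum

local_tz = pendulum.timezone('Europe/Zurich')
naive = datetime.datetime(2018, 10, 28, 2, 30)  # occurs twice on 2018-10-28
early = local_tz.convert(naive, dst_rule=pendulum.PRE_TRANSITION)
late = local_tz.convert(naive, dst_rule=pendulum.POST_TRANSITION)
print(early.isoformat())  # 2018-10-28T02:30:00+02:00 (still CEST)
print(late.isoformat())   # 2018-10-28T02:30:00+01:00 (back on CET)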
Example #9
    def test_scheduler_process_task_instances(self):
        """
        Test if _process_task_instances puts the right task instances into the
        queue.
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.append.assert_called_with(
            (dag.dag_id, dag_task1.task_id, DEFAULT_DATE)
        )
Example #10
    def setUp(self):
        configuration.load_test_config()
        from airflow.contrib.hooks.ssh_hook import SSHHook
        from airflow.hooks.S3_hook import S3Hook

        hook = SSHHook(ssh_conn_id='ssh_default')
        s3_hook = S3Hook('aws_default')
        hook.no_host_key_check = True
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True
        }
        dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
        dag.schedule_interval = '@once'

        self.hook = hook
        self.s3_hook = s3_hook

        self.ssh_client = self.hook.get_conn()
        self.sftp_client = self.ssh_client.open_sftp()

        self.dag = dag
        self.s3_bucket = BUCKET
        self.sftp_path = SFTP_PATH
        self.s3_key = S3_KEY
Example #11
    def test_scheduler_does_not_run_excluded(self):
        dag = DAG(
            dag_id='test_scheduler_does_not_run_excluded',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        tis = dr.get_task_instances(session=session)
        for ti in tis:
            ti.state = State.EXCLUDED

        session.commit()
        session.close()

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()
Example #12
    def test_scheduler_do_not_schedule_too_early(self):
        dag = DAG(
            dag_id='test_scheduler_do_not_schedule_too_early',
            start_date=datetime.datetime(2200, 1, 1))
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)

        queue = mock.Mock()
        scheduler._process_task_instances(dag, queue=queue)

        queue.put.assert_not_called()
Example #13
    def test_scheduler_verify_max_active_runs(self):
        """
        Test that a dagrun will not be scheduled if max_active_runs has been reached
        """
        dag = DAG(
            dag_id='test_scheduler_verify_max_active_runs',
            start_date=DEFAULT_DATE)
        dag.max_active_runs = 1

        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)

        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)
Example #14
    def test_scheduler_fail_dagrun_timeout(self):
        """
        Test that a dagrun will be set to failed if it times out
        """
        dag = DAG(
            dag_id='test_scheduler_fail_dagrun_timeout',
            start_date=DEFAULT_DATE)
        dag.dagrun_timeout = datetime.timedelta(seconds=60)

        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr.start_date = datetime.datetime.now() - datetime.timedelta(days=1)
        session.merge(dr)
        session.commit()

        dr2 = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr2)

        dr.refresh_from_db(session=session)
        self.assertEquals(dr.state, State.FAILED)
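The timeout condition this test exercises reduces to a simple comparison; a sketch of the assumed shape, not the scheduler's literal code:

import datetime

def dagrun_timed_out(dr_start_date, dagrun_timeout, now=None):
    # A running dagrun whose start_date is older than dag.dagrun_timeout
    # gets marked FAILED when the scheduler next processes the DAG.
    now = now or datetime.datetime.now()
    return dagrun_timeout is not None and dr_start_date < now - dagrun_timeout

start = datetime.datetime.now() - datetime.timedelta(days=1)
assert dagrun_timed_out(start, datetime.timedelta(seconds=60))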
Example #15
    def test_dag_clear(self):
        dag = DAG('test_dag_clear', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='test_dag_clear_task_0', owner='test', dag=dag)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        # Next try to run will be try 1
        self.assertEqual(ti0.try_number, 1)
        ti0.run()
        self.assertEqual(ti0.try_number, 2)
        dag.clear()
        ti0.refresh_from_db()
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.state, State.NONE)
        self.assertEqual(ti0.max_tries, 1)

        task1 = DummyOperator(task_id='test_dag_clear_task_1', owner='test',
                              dag=dag, retries=2)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        self.assertEqual(ti1.max_tries, 2)
        ti1.try_number = 1
        # Next try will be 2
        ti1.run()
        self.assertEqual(ti1.try_number, 3)
        self.assertEqual(ti1.max_tries, 2)

        dag.clear()
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # after clearing the dag, ti1 should show attempt 3 of 5
        self.assertEqual(ti1.max_tries, 4)
        self.assertEqual(ti1.try_number, 3)
        # after clearing the dag, ti0 should show attempt 2 of 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
Example #16
    def test_scheduler_process_execute_task(self):
        """
        Test that process_dag sends a task to the executor
        """
        dag = DAG(
            dag_id='test_scheduler_process_execute_task',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        self.assertIsNotNone(dr)

        queue = mock.Mock()
        scheduler.process_dag(dag, queue=queue)

        queue.put.assert_called_with(
            ((dag.dag_id, dag_task1.task_id, DEFAULT_DATE), None)
        )

        tis = dr.get_task_instances(state=State.SCHEDULED)
        self.assertIsNotNone(tis)
Example #17
    def test_scheduler_verify_pool_full(self, mock_pool_full):
        """
        Test that task instances are not queued when the pool is full
        """
        mock_pool_full.return_value = False

        dag = DAG(
            dag_id='test_scheduler_verify_pool_full',
            start_date=DEFAULT_DATE)

        DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow',
            pool='test_scheduler_verify_pool_full')

        session = settings.Session()
        pool = Pool(pool='test_scheduler_verify_pool_full', slots=1)
        session.add(pool)
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        scheduler = SchedulerJob()
        dag.clear()

        # Create 2 dagruns, which will create 2 task instances.
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        self.assertEquals(dr.execution_date, DEFAULT_DATE)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        queue = []
        scheduler._process_task_instances(dag, queue=queue)
        self.assertEquals(len(queue), 2)
        dagbag = SimpleDagBag([dag])

        # Recreated part of the scheduler here, to kick off tasks -> executor
        for ti_key in queue:
            task = dag.get_task(ti_key[1])
            ti = models.TaskInstance(task, ti_key[2])
            # Task starts out in the scheduled state. All tasks in the
            # scheduled state will be sent to the executor
            ti.state = State.SCHEDULED

            # Also save this task instance to the DB.
            session.merge(ti)
            session.commit()

        scheduler._execute_task_instances(dagbag,
                                          (State.SCHEDULED,
                                           State.UP_FOR_RETRY))

        self.assertEquals(len(scheduler.executor.queued_tasks), 1)
Example #18
    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        value = False
        dag = DAG('shortcircuit_operator_test_without_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise Exception('A task instance for "upstream" should not exist')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise Exception('Unexpected task id: {}'.format(ti.task_id))

        value = True
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise Exception('A task instance for "upstream" should not exist')
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise Exception('Unexpected task id: {}'.format(ti.task_id))

        session.close()
Example #19
def dag(mocker):
    clear_session()
    configuration.load_test_config()
    dag = DAG(
        "test_dag",
        default_args=dict(owner="airflow", start_date=DEFAULT_DATE),
        schedule_interval=INTERVAL,
    )
    yield dag
    dag.clear()
    clear_session()
Example #20
    def test_dag_get_active_runs(self):
        """
        Test to check that a DAG returns its active runs
        """

        now = datetime.datetime.now()
        six_hours_ago_to_the_hour = (now - datetime.timedelta(hours=6)).replace(minute=0, second=0, microsecond=0)

        START_DATE = six_hours_ago_to_the_hour
        DAG_NAME1 = 'get_active_runs_test'

        default_args = {
            'owner': 'airflow',
            'depends_on_past': False,
            'start_date': START_DATE

        }
        dag1 = DAG(DAG_NAME1,
                   schedule_interval='* * * * *',
                   max_active_runs=1,
                   default_args=default_args
                   )

        run_this_1 = DummyOperator(task_id='run_this_1', dag=dag1)
        run_this_2 = DummyOperator(task_id='run_this_2', dag=dag1)
        run_this_2.set_upstream(run_this_1)
        run_this_3 = DummyOperator(task_id='run_this_3', dag=dag1)
        run_this_3.set_upstream(run_this_2)

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag1.dag_id)
        session.merge(orm_dag)
        session.commit()
        session.close()

        scheduler = SchedulerJob()
        dag1.clear()

        dr = scheduler.create_dag_run(dag1)

        # We had better get a dag run
        self.assertIsNotNone(dr)

        execution_date = dr.execution_date

        running_dates = dag1.get_active_runs()

        try:
            running_date = running_dates[0]
        except IndexError:
            running_date = 'Except'

        self.assertEqual(execution_date, running_date, 'Running Date must match Execution Date')
Example #21
 def setUp(self):
     from airflow.www_rbac.views import dagbag
     from airflow.utils.state import State
     dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
     dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
     self.runs = []
     for rd in self.RUNS_DATA:
         run = dag.create_dagrun(
             run_id=rd[0],
             execution_date=rd[1],
             state=State.SUCCESS,
             external_trigger=True
         )
         self.runs.append(run)
Example #22
 def setUp(self):
     configuration.load_test_config()
     from airflow.contrib.hooks.ssh_hook import SSHHook
     hook = SSHHook(ssh_conn_id='ssh_default')
     hook.no_host_key_check = True
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
Example #23
    def test_dagrun_success_conditions(self):
        session = settings.Session()

        dag = DAG(
            'test_dagrun_success_conditions',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> B
        # A -> C -> D
        # ordered: B, D, C, A or D, B, C, A or D, C, B, A
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='D')
            op1.set_upstream([op2, op3])
            op3.set_upstream(op4)

        dag.clear()

        now = datetime.datetime.now()
        dr = dag.create_dagrun(run_id='test_dagrun_success_conditions',
                               state=State.RUNNING,
                               execution_date=now,
                               start_date=now)

        # op1 = root
        ti_op1 = dr.get_task_instance(task_id=op1.task_id)
        ti_op1.set_state(state=State.SUCCESS, session=session)

        ti_op2 = dr.get_task_instance(task_id=op2.task_id)
        ti_op3 = dr.get_task_instance(task_id=op3.task_id)
        ti_op4 = dr.get_task_instance(task_id=op4.task_id)

        # root is successful, but unfinished tasks
        state = dr.update_state()
        self.assertEqual(State.RUNNING, state)

        # one has failed, but root is successful
        ti_op2.set_state(state=State.FAILED, session=session)
        ti_op3.set_state(state=State.SUCCESS, session=session)
        ti_op4.set_state(state=State.SUCCESS, session=session)
        state = dr.update_state()
        self.assertEqual(State.SUCCESS, state)

        # upstream dependency failed, root has not run
        ti_op1.set_state(State.NONE, session)
        state = dr.update_state()
        self.assertEqual(State.FAILED, state)
Example #24
    def test_scheduler_reschedule(self):
        """
        Checks if tasks that are not taken up by the executor
        get rescheduled
        """
        executor = TestExecutor()

        dagbag = DagBag(executor=executor)
        dagbag.dags.clear()
        dagbag.executor = executor

        dag = DAG(
            dag_id='test_scheduler_reschedule',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        dag.clear()
        dag.is_subdag = False

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

        @mock.patch('airflow.models.DagBag', return_value=dagbag)
        @mock.patch('airflow.models.DagBag.collect_dags')
        def do_schedule(function, function2):
            # Use an empty file since the above mock will return the
            # expected DAGs. Also specify only a single file so that it doesn't
            # try to schedule the above DAG repeatedly.
            scheduler = SchedulerJob(num_runs=1,
                                     executor=executor,
                                     subdir=os.path.join(models.DAGS_FOLDER,
                                                         "no_dags.py"))
            scheduler.heartrate = 0
            scheduler.run()

        do_schedule()
        self.assertEquals(1, len(executor.queued_tasks))
        executor.queued_tasks.clear()

        do_schedule()
        self.assertEquals(2, len(executor.queued_tasks))
Example #25
    def test_scheduler_dagrun_once(self):
        """
        Test if the scheduler does not create multiple dagruns
        if a dag is scheduled with @once and a start_date
        """
        dag = DAG(
            'test_scheduler_dagrun_once',
            start_date=datetime.datetime(2015, 1, 1),
            schedule_interval="@once")

        scheduler = SchedulerJob()
        dag.clear()
        dr = scheduler.create_dag_run(dag)
        self.assertIsNotNone(dr)
        dr = scheduler.create_dag_run(dag)
        self.assertIsNone(dr)
Example #26
    def test_dagstats_crud(self):
        DagStat.create(dag_id='test_dagstats_crud')

        session = settings.Session()
        qry = session.query(DagStat).filter(DagStat.dag_id == 'test_dagstats_crud')
        self.assertEqual(len(qry.all()), len(State.dag_states))

        DagStat.set_dirty(dag_id='test_dagstats_crud')
        res = qry.all()

        for stat in res:
            self.assertTrue(stat.dirty)

        # create missing
        DagStat.set_dirty(dag_id='test_dagstats_crud_2')
        qry2 = session.query(DagStat).filter(DagStat.dag_id == 'test_dagstats_crud_2')
        self.assertEqual(len(qry2.all()), len(State.dag_states))

        dag = DAG(
            'test_dagstats_crud',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        with dag:
            op1 = DummyOperator(task_id='A')

        now = datetime.datetime.now()
        dr = dag.create_dagrun(
            run_id='manual__' + now.isoformat(),
            execution_date=now,
            start_date=now,
            state=State.FAILED,
            external_trigger=False,
        )

        DagStat.update(dag_ids=['test_dagstats_crud'])
        res = qry.all()
        for stat in res:
            if stat.state == State.FAILED:
                self.assertEqual(stat.count, 1)
            else:
                self.assertEqual(stat.count, 0)

        DagStat.update()
        res = qry2.all()
        for stat in res:
            self.assertFalse(stat.dirty)
Example #27
    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
Example #28
 def setUp(self):
     configuration.load_test_config()
     app = application.create_app(testing=True)
     app.config['WTF_CSRF_METHODS'] = []
     self.app = app.test_client()
     self.session = Session()
     from airflow.www.views import dagbag
     from airflow.utils.state import State
     dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
     dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
     self.runs = []
     for rd in self.RUNS_DATA:
         run = dag.create_dagrun(
             run_id=rd[0],
             execution_date=rd[1],
             state=State.SUCCESS,
             external_trigger=True
         )
         self.runs.append(run)
Example #29
    def test_scheduler_reschedule(self):
        """
        Checks if tasks that are not taken up by the executor
        get rescheduled
        """
        executor = TestExecutor()

        dagbag = DagBag(executor=executor)
        dagbag.dags.clear()
        dagbag.executor = executor

        dag = DAG(
            dag_id='test_scheduler_reschedule',
            start_date=DEFAULT_DATE)
        dag_task1 = DummyOperator(
            task_id='dummy',
            dag=dag,
            owner='airflow')

        dag.clear()
        dag.is_subdag = False

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

        @mock.patch('airflow.models.DagBag', return_value=dagbag)
        @mock.patch('airflow.models.DagBag.collect_dags')
        def do_schedule(function, function2):
            scheduler = SchedulerJob(num_runs=1, executor=executor,)
            scheduler.heartrate = 0
            scheduler.run()

        do_schedule()
        self.assertEquals(1, len(executor.queued_tasks))
        executor.queued_tasks.clear()

        do_schedule()
        self.assertEquals(2, len(executor.queued_tasks))
Example #30
    def test_get_num_task_instances(self):
        test_dag_id = 'test_get_num_task_instances_dag'
        test_task_id = 'task_1'

        test_dag = DAG(dag_id=test_dag_id, start_date=DEFAULT_DATE)
        test_task = DummyOperator(task_id=test_task_id, dag=test_dag)

        ti1 = TI(task=test_task, execution_date=DEFAULT_DATE)
        ti1.state = None
        ti2 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=1))
        ti2.state = State.RUNNING
        ti3 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=2))
        ti3.state = State.QUEUED
        ti4 = TI(task=test_task, execution_date=DEFAULT_DATE + datetime.timedelta(days=3))
        ti4.state = State.RUNNING
        session = settings.Session()
        session.merge(ti1)
        session.merge(ti2)
        session.merge(ti3)
        session.merge(ti4)
        session.commit()

        self.assertEqual(
            0,
            DAG.get_num_task_instances(test_dag_id, ['fakename'], session=session)
        )
        self.assertEqual(
            4,
            DAG.get_num_task_instances(test_dag_id, [test_task_id], session=session)
        )
        self.assertEqual(
            4,
            DAG.get_num_task_instances(
                test_dag_id, ['fakename', test_task_id], session=session)
        )
        self.assertEqual(
            1,
            DAG.get_num_task_instances(
                test_dag_id, [test_task_id], states=[None], session=session)
        )
        self.assertEqual(
            2,
            DAG.get_num_task_instances(
                test_dag_id, [test_task_id], states=[State.RUNNING], session=session)
        )
        self.assertEqual(
            3,
            DAG.get_num_task_instances(
                test_dag_id, [test_task_id],
                states=[None, State.RUNNING], session=session)
        )
        self.assertEqual(
            4,
            DAG.get_num_task_instances(
                test_dag_id, [test_task_id],
                states=[None, State.QUEUED, State.RUNNING], session=session)
        )
        session.close()
Example #31
def generate_dag_run():
    return [DagRunOrder(payload={'timeout': i}) for i in range(10)]


def after_dags_handler():
    print("All target DAGs are finished")


args = {
    'start_date': days_ago(1),
    'owner': 'airflow',
}

dag = DAG(
    dag_id='trigger_with_multi_dagrun_sensor',
    max_active_runs=1,
    schedule_interval='@hourly',
    default_args=args,
)

gen_target_dag_run = TriggerMultiDagRunOperator(
    task_id='gen_target_dag_run',
    dag=dag,
    trigger_dag_id='common_target',
    python_callable=generate_dag_run,
)

# Wait until there is no running instance of target DAG
wait_target_dag = MultiDagRunSensor(task_id='wait_target_dag', dag=dag)
wait_target_dag.set_upstream(gen_target_dag_run)

after_dags_handler_op = PythonOperator(task_id='after_dags_handler',
                                       python_callable=after_dags_handler,
                                       dag=dag)
Example #32
 def _get_task_instance(self, state):
     dag = DAG('test_dag')
     task = Mock(dag=dag)
     ti = TaskInstance(task=task, state=state, execution_date=None)
     return ti
Example #33
        def nested_subdags():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'master'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

            # master:
            #     A -> opSubdag_0
            #          master.opSubdag_0:
            #              -> opSubDag_A
            #                 master.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 master.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          master.opSubdag_1:
            #              -> opSubdag_C
            #                 master.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task
            #              -> opSubDag_D
            #                 master.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:

                def subdag_A():
                    subdag_A = DAG('master.opSubdag_0.opSubdag_A',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG('master.opSubdag_0.opSubdag_B',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG('master.opSubdag_1.opSubdag_C',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_C.task', dag=subdag_C)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG('master.opSubdag_1.opSubdag_D',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('master.opSubdag_0',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A',
                                   dag=subdag_0,
                                   subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B',
                                   dag=subdag_0,
                                   subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('master.opSubdag_1',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C',
                                   dag=subdag_1,
                                   subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D',
                                   dag=subdag_1,
                                   subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(task_id='opSubdag_0',
                                            dag=dag,
                                            subdag=subdag_0())
                opSubdag_1 = SubDagOperator(task_id='opSubdag_1',
                                            dag=dag,
                                            subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Example #34
 def subdag_C():
     subdag_C = DAG('master.opSubdag_1.opSubdag_C',
                    default_args=DEFAULT_ARGS)
     DummyOperator(task_id='subdag_C.task', dag=subdag_C)
     return subdag_C
Example #35
 def subdag_B():
     subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B',
                    default_args=DEFAULT_ARGS)
     DummyOperator(task_id='subdag_B.task', dag=subdag_B)
     return subdag_B
Example #36
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.operators.subdag_operator import SubDagOperator

DEFAULT_DATE = datetime(2016, 1, 1)

default_args = {
    'owner': 'airflow',
    'start_date': DEFAULT_DATE,
    'run_as_user': '******'
}

dag = DAG(dag_id='impersonation_subdag', default_args=default_args)


def print_today():
    print('Today is {}'.format(datetime.utcnow()))


subdag = DAG('impersonation_subdag.test_subdag_operation',
             default_args=default_args)

PythonOperator(python_callable=print_today,
               task_id='exec_python_fn',
               dag=subdag)

BashOperator(task_id='exec_bash_operator',
             bash_command='echo "Running within SubDag"',
             dag=subdag)
Example #37
import pathlib
import posixpath

import airflow
import requests
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

from airflow_breakfast.utils.slack import send_slack_message

args = {
    "owner": "godatadriven",
    "start_date": airflow.utils.dates.days_ago(10)
}

dag = DAG(
    dag_id="2_rockets",
    default_args=args,
    description="DAG downloading rocket launches from Launch Library.",
    # e.g. https://launchlibrary.net/1.4/launch?startdate=2019-04-10&enddate=2019-04-21
    schedule_interval="0 0 * * *",
)


def _download_rocket_launches(ds, next_ds, **_):
    query = f"https://launchlibrary.net/1.4/launch?startdate={ds}&enddate={next_ds}"
    result_path = f"/data/rocket_launches/ds={ds}"
    pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)
    result_file = posixpath.join(result_path, "launches.json")

    response = requests.get(query)
    with open(result_file, "w") as f:
        f.write(response.text)
        print(f"Wrote result to file {result_file}")
Example #38
import yaml
import glob
from datetime import datetime

from airflow.models import DAG
from airflow.operators.postgres_operator import PostgresOperator

YAML_DIR = '/usr/local/airflow/dags'

default_args = {'start_date': datetime(2019, 1, 1)}

dag = DAG(dag_id='example_yaml', default_args=default_args)

with dag:
    for filename in glob.glob(YAML_DIR + '/*.yaml'):
        with open(filename, 'r') as stream:
            yaml_data = yaml.safe_load(stream)

            incremental_task = PostgresOperator(
                task_id=yaml_data['task_id'],
                sql=yaml_data['sql'],
            )
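Each YAML file is expected to carry a task_id and a sql entry; a hypothetical example of the file contents this loop consumes:

import yaml

# Hypothetical contents of one YAML file in YAML_DIR (illustrative only):
example = """
task_id: load_daily_sales
sql: INSERT INTO sales_agg SELECT * FROM sales WHERE ds = '{{ ds }}'
"""
data = yaml.safe_load(example)
assert data['task_id'] == 'load_daily_sales'
assert data['sql'].startswith('INSERT INTO')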

Example #39
class TestSqlBranch(TestHiveEnvironment, unittest.TestCase):
    """
    Test for SQL Branch Operator
    """
    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def setUp(self):
        super().setUp()
        self.dag = DAG(
            "sql_branch_operator_test",
            default_args={
                "owner": "airflow",
                "start_date": DEFAULT_DATE
            },
            schedule_interval=INTERVAL,
        )
        self.branch_1 = DummyOperator(task_id="branch_1", dag=self.dag)
        self.branch_2 = DummyOperator(task_id="branch_2", dag=self.dag)
        self.branch_3 = None

    def tearDown(self):
        super().tearDown()

        with create_session() as session:
            session.query(DagRun).delete()
            session.query(TI).delete()

    def test_unsupported_conn_type(self):
        """ Check if BranchSqlOperator throws an exception for unsupported connection type """
        op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="redis_default",
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        with self.assertRaises(AirflowException):
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

    def test_invalid_conn(self):
        """ Check if BranchSqlOperator throws an exception for invalid connection """
        op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="invalid_connection",
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        with self.assertRaises(AirflowException):
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

    def test_invalid_follow_task_true(self):
        """ Check if BranchSqlOperator throws an exception for invalid connection """
        op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="invalid_connection",
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            follow_task_ids_if_true=None,
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        with self.assertRaises(AirflowException):
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

    def test_invalid_follow_task_false(self):
        """ Check if BranchSqlOperator throws an exception for invalid connection """
        op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="invalid_connection",
            sql="SELECT count(1) FROM INFORMATION_SCHEMA.TABLES",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false=None,
            dag=self.dag,
        )

        with self.assertRaises(AirflowException):
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

    @pytest.mark.backend("mysql")
    def test_sql_branch_operator_mysql(self):
        """ Check if BranchSqlOperator works with backend """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )
        branch_op.run(start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      ignore_ti_state=True)

    @pytest.mark.backend("postgres")
    def test_sql_branch_operator_postgres(self):
        """ Check if BranchSqlOperator works with backend """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="postgres_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )
        branch_op.run(start_date=DEFAULT_DATE,
                      end_date=DEFAULT_DATE,
                      ignore_ti_state=True)

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_branch_single_value_with_dag_run(self, mock_hook):
        """ Check BranchSqlOperator branch operation """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        mock_get_records.return_value = 1

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == "make_choice":
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == "branch_1":
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == "branch_2":
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f"Invalid task id {ti.task_id} found!")

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_branch_true_with_dag_run(self, mock_hook):
        """ Check BranchSqlOperator branch operation """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        for true_value in SUPPORTED_TRUE_VALUES:
            mock_get_records.return_value = true_value

            branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

            tis = dr.get_task_instances()
            for ti in tis:
                if ti.task_id == "make_choice":
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == "branch_1":
                    self.assertEqual(ti.state, State.NONE)
                elif ti.task_id == "branch_2":
                    self.assertEqual(ti.state, State.SKIPPED)
                else:
                    raise ValueError(f"Invalid task id {ti.task_id} found!")

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_branch_false_with_dag_run(self, mock_hook):
        """ Check BranchSqlOperator branch operation """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        for false_value in SUPPORTED_FALSE_VALUES:
            mock_get_records.return_value = false_value

            branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

            tis = dr.get_task_instances()
            for ti in tis:
                if ti.task_id == "make_choice":
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == "branch_1":
                    self.assertEqual(ti.state, State.SKIPPED)
                elif ti.task_id == "branch_2":
                    self.assertEqual(ti.state, State.NONE)
                else:
                    raise ValueError(f"Invalid task id {ti.task_id} found!")

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_branch_list_with_dag_run(self, mock_hook):
        """ Checks if the BranchSqlOperator supports branching off to a list of tasks."""
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true=["branch_1", "branch_2"],
            follow_task_ids_if_false="branch_3",
            dag=self.dag,
        )

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.branch_3 = DummyOperator(task_id="branch_3", dag=self.dag)
        self.branch_3.set_upstream(branch_op)
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)
        mock_get_records.return_value = [["1"]]

        branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == "make_choice":
                self.assertEqual(ti.state, State.SUCCESS)
            elif ti.task_id == "branch_1":
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == "branch_2":
                self.assertEqual(ti.state, State.NONE)
            elif ti.task_id == "branch_3":
                self.assertEqual(ti.state, State.SKIPPED)
            else:
                raise ValueError(f"Invalid task id {ti.task_id} found!")

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_invalid_query_result_with_dag_run(self, mock_hook):
        """ Check BranchSqlOperator branch operation """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        self.branch_1.set_upstream(branch_op)
        self.branch_2.set_upstream(branch_op)
        self.dag.clear()

        self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        mock_get_records.return_value = ["Invalid Value"]

        with self.assertRaises(AirflowException):
            branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_with_skip_in_branch_downstream_dependencies(self, mock_hook):
        """ Test SQL Branch with skipping all downstream dependencies """
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        branch_op >> self.branch_1 >> self.branch_2
        branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        for true_value in SUPPORTED_TRUE_VALUES:
            mock_get_records.return_value = [true_value]

            branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

            tis = dr.get_task_instances()
            for ti in tis:
                if ti.task_id == "make_choice":
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == "branch_1":
                    self.assertEqual(ti.state, State.NONE)
                elif ti.task_id == "branch_2":
                    self.assertEqual(ti.state, State.NONE)
                else:
                    raise ValueError(f"Invalid task id {ti.task_id} found!")

    @mock.patch("airflow.operators.sql_branch_operator.BaseHook")
    def test_with_skip_in_branch_downstream_dependencies2(self, mock_hook):
        """ Test skipping downstream dependency for false condition"""
        branch_op = BranchSqlOperator(
            task_id="make_choice",
            conn_id="mysql_default",
            sql="SELECT 1",
            follow_task_ids_if_true="branch_1",
            follow_task_ids_if_false="branch_2",
            dag=self.dag,
        )

        branch_op >> self.branch_1 >> self.branch_2
        branch_op >> self.branch_2
        self.dag.clear()

        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        mock_hook.get_connection("mysql_default").conn_type = "mysql"
        mock_get_records = (mock_hook.get_connection.return_value.get_hook.
                            return_value.get_first)

        for false_value in SUPPORTED_FALSE_VALUES:
            mock_get_records.return_value = [false_value]

            branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

            tis = dr.get_task_instances()
            for ti in tis:
                if ti.task_id == "make_choice":
                    self.assertEqual(ti.state, State.SUCCESS)
                elif ti.task_id == "branch_1":
                    self.assertEqual(ti.state, State.SKIPPED)
                elif ti.task_id == "branch_2":
                    self.assertEqual(ti.state, State.NONE)
                else:
                    raise ValueError(f"Invalid task id {ti.task_id} found!")
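# SUPPORTED_TRUE_VALUES / SUPPORTED_FALSE_VALUES are module-level constants
# defined outside this snippet; a plausible definition (an assumption that
# mirrors the truthy/falsy first-column values a SQL query may return) is:
SUPPORTED_TRUE_VALUES = [True, 1, "1", "true", "True", "on"]
SUPPORTED_FALSE_VALUES = [False, 0, "0", "false", "False", "off"]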
Ejemplo n.º 40
0
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

args = {
    'owner': 'shitao',
    #'start_date': airflow.utils.dates.days_ago(2),
    'start_date': datetime(2018, 1, 1),
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
}

dag = DAG(
    dag_id='shopping_cart',
    default_args=args,
    schedule_interval='0 5 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

daily = SSHOperator(
    ssh_conn_id='ws@hdp-0',
    task_id='daily',
    command=
    'cd /usr/local/bigdata/jobtaskh0/pythonjob/pyspark_template/ && spark-submit \
                --num-executors 4 \
                --executor-memory 4G \
                --executor-cores 4 \
                --driver-memory 4G \
                --driver-cores 4 \
                --jars /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
                --driver-class-path /usr/hdp/3.0.1.0-187/spark2/jars/mysql-connector-java-5.1.47.jar \
Ejemplo n.º 41
0
from datetime import timedelta

import airflow
from airflow.models import DAG
from airflow.contrib.operators.ssh_operator import SSHOperator

args = {
    'owner': 'mayx',
    'start_date': airflow.utils.dates.days_ago(2),
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
}

# day-type task: uncomment or comment out the relevant blocks below according to the task type
dag = DAG(
    dag_id='device_total_day',
    default_args=args,
    schedule_interval='30 5 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

# # week-type task
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
#     default_args=args,
#     schedule_interval='50 6 * * 1',
#     dagrun_timeout=timedelta(minutes=60),
# )
#
#
# # month-type task (the dag_id needs to be changed)
# dag = DAG(
#     dag_id='airflow_pyspark_template_week',
Ejemplo n.º 42
0
from datetime import datetime
from time import sleep

from airflow.models import DAG
from airflow.operators.python import PythonOperator

DEFAULT_DATE = datetime(2016, 1, 1)

args = {
    'owner': 'airflow',
    'start_date': DEFAULT_DATE,
}


dag = DAG(dag_id='test_mark_success', default_args=args)
task = PythonOperator(
    task_id='task1',
    python_callable=lambda x: sleep(x),  # pylint: disable=W0108
    op_args=[600],
    dag=dag)
Ejemplo n.º 43
0
dag_id = 'salesforce_recommendation_reason'

# independent_reasons = {
#     'hot_location_longterm': hot_location_longterm,
#     'hot_location_occupancy': hot_location_occupancy,
#     'hot_location_shortterm': hot_location_shortterm,
# }


"""
Create a DAG to execute tasks
"""
dag = DAG(
    dag_id=dag_id,
    default_args=args,
    schedule_interval=None,
)

main_op = DummyOperator(
    task_id='main_entrance',
    dag=dag,
)

generate_pair_op = PythonOperator(
    task_id='generate_pairs',
    python_callable=generate_pairs,
    dag=dag,
)

merging_op = PythonOperator(
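    # hypothetical completion: the original is truncated mid-call; the
    # task_id and callable below are placeholders, not the author's code
    task_id='merging',
    python_callable=merge_pairs,  # placeholder for the author's callable
    dag=dag,
)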
Ejemplo n.º 44
0
"""
@stakeholders: People who learn
"""
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.models import DAG
import datetime
import logging


def say_hello(**context):
    """
    Puts an example string into the task log.
    :param context:
    :return:
    """
    logging.info(f'Everything Works! {datetime.datetime.now()}')


dag = DAG(dag_id='hello_world',
          schedule_interval=None,
          start_date=datetime.datetime(2020, 1, 1),
          default_args={"owner": "airflow_lesson"})

start = DummyOperator(task_id='start_dag', dag=dag)

hello = PythonOperator(task_id='say_hello', python_callable=say_hello, dag=dag)

end = DummyOperator(task_id='end_dag', dag=dag)

start >> hello >> end
Ejemplo n.º 45
0
 def setUp(self):
     dag = DAG('dag_for_testing_filename_rendering',
               start_date=DEFAULT_DATE)
     task = DummyOperator(task_id='task_for_testing_filename_rendering',
                          dag=dag)
     self.ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
Ejemplo n.º 46
0
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.models import DAG

args = {
    'owner': 'Freddy Drennan',
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True
}

dag = DAG(dag_id='update_ip',
          default_args=args,
          schedule_interval='@hourly',
          concurrency=1,
          max_active_runs=1,
          catchup=False)


task_1 = BashOperator(
    task_id='update_ip',
    bash_command='. /home/scripts/R/shell/update_ip',
    dag=dag
)


Ejemplo n.º 47
0
# assemble env vars
env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True)
atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True)
env_vars["KNACK_APP_ID"] = atd_knack_auth[app_name][env]["app_id"]
env_vars["KNACK_API_KEY"] = atd_knack_auth[app_name][env]["api_key"]
env_vars["SOCRATA_API_KEY_ID"] = Variable.get("atd_service_bot_socrata_api_key_id")
env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get(
    "atd_service_bot_socrata_api_key_secret"
)
env_vars["SOCRATA_APP_TOKEN"] = Variable.get("atd_service_bot_socrata_app_token")

with DAG(
    dag_id="atd_knack_mmc_activities_to_s3_to_socrata",
    default_args=default_args,
    schedule_interval="20 6 * * *",
    dagrun_timeout=timedelta(minutes=300),
    tags=["production", "knack"],
    catchup=False,
) as dag:

    date = "{{ prev_execution_date_success or '1970-01-01' }}"

    t1 = DockerOperator(
        task_id="atd_knack_mmc_activities_to_postgrest",
        image=docker_image,
        api_version="auto",
        auto_remove=True,
        command=f'./atd-knack-services/services/{task_1_script}.py -a {app_name} -c {container} -d "{date}"',  # noqa
        docker_url="tcp://localhost:2376",
        network_mode="bridge",
        environment=env_vars,
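        # the original snippet is truncated here; closing the call is an
        # assumed minimal continuation, not the author's code
    )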
Ejemplo n.º 48
0
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

import dag_functions  # project-local module that provides the callables used below

args = {
    'owner': 'noah',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(dag_id='mock_loan_predicting',
          default_args=args,
          schedule_interval="@once")

# Tasks
load_unseen_data = PythonOperator(
    task_id='load_raw_unseen_prediction_data',
    provide_context=True,
    python_callable=dag_functions.load_unseen_prediction_data_from_db,
    dag=dag)

wrangle_unseen_data = PythonOperator(
    task_id='wrangle_unseen_data',
    provide_context=True,
    python_callable=dag_functions.wrangle_unseen_data,
    dag=dag)
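# The snippet ends before any dependencies are declared; chaining the two
# tasks in order is an assumed continuation, not shown in the original:
load_unseen_data >> wrangle_unseen_data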
Ejemplo n.º 49
0
 def subdag_D():
     subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D',
                    default_args=DEFAULT_ARGS)
     DummyOperator(task_id='subdag_D.task', dag=subdag_D)
     return subdag_D
Ejemplo n.º 50
0
from pprint import pprint

import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

from scr.quality.prod import ppt_email

args = {
    'owner': 'lishulong',
    'start_date': airflow.utils.dates.days_ago(2),
    'depends_on_past': False,
}


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    ppt_email()
    return 'Whatever you return gets printed in the logs'


dag = DAG(dag_id='python_quality',
          default_args=args,
          schedule_interval='0 0 * * *')

s2 = PythonOperator(task_id='prod_email',
                    provide_context=True,
                    python_callable=print_context,
                    dag=dag)
Ejemplo n.º 51
0
        def nested_subdag_cycle():
            from airflow.models import DAG
            from airflow.operators.dummy_operator import DummyOperator
            from airflow.operators.subdag_operator import SubDagOperator
            import datetime
            DAG_NAME = 'nested_cycle'
            DEFAULT_ARGS = {
                'owner': 'owner1',
                'start_date': datetime.datetime(2016, 1, 1)
            }
            dag = DAG(DAG_NAME, default_args=DEFAULT_ARGS)

            # cycle:
            #     A -> opSubdag_0
            #          cycle.opSubdag_0:
            #              -> opSubDag_A
            #                 cycle.opSubdag_0.opSubdag_A:
            #                     -> subdag_A.task
            #              -> opSubdag_B
            #                 cycle.opSubdag_0.opSubdag_B:
            #                     -> subdag_B.task
            #     A -> opSubdag_1
            #          cycle.opSubdag_1:
            #              -> opSubdag_C
            #                 cycle.opSubdag_1.opSubdag_C:
            #                     -> subdag_C.task -> subdag_C.task  >Invalid Loop<
            #              -> opSubDag_D
            #                 cycle.opSubdag_1.opSubdag_D:
            #                     -> subdag_D.task

            with dag:

                def subdag_A():
                    subdag_A = DAG('nested_cycle.opSubdag_0.opSubdag_A',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_A.task', dag=subdag_A)
                    return subdag_A

                def subdag_B():
                    subdag_B = DAG('nested_cycle.opSubdag_0.opSubdag_B',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_B.task', dag=subdag_B)
                    return subdag_B

                def subdag_C():
                    subdag_C = DAG('nested_cycle.opSubdag_1.opSubdag_C',
                                   default_args=DEFAULT_ARGS)
                    opSubdag_C_task = DummyOperator(task_id='subdag_C.task',
                                                    dag=subdag_C)
                    # introduce a loop in opSubdag_C
                    opSubdag_C_task.set_downstream(opSubdag_C_task)
                    return subdag_C

                def subdag_D():
                    subdag_D = DAG('nested_cycle.opSubdag_1.opSubdag_D',
                                   default_args=DEFAULT_ARGS)
                    DummyOperator(task_id='subdag_D.task', dag=subdag_D)
                    return subdag_D

                def subdag_0():
                    subdag_0 = DAG('nested_cycle.opSubdag_0',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_A',
                                   dag=subdag_0,
                                   subdag=subdag_A())
                    SubDagOperator(task_id='opSubdag_B',
                                   dag=subdag_0,
                                   subdag=subdag_B())
                    return subdag_0

                def subdag_1():
                    subdag_1 = DAG('nested_cycle.opSubdag_1',
                                   default_args=DEFAULT_ARGS)
                    SubDagOperator(task_id='opSubdag_C',
                                   dag=subdag_1,
                                   subdag=subdag_C())
                    SubDagOperator(task_id='opSubdag_D',
                                   dag=subdag_1,
                                   subdag=subdag_D())
                    return subdag_1

                opSubdag_0 = SubDagOperator(task_id='opSubdag_0',
                                            dag=dag,
                                            subdag=subdag_0())
                opSubdag_1 = SubDagOperator(task_id='opSubdag_1',
                                            dag=dag,
                                            subdag=subdag_1())

                opA = DummyOperator(task_id='A')
                opA.set_downstream(opSubdag_0)
                opA.set_downstream(opSubdag_1)

            return dag
Ejemplo n.º 52
0
ALERT_EMAIL_ADDRESSES = [
]  # List of email addresses to send alerts to if this job fails
ENABLE_DELETE = True  # Whether the job should delete the logs; set to False to temporarily keep them

default_args = {
    'owner': DAG_OWNER_NAME,
    'email': ALERT_EMAIL_ADDRESSES,
    'email_on_failure': False,
    'email_on_retry': False,
    'start_date': START_DATE,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(DAG_ID,
          default_args=default_args,
          schedule_interval=SCHEDULE_INTERVAL,
          start_date=START_DATE)
if hasattr(dag, 'doc_md'):
    dag.doc_md = __doc__
if hasattr(dag, 'catchup'):
    dag.catchup = False


def clear_missing_dags_fn(**context):

    logging.info("Starting to run Clear Process")

    try:
        host_name = socket.gethostname()
        host_ip = socket.gethostbyname(host_name)
        logging.info("Running on Machine with Host Name: " + host_name)
Ejemplo n.º 53
0
 def subdag_A():
     subdag_A = DAG('master.opSubdag_0.opSubdag_A',
                    default_args=DEFAULT_ARGS)
     DummyOperator(task_id='subdag_A.task', dag=subdag_A)
     return subdag_A
Ejemplo n.º 54
0
from builtins import range
from datetime import timedelta

import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='example_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# [START howto_operator_bash]
run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag,
)
# [END howto_operator_bash]
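# The snippet stops at the marker above; the stock example_bash_operator DAG
# continues with a loop that uses the `range` import, roughly as follows
# (reconstructed from the upstream example, so treat it as an approximation):
for i in range(3):
    task = BashOperator(
        task_id='runme_' + str(i),
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag,
    )
    task >> run_this
run_this >> run_this_last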
Ejemplo n.º 55
0
from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'ivanfdz',
    'start_date': datetime(2020, 5, 20, 11, 0, 0)
}


def hello_world_loop():
    for palabra in ['hello', 'world']:
        print(palabra)


with DAG('dag_prueba', default_args=default_args,
         schedule_interval='@daily') as dag:
    start = DummyOperator(task_id='start')

    prueba_python = PythonOperator(task_id='prueba_python',
                                   python_callable=hello_world_loop)

    prueba_bash = BashOperator(task_id='prueba_bash',
                               bash_command='echo prueba_bash')

start >> prueba_python >> prueba_bash
Ejemplo n.º 56
0
import pprint
from datetime import datetime

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator

pp = pprint.PrettyPrinter(indent=4)

args = {
    # TODO use current date without harming dag functionality
    'start_date': datetime(2019, 6, 15),
    'owner': 'airflow',
}
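# On the TODO above: a dynamic start_date is discouraged in Airflow; with
# catchup=False (set below) a fixed date in the past is generally harmless,
# since only the most recent schedule interval is run.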

dag = DAG(dag_id='proxyscraper_dag',
          default_args=args,
          schedule_interval='*/15 * * * *',
          catchup=False)

collect_proxies = BashOperator(
    task_id='get_proxies',
    bash_command="cd /FIFA/fifa_data/ && python3 -m fifa_data.spiders.fate_proxy ",
    dag=dag
)
Ejemplo n.º 57
0
from airflow.models import DAG
from airflow.providers.apache.spark.operators.spark_sql import SparkSqlOperator
from airflow.providers.jdbc.operators.jdbc import JdbcOperator
from airflow.utils.dates import days_ago
from datetime import timedelta, datetime as dt

args = {
    'owner': 'Seshu Edala',
}

with DAG(
        dag_id='album_external_to_album',
        default_args=args,
        schedule_interval='*/30 * * * *',
        dagrun_timeout=timedelta(minutes=5),
        start_date=days_ago(1),
        tags=['album_external_to_album', 'load_data', 'aluminium'],
        catchup=False,
) as dag:
    '''
-- create control table
create database if not exists meta;
create external table if not exists meta.control 
  (data_file string, al_table string, process_time timestamp) 
  using delta location "s3a://spark/warehouse/control";

-- switch to correct database
use music;

-- drop previous table
drop view if exists global_temp.album_{__signature__};
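    '''  # the SQL docstring above is cut off in the original snippet

    # A sketch (an assumption, not the author's code) of wiring such SQL into
    # the imported SparkSqlOperator; task_id, sql and master are placeholders:
    load_album = SparkSqlOperator(
        task_id='load_album',
        sql='select * from music.album_external',
        master='local[*]',
    )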
Ejemplo n.º 58
0
 def _get_task(self, **kwargs):
     return BaseOperator(task_id='test_task', dag=DAG('test_dag'), **kwargs)
Ejemplo n.º 59
0
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from airflow.operators.sensors import ExternalTaskSensor

# https://towardsdatascience.com/dependencies-between-dags-in-apache-airflow-2f5935cde3f0

dag = DAG(
    dag_id='dependencia_tres',
    schedule_interval='@once',
    #owner: 'test',
    start_date=days_ago(0),
    catchup=False)


def print_success_message(**kwargs):
    print("Success!!")


def print_end_message(**kwargs):
    print("END")


externalsensor1 = ExternalTaskSensor(
    task_id='dependencia_dos_completed_Status',
    external_dag_id='dependencia_dos',
    external_task_id=None,
    check_existence=True,
    dag=dag)

success = PythonOperator(task_id='success',
                         python_callable=print_success_message,
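                         # the original snippet is truncated here; the close
                         # and wiring below are an assumed continuation
                         dag=dag)

externalsensor1 >> success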
Ejemplo n.º 60
0
"""_dags file for 'council districts' sde extraction."""
from airflow.models import DAG
from trident.util import general
from dags.sde.parks_jobs import sde_to_shp
from trident.util.sde_extract_tasks import create_sde_tasks

args = general.args
conf = general.config
schedule = general.schedule['gis_weekly']
start_date = general.start_date['gis_weekly']
folder = 'parks'
layer = 'parks'
datasd_name = 'parks_datasd'
md = 'park-locations'
path_to_file = conf['prod_data_dir'] + '/' + datasd_name

dag = DAG(dag_id='gis_{layer}'.format(layer=layer),
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Create tasks dynamically
create_sde_tasks(dag=dag,
                 folder=folder,
                 layer=layer,
                 datasd_name=datasd_name,
                 md=md,
                 path_to_file=path_to_file,
                 sde_to_shp=sde_to_shp)