Example No. 1
 def test_schedule_dag_fake_scheduled_previous(self):
     """
     Test scheduling a dag where there is a prior DagRun
     which has the same run_id as the next run should have
     """
     delta = timedelta(hours=1)
     dag = DAG(TEST_DAG_ID+'test_schedule_dag_fake_scheduled_previous',
             schedule_interval=delta,
             start_date=DEFAULT_DATE)
     dag.tasks = [models.BaseOperator(task_id="faketastic",
         owner='Also fake',
         start_date=DEFAULT_DATE)]
     scheduler = jobs.SchedulerJob(test_mode=True)
     trigger = models.DagRun(
                 dag_id=dag.dag_id,
                 run_id=models.DagRun.id_for_date(DEFAULT_DATE),
                 execution_date=DEFAULT_DATE,
                 state=utils.State.SUCCESS,
                 external_trigger=True)
     settings.Session().add(trigger)
     settings.Session().commit()
     dag_run = scheduler.schedule_dag(dag)
     assert dag_run is not None
     assert dag_run.dag_id == dag.dag_id
     assert dag_run.run_id is not None
     assert dag_run.run_id != ''
     assert dag_run.execution_date == DEFAULT_DATE+delta, (
             'dag_run.execution_date did not match expectation: {0}'
             .format(dag_run.execution_date))
     assert dag_run.state == models.State.RUNNING
     assert dag_run.external_trigger == False
Example No. 2
 def setUp(self):
     configuration.test_mode()
     utils.initdb()
     args = {'owner': 'airflow', 'start_date': datetime(2015, 1, 1)}
     dag = DAG(TEST_DAG_ID, default_args=args)
     dag.clear(start_date=DEFAULT_DATE, end_date=datetime.now())
     self.dag = dag
Example No. 3
class BranchOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )
        session.close()

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                # should exist with state None
                self.assertEquals(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

    def test_with_dag_run(self):
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1':
                self.assertEquals(ti.state, State.NONE)
            elif ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise
class BashOperatorTestCase(unittest.TestCase):
    def test_echo_env_variables(self):
        """
        Test that env variables are exported correctly to the
        task bash environment.
        """
        now = datetime.utcnow()
        now = now.replace(tzinfo=timezone.utc)

        self.dag = DAG(
            dag_id='bash_op_test', default_args={
                'owner': 'airflow',
                'retries': 100,
                'start_date': DEFAULT_DATE
            },
            schedule_interval='@daily',
            dagrun_timeout=timedelta(minutes=60))

        self.dag.create_dagrun(
            run_id='manual__' + DEFAULT_DATE.isoformat(),
            execution_date=DEFAULT_DATE,
            start_date=now,
            state=State.RUNNING,
            external_trigger=False,
        )

        import tempfile
        with tempfile.NamedTemporaryFile() as f:
            fname = f.name
            t = BashOperator(
                task_id='echo_env_vars',
                dag=self.dag,
                bash_command='echo $AIRFLOW_HOME>> {0};'
                             'echo $PYTHONPATH>> {0};'
                             'echo $AIRFLOW_CTX_DAG_ID >> {0};'
                             'echo $AIRFLOW_CTX_TASK_ID>> {0};'
                             'echo $AIRFLOW_CTX_EXECUTION_DATE>> {0};'
                             'echo $AIRFLOW_CTX_DAG_RUN_ID>> {0};'.format(fname)
            )

            original_AIRFLOW_HOME = os.environ['AIRFLOW_HOME']

            os.environ['AIRFLOW_HOME'] = 'MY_PATH_TO_AIRFLOW_HOME'
            t.run(DEFAULT_DATE, DEFAULT_DATE,
                  ignore_first_depends_on_past=True, ignore_ti_state=True)

            with open(fname, 'r') as fr:
                output = ''.join(fr.readlines())
                self.assertIn('MY_PATH_TO_AIRFLOW_HOME', output)
                # exported in run_unit_tests.sh as part of PYTHONPATH
                self.assertIn('tests/test_utils', output)
                self.assertIn('bash_op_test', output)
                self.assertIn('echo_env_vars', output)
                self.assertIn(DEFAULT_DATE.isoformat(), output)
                self.assertIn('manual__' + DEFAULT_DATE.isoformat(), output)

            os.environ['AIRFLOW_HOME'] = original_AIRFLOW_HOME
Example No. 5
    def setUp(self):
        configuration.load_test_config()
        from airflow.contrib.hooks.fs_hook import FSHook

        hook = FSHook()
        args = {"owner": "airflow", "start_date": DEFAULT_DATE, "provide_context": True}
        dag = DAG(TEST_DAG_ID + "test_schedule_dag_once", default_args=args)
        dag.schedule_interval = "@once"
        self.hook = hook
        self.dag = dag
Example No. 6
    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        value = False
        dag = DAG('shortcircuit_operator_test_without_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        value = True
        dag.clear()

        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise

        session.close()
Example No. 7
 def setUp(self):
     configuration.load_test_config()
     from airflow.contrib.hooks.fs_hook import FSHook
     hook = FSHook()
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
Example No. 8
 def setUp(self):
     configuration.test_mode()
     from airflow.contrib.hooks.ssh_hook import SSHHook
     hook = SSHHook()
     hook.no_host_key_check = True
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE,
         'provide_context': True
     }
     dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args)
     dag.schedule_interval = '@once'
     self.hook = hook
     self.dag = dag
Example No. 9
    def test_schedule_dag_once(self):
        """
        Tests scheduling a dag scheduled for @once - should be scheduled the first time
        it is called, and not scheduled the second.
        """
        dag = DAG(TEST_DAG_ID+'test_schedule_dag_once')
        dag.schedule_interval = '@once'
        dag.tasks = [models.BaseOperator(task_id="faketastic", owner='Also fake',
            start_date=datetime(2015, 1, 2, 0, 0))]
        dag_run = jobs.SchedulerJob(test_mode=True).schedule_dag(dag)
        dag_run2 = jobs.SchedulerJob(test_mode=True).schedule_dag(dag)

        assert dag_run is not None
        assert dag_run2 is None
Example No. 10
 def test_schedule_dag_no_previous_runs(self):
     """
     Tests scheduling a dag with no previous runs
     """
     dag = DAG(TEST_DAG_ID+'test_schedule_dag_no_previous_runs')
     dag.tasks = [models.BaseOperator(task_id="faketastic", owner='Also fake',
         start_date=datetime(2015, 1, 2, 0, 0))]
     dag_run = jobs.SchedulerJob(test_mode=True).schedule_dag(dag)
     assert dag_run is not None
     assert dag_run.dag_id == dag.dag_id
     assert dag_run.run_id is not None
     assert dag_run.run_id != ''
     assert dag_run.execution_date == datetime(2015, 1, 2, 0, 0), (
             'dag_run.execution_date did not match expectation: {0}'
             .format(dag_run.execution_date))
     assert dag_run.state == models.State.RUNNING
     assert dag_run.external_trigger == False
Example No. 11
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }
        self.dag = DAG(TEST_DAG_ID, default_args=args)

        session = settings.Session()
        session.query(DagRun).delete()
        session.query(TaskInstance).delete()
        session.commit()
    def setUp(self):

        if sys.version_info[0] == 3:
            raise unittest.SkipTest('SSHExecuteOperatorTest won\'t work with '
                                    'python3. No need to test anything here')

        configuration.load_test_config()
        from airflow.contrib.hooks.ssh_hook import SSHHook
        hook = mock.MagicMock(spec=SSHHook)
        hook.no_host_key_check = True
        hook.Popen.return_value.stdout = StringIO(u'stdout')
        hook.Popen.return_value.returncode = False
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True
        }
        dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args)
        dag.schedule_interval = '@once'
        self.hook = hook
        self.dag = dag
Example No. 13
    def test_with_dag_run(self):
        value = False
        dag = DAG('shortcircuit_operator_test_with_dag_run',
                  default_args={
                       'owner': 'airflow',
                       'start_date': DEFAULT_DATE
                  },
                  schedule_interval=INTERVAL)
        short_op = ShortCircuitOperator(task_id='make_choice',
                                        dag=dag,
                                        python_callable=lambda: value)
        branch_1 = DummyOperator(task_id='branch_1', dag=dag)
        branch_1.set_upstream(short_op)
        branch_2 = DummyOperator(task_id='branch_2', dag=dag)
        branch_2.set_upstream(branch_1)
        upstream = DummyOperator(task_id='upstream', dag=dag)
        upstream.set_downstream(short_op)
        dag.clear()

        logging.error("Tasks {}".format(dag.tasks))
        dr = dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        value = True
        dag.clear()
        dr.verify_integrity()
        upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise
 def setUp(self):
     super().setUp()
     configuration.load_test_config()
     self.dag = DAG(
         'test_dag',
         default_args={
             'owner': 'airflow',
             'start_date': DEFAULT_DATE},
         schedule_interval=INTERVAL)
     self.addCleanup(self.dag.clear)
     freezer = freeze_time(FROZEN_NOW)
     freezer.start()
     self.addCleanup(freezer.stop)
    def test_external_dag_sensor(self):

        other_dag = DAG(
            'other_dag',
            default_args=self.args,
            end_date=DEFAULT_DATE,
            schedule_interval='@once')
        other_dag.create_dagrun(
            run_id='test',
            start_date=DEFAULT_DATE,
            execution_date=DEFAULT_DATE,
            state=State.SUCCESS)
        t = ExternalTaskSensor(
            task_id='test_external_dag_sensor_check',
            external_dag_id='other_dag',
            external_task_id=None,
            dag=self.dag
        )
        t.run(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_ti_state=True
        )
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True
        }
        dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
        dag.schedule_interval = '@once'
        self.dag = dag

        self.sensor = gcs_sensor.GoogleCloudStorageUploadSessionCompleteSensor(
            task_id='sensor',
            bucket='test-bucket',
            prefix='test-prefix/path',
            inactivity_period=12,
            poke_interval=10,
            min_objects=1,
            allow_delete=False,
            previous_num_objects=0,
            dag=self.dag
        )
        self.last_mocked_date = datetime(2019, 4, 24, 0, 0, 0)
Example No. 17
    def setUp(self):
        self.dag = DAG('branch_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.branch_op = BranchPythonOperator(task_id='make_choice',
                                              dag=self.dag,
                                              python_callable=lambda: 'branch_1')

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.branch_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_op)
        self.dag.clear()
Example No. 18
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True
Example No. 19

def addition():
    logging.info(f"2 + 2 = {2+2}")


def subtraction():
    logging.info(f"6 -2 = {6-2}")


def division():
    logging.info(f"10 / 2 = {int(10/2)}")


dag = DAG(
    "lesson1.exercise3",
    schedule_interval='@hourly',
    start_date=datetime.datetime.now() - datetime.timedelta(days=1))

hello_world_task = PythonOperator(
    task_id="hello_world",
    python_callable=hello_world,
    dag=dag)

#
# Define an addition task that calls the `addition` function above
#
addition_task = PythonOperator(
    task_id='addition',
    python_callable=addition,
    dag=dag)
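The exercise stops after wiring the addition task; assuming the remaining functions are meant to be wired the same way, the subtraction and division tasks would presumably look like the following sketch (not part of the original exercise):

#
# Sketch (assumption): analogous tasks for the remaining functions
#
subtraction_task = PythonOperator(
    task_id='subtraction',
    python_callable=subtraction,
    dag=dag)

division_task = PythonOperator(
    task_id='division',
    python_callable=division,
    dag=dag)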
    
class LatestOnlyOperatorTest(unittest.TestCase):

    def setUp(self):
        super().setUp()
        configuration.load_test_config()
        self.dag = DAG(
            'test_dag',
            default_args={
                'owner': 'airflow',
                'start_date': DEFAULT_DATE},
            schedule_interval=INTERVAL)
        self.addCleanup(self.dag.clear)
        freezer = freeze_time(FROZEN_NOW)
        freezer.start()
        self.addCleanup(freezer.stop)

    def test_run(self):
        task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
Example No. 21
import pprint as pp
import airflow.utils.dates
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta

default_args = {
    "owner": "airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}

with DAG(dag_id="sleep_dag",
         default_args=default_args,
         schedule_interval="@daily") as dag:

    t1 = DummyOperator(task_id="t1")

    t2 = BashOperator(task_id="t2", bash_command="sleep 30")

    t1 >> t2
Example No. 22
import datetime
import airflow
from airflow import DAG
from airflow.operators.hive_operator import HiveOperator
from datetime import timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0, hour=0, minute=0, second=1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'provide_context': True
}

dag = DAG('mapr_hive_task_dag',
          default_args=default_args,
          description='MapR single task DAG',
          schedule_interval=timedelta(minutes=15))

insert_current_datetime = HiveOperator(
    task_id='insert_current_datetime_task',
    hql="insert into table datetimes values ('" +
    datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y") + "');",
    dag=dag)

dag.doc_md = __doc__
Example No. 23
from datetime import datetime, timedelta
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 8, 18),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('bugzilla_dataset',
          default_args=default_args,
          schedule_interval='@daily')

connection_details = BaseHook.get_connection('bugzilla_db')

env = {
    "DATABASE_USER": connection_details.login,
    "DATABASE_PASSWORD": connection_details.password,
    "DATABASE_HOST": connection_details.host,
    "DATABASE_PORT": connection_details.port,
    "DATABASE_NAME": connection_details.schema,
}

t0 = EMRSparkOperator(
    task_id="update_bugs",
    job_name="Bugzilla Dataset Update",
Example No. 24
class BaseSensorTest(unittest.TestCase):
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }
        self.dag = DAG(TEST_DAG_ID, default_args=args)

        session = settings.Session()
        session.query(DagRun).delete()
        session.query(TaskInstance).delete()
        session.commit()

    def _make_dag_run(self):
        return self.dag.create_dagrun(
            run_id='manual__',
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

    def _make_sensor(self, return_value, **kwargs):
        poke_interval = 'poke_interval'
        timeout = 'timeout'
        if poke_interval not in kwargs:
            kwargs[poke_interval] = 0
        if timeout not in kwargs:
            kwargs[timeout] = 0

        sensor = DummySensor(
            task_id=SENSOR_OP,
            return_value=return_value,
            dag=self.dag,
            **kwargs
        )

        dummy_op = DummyOperator(
            task_id=DUMMY_OP,
            dag=self.dag
        )
        dummy_op.set_upstream(sensor)
        return sensor

    @classmethod
    def _run(cls, task):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_ok(self):
        sensor = self._make_sensor(True)
        dr = self._make_dag_run()

        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.SUCCESS)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_fail(self):
        sensor = self._make_sensor(False)
        dr = self._make_dag_run()

        with self.assertRaises(AirflowSensorTimeout):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.FAILED)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_soft_fail(self):
        sensor = self._make_sensor(False, soft_fail=True)
        dr = self._make_dag_run()

        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            self.assertEquals(ti.state, State.SKIPPED)

    def test_soft_fail_with_retries(self):
        sensor = self._make_sensor(
            return_value=False,
            soft_fail=True,
            retries=1,
            retry_delay=timedelta(milliseconds=1))
        dr = self._make_dag_run()

        # first run fails and task instance is marked up to retry
        with self.assertRaises(AirflowSensorTimeout):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.UP_FOR_RETRY)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        sleep(0.001)
        # after retry DAG run is skipped
        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            self.assertEquals(ti.state, State.SKIPPED)
Example No. 25
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2017, 8, 17),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'catchup': False
}

timestamp = """{{ ts }}"""

dag = DAG('salesforce_data_processing',
          default_args=default_args,
          schedule_interval=None,
          catchup=False)


@provide_session
def get_conn(conn_id, session=None):
    conn = (session.query(Connection).filter(
        Connection.conn_id == conn_id).first())
    return conn


## Requires a slack connection stored with a token under
## the extra section with the format of {"token":"TOKEN_HERE"}
## The Conn Type can be left blank.
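Given the connection format described above, the token would presumably be pulled from the connection's Extra field; a minimal sketch using the get_conn helper defined in this example (the 'slack' conn_id is an assumption):

slack_conn = get_conn('slack')                        # conn_id is an assumption
slack_token = slack_conn.extra_dejson.get('token')    # Extra stored as {"token": "TOKEN_HERE"}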

Example No. 26
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
}

dag = DAG(
    dag_id='example_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['example', 'example2'],
    params={"example_key": "example_value"},
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# [START howto_operator_bash]
run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag,
 def setUp(self):
     args = {
         'owner': 'airflow',
         'start_date': DEFAULT_DATE
     }
     self.dag = DAG(TEST_DAG_ID, default_args=args)
Example No. 28
from airflow.operators.bash_operator import BashOperator
from airflow import DAG
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(minutes=20)
}
dag = DAG(dag_id='run_telus_pipeline', default_args=default_args)

t1_bash = """gsutil -m mv gs://telus_poc_input/sample_geo_data* gs://telus_poc_ready"""
t2_bash = """
ts=$(date +%s)
run_job='gcloud dataflow jobs run telus_dataflow_$ts --gcs-location gs://telus_poc_ready/dataflow --format="value(id)"'
jobid=`eval $run_job`
echo "SUBMITTED DATAFLOW JOB: $jobid"
done=0
max=100
i=0
while : ; do
  if [[ $i -gt $max ]];then
        echo "Max wait exceeded for step, exiting..."
        exit -1
  fi
  echo "Checking status..."
  check_status='gcloud dataflow jobs show '$jobid' --format="value(state)"'
  status=`eval $check_status`
  echo "DATAFLOW JOB with id $jobid is $status"
  if [[ $status == 'Done' ]]; then
        echo "Dataflow job done ... moving on"
        break
Example No. 29
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt.datetime(2017, 6, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_social', default_args=default_args, schedule_interval=None)

extract = BashOperator(
    task_id='extract',
    bash_command=
    'cd ~/ETL_twiiter; export PYTHONPATH=.; python etl/task/extract.py -i ~/twitter_dados/input_twitter/ -o ~/twitter_dados/clean_tweets/',
    dag=dag)

tag_sentiment = BashOperator(
    task_id='tag_sentiment',
    bash_command=
    'cd ~/ETL_twiiter; export PYTHONPATH=.; python etl/task/tag_sentiments.py -i ~/twitter_dados/clean_tweets/ -o ~/twitter_dados/tag_sentiments/ -cl etl/data/class_nb.bin',
    dag=dag)

indexes = BashOperator(
    task_id='indexer',
Example No. 30
from airflow.contrib.operators.ecs_operator import ECSOperator
from airflow.operators.sensors import TimeSensor  # in Airflow 2.0 should be  "from airflow.sensors ..."

default_args = {
    'owner': XXX,
    'depends_on_past': False,
    'start_date': datetime(2018, 5, 1),
    'email': XXX,
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

with DAG('etl_adwords',
         default_args=default_args,
         schedule_interval='@daily',
         max_active_runs=1) as dag:
    (TimeSensor(task_id='hold_on', target_time=time(hour=3), dag=dag) >>
     ECSOperator(
         task_id='run_ecs',
         task_definition='airflow-etl-adwords',
         cluster='ecs',
         overrides={
             'containerOverrides': [{
                 'name':
                 'app',
                 'environment': [{
                     'name': 'EXECUTION_DATE',
                     'value': '{{ ds }}'
                 }, {
                     'name': 'RDS_ENDPOINT',
Example No. 31
class OOBashSensor(BaseSensorOperator):
    def poke(self, context):
        retcode = subprocess.call(['sudo', '--non-interactive', '/usr/local/bin/docker-trampoline', self.task_id,
            context['ds'], context['execution_date'].isoformat(), (context['execution_date'] + context['dag'].schedule_interval).isoformat()] +
            self.params.get('argv', []))
        if retcode == 42:
            return True
        elif retcode == 13:
            return False
        else:
            raise AirflowException('Unexpected exit code: {:d}'.format(retcode))

dag = DAG(
    dag_id='hist_canning',
    schedule_interval=timedelta(days=1),
    start_date=datetime(2012, 12, 5),
    #end_date=datetime(2017, 7, 7), # NB: end_date is included
    default_args={
        'retries': 1,
    })

# NB: removing an Operator from DAG leaves some trash in the database tracking
# old state of that operator, but it seems to trigger no issues with 1.8.0

OOBashSensor(task_id='reports_raw_sensor', poke_interval=5*60, timeout=12*3600, retries=0, dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='canning', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='tar_reports_raw', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_s3_sync', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_s3_ls', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_cleanup', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='canned_s3_sync', bash_command='shovel_jump.sh', dag=dag)
BashOperator(pool='datacollector_disk_io', task_id='canned_s3_ls', bash_command='shovel_jump.sh', dag=dag)
Example No. 32
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.hooks.postgres_hook import PostgresHook

dag = DAG('Metadata',
          description='Create metadata for fact table',
          schedule_interval='@daily',
          start_date=datetime(2019, 8, 1),
          catchup=False)

count_facts = PostgresOperator(task_id='count_facts',
                               sql="""
    CREATE TABLE IF NOT EXISTS metadata (
	    checkpoint_date date, 
	    fact varchar,
	    nrows integer
    );
    INSERT INTO metadata (checkpoint_date, fact, nrows) 
    (
    	select current_date, fact_type, count(fact_type) from fact group by fact_type
    )
    ;
    """,
                               postgres_conn_id='datawarehouse',
                               autocommit=True,
                               database='dwh',
                               dag=dag)
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 11, 23),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=2),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('simple-dataflow', default_args=default_args)

def csv_pop(data, *args, **kwargs):
    # print(args)
    # print(kwargs)
    # print(os.getcwd())

    print("data: {}".format(data))

    path = "data/csv/aapl.csv"
    df = pd.read_csv(path)

    if df.size > 0:
        item = df.iloc[0].to_dict()
        df.drop(0, inplace=True)
        df.to_csv(path, index=False)
Example No. 34
from airflow.utils import TriggerRule

today = datetime.today()

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.combine(today, time(13, 00, 0)) - timedelta(days=1),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}
TR = TriggerRule
dag = DAG('ods_objectrocket',
          default_args=default_args,
          schedule_interval=timedelta(days=1))
script_folder = DAGS_FOLDER + '/../scripts/'
t0 = BashOperator(task_id='ods_load_batch_0',
                  bash_command=script_folder +
                  'ods_objectrocket/ods_load_batch_0.sh;',
                  dag=dag)
t1 = BashOperator(task_id='ods_load_batch_1',
                  bash_command=script_folder +
                  'ods_objectrocket/ods_load_batch_1.sh;',
                  dag=dag)
t5 = BashOperator(task_id='verify_load',
                  bash_command=script_folder +
                  'ods_archiving/checkDailyLoad.sh ods_objectrocket;',
                  dag=dag,
                  trigger_rule=TR.ALL_DONE)
Example No. 35
class ShortCircuitOperatorTest(unittest.TestCase):
    def setUp(self):
        self.dag = DAG('shortcircuit_operator_test',
                       default_args={
                           'owner': 'airflow',
                           'start_date': DEFAULT_DATE},
                       schedule_interval=INTERVAL)
        self.short_op = ShortCircuitOperator(task_id='make_choice',
                                             dag=self.dag,
                                             python_callable=lambda: self.value)

        self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag)
        self.branch_1.set_upstream(self.short_op)
        self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag)
        self.branch_2.set_upstream(self.branch_1)
        self.upstream = DummyOperator(task_id='upstream', dag=self.dag)
        self.upstream.set_downstream(self.short_op)
        self.dag.clear()

        self.value = True

    def test_without_dag_run(self):
        """This checks the defensive against non existent tasks in a dag run"""
        self.value = False
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        session = Session()
        tis = session.query(TI).filter(
            TI.dag_id == self.dag.dag_id,
            TI.execution_date == DEFAULT_DATE
        )

        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()

        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                # should not exist
                raise
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise

        session.close()

    def test_with_dag_run(self):
        self.value = False
        logging.error("Tasks {}".format(self.dag.tasks))
        dr = self.dag.create_dagrun(
            run_id="manual__",
            start_date=datetime.datetime.now(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.SKIPPED)
            else:
                raise

        self.value = True
        self.dag.clear()
        dr.verify_integrity()
        self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        tis = dr.get_task_instances()
        self.assertEqual(len(tis), 4)
        for ti in tis:
            if ti.task_id == 'make_choice':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'upstream':
                self.assertEquals(ti.state, State.SUCCESS)
            elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2':
                self.assertEquals(ti.state, State.NONE)
            else:
                raise
        );
        COMMIT;
    """)
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)    
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f'Youngest rider was born in {records[0][0]}')


#####################
# Initializing the DAG
#####################
dag = DAG(
        'lesson.demo1',
        start_date = datetime.datetime.now() - datetime.timedelta(days = 60),
        schedule_interval = "@monthly")


###########
# Operators
###########
create_table = PostgresOperator(
    task_id = 'create_table',
    postgres_conn_id = 'redshift',
    sql = sql.CREATE_STATIONS_TABLE_SQL,
    dag = dag)

copy_task = PythonOperator(
    task_id = 'copy_to_redshift',
    python_callable = load_data_to_redshift,
Example No. 37
    compressed_sample_filename = compress_sample(result_filename, config)
    generate_tabix(compressed_sample_filename, config)
    copy_result(compressed_sample_filename, sample_id, config)
   
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime(2020, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("filter-vcf", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)



filter_task = PythonOperator(
    task_id="filter_variants",
    python_callable=filter_variants,
    provide_context=True,
    dag=dag)
Example No. 38
import datetime

import airflow
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

DAG_ARGS = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(0, hour=1),
}

PROJECT_DAG = DAG(dag_id='final_project',
                  default_args=DAG_ARGS,
                  schedule_interval=datetime.timedelta(hours=1))

SENTINEL_START = DummyOperator(task_id='sentinel_start', dag=PROJECT_DAG)

MIMIC_ETL_FILES = [
    'admissions',
    'callout',
    'caregivers',
    'chartevents',
    'cptevents',
    'datetimeevents',
    'diagnoses_icd',
    'drgcodes',
    'd_cpt',
    'd_icd_diagnoses',
from airflow import DAG
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.sensors.http_sensor import HttpSensor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator', default_args=default_args)

dag.doc_md = __doc__

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = SimpleHttpOperator(
    task_id='post_op',
    endpoint='api/v1.0/nodes',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: True if len(response.json()) == 0 else False,
    dag=dag)

t5 = SimpleHttpOperator(
    task_id='post_op_formenc',
    endpoint='nodes/url',
Example No. 40
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'description': 'Wikipedia assistant',
    'depends_on_past': False,
    'start_date': datetime(2020, 7, 28),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('Wiki_Analyzer',
          default_args=default_args,
          schedule_interval=timedelta(days=20))


def validate_connections():
    print("Connections validation code goes here")


validate_connectivity = PythonOperator(
    task_id='validate_connectivity',
    provide_context=True,
    python_callable=validate_connections,
    dag=dag,
)

extract_wiki_data = BashOperator(
Example No. 41
retailer_name = 'portland'
store_group_id = 1900020001181
category = 'Baseline Forecasting'
version = 'v1'
family = 'F1'

args = {
    'owner': 'Jonas',
    'email': ['*****@*****.**'],
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0)
}

dag = DAG(dag_id='Portland-1900020001181',
          default_args=args,
          tags=['Baseline-Forecast', 'Portland'])

#
# You can also access the DagRun object in templates
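As a hedged illustration of the comment above, a templated field can reference the current run through the dag_run template variable; the task below is illustrative only and not part of the original DAG:

from airflow.operators.bash_operator import BashOperator  # assumption: classic 1.x import path

show_run_id = BashOperator(
    task_id='show_run_id',                                 # hypothetical task for illustration
    bash_command='echo "run_id={{ dag_run.run_id }}"',     # dag_run is exposed to templates
    dag=dag)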

#baseline-denver-500020000113
#define cluster to use
new_cluster = {
    'spark_version': '7.0.x-scala2.12',
    'node_type_id': 'Standard_F16s',
    'driver_node_type_id': 'Standard_D16s_v3',
    'num_workers': 10
}

#define cluster to use
Example No. 42
from airflow.providers.http.operators.http import SimpleHttpOperator
from airflow.providers.http.sensors.http import HttpSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator',
          default_args=default_args,
          tags=['example'],
          start_date=days_ago(2))

dag.doc_md = __doc__

# task_post_op, task_get_op and task_put_op are examples of tasks created by instantiating operators
# [START howto_operator_http_task_post_op]
task_post_op = SimpleHttpOperator(
    task_id='post_op',
    endpoint='post',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: response.json()['json']['priority'] == 5,
    dag=dag,
)
# [END howto_operator_http_task_post_op]
Example No. 43
default_args = {
    'owner': 'Guto - IGTI',
    'depends_on_past': False,
    'start_date': datetime(2021, 3, 13, 20, 40),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# Definição da DAG - Fluxo
dag = DAG(
    "treino-04",
    description="Paralelismos",
    default_args=default_args,
    #schedule_interval=timedelta(minutes=2)
    schedule_interval="*/10 * * * *"
)

start_preprocessing = BashOperator(
    task_id='start_preprocessing',
    bash_command = 'echo "Start Preprocessing! Vai!"',
    dag=dag
)

get_data = BashOperator(
    task_id='get_data',
    bash_command = 'curl http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip -o /usr/local/airflow/data/microdados_enade_2019.zip',
    dag=dag
)
# Global variables that are set using environment variables
GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL')
GE_TUTORIAL_PROJECT_PATH = os.getenv('GE_TUTORIAL_PROJECT_PATH')


default_args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(1)
}


# The DAG definition
dag = DAG(
    dag_id='ge_tutorials_dag',
    default_args=default_args,
    schedule_interval=None,
)


def load_files_into_db(ds, **kwargs):
    """
        A method to simply load CSV files into a database using SQLAlchemy.
    """

    engine = create_engine(GE_TUTORIAL_DB_URL)

    with engine.connect() as conn:
        conn.execute("drop table if exists npi_small cascade ")
        conn.execute("drop table if exists state_abbreviations cascade ")
        "ActionOnFailure": "TERMINATE_JOB_FLOW",
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": [
                "/usr/local/bin/processlogs",
                "--domain", "versioncheck.allizom.org",
                "--bucket", "amo-metrics-logs-stage",
                "--date", "{{ ds }}"
            ]
        }
    }
]

blp_dag = DAG(
    'mango_log_processing_adi',
    default_args=DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=6),
    schedule_interval='0 3 * * *'
)

blp_logs = EmrCreateJobFlowOperator(
    task_id='blp_create_job_flow',
    job_flow_overrides={'Steps': BLP_STEPS},
    aws_conn_id='aws_data_iam',
    emr_conn_id='emr_data_iam_mango',
    dag=blp_dag
)

blp_job_sensor = EmrJobFlowSensor(
    task_id='blp_check_job_flow',
    job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}",
    aws_conn_id='aws_data_iam',
Example No. 46
from airflow.operators.python_operator import PythonOperator
from airflow import DAG
from operators import CreateTablesOperator, LoadFromCSVOperator, FetchDataFromDBOperator
from helpers import SqlQueries, pyhelpers

import os
from datetime import datetime, timedelta, date

default_args = {
    'owner': 'nani',
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}

with DAG('setup_base_data',
         default_args=default_args,
         schedule_interval='@once') as dag:
    start_task = DummyOperator(task_id='dummy_start')

    create_base_tables = CreateTablesOperator(
        task_id='create_dimension_and_fact_tables',
        sql_queries=SqlQueries.create_dim_and_fact_tables)

    get_data_for_dim_country = PythonOperator(
        task_id='get_dim_country_data',
        python_callable=pyhelpers.get_specific_columns_and_store_csv,
        op_kwargs={
            'source_file':
            os.getenv('path_to_data_folder') +
            "country_continent_isocodes.csv",
            'destination_file':
Example No. 47
def create_dag(dag_id, schedule_interval, start_date):
    with DAG(dag_id=dag_id,
             schedule_interval=schedule_interval,
             start_date=start_date,
             default_args={
                  'queue': 'jobs_queue',
                  'postgres_conn_id': postgres_connection,
                  'do_xcom_push': True
              }) as dag:

        # task definitions

        # print_log task
        @dag.task()
        def print_to_log(dag_id, database):
            logging.info(f'{dag_id} started processing tables in database: {database}')

        print_logs = print_to_log(dag_id, 'postgres')

        # bash task for getting user name
        get_user = BashOperator(
            task_id='get_user',
            bash_command='whoami',
        )

        table_name = config[dag_id]['table_name']

        # calls the function for checking table existence
        check_table_exist = BranchPythonOperator(
            task_id='check_table_exist',
            python_callable=check_if_table_exists,
            op_args=[table_name, postgres_connection],
        )

        # inserts new row into the table
        insert_row = PostgresOperator(
            task_id='insert_new_row',
            sql=f'''
                INSERT INTO {table_name} VALUES
                (%s, \'{{{{ ti.xcom_pull(task_ids='get_user') }}}}\', %s);
                 ''',
            parameters=[
                uuid.uuid4().int % 123456789,
                datetime.now()
            ],
            trigger_rule=TriggerRule.NONE_FAILED,
        )
        # fetches results from the table
        query_the_table = PostgreSQLCountRows(
            task_id='query_the_table',
            table_name=table_name,
        )
        # creates a postgres table with table_name
        create_table = PostgresOperator(
            task_id='create_table',
            sql=f'''
                CREATE TABLE {table_name}(
                custom_id integer NOT NULL,
                user_name VARCHAR (50) NOT NULL, 
                timestamp TIMESTAMP NOT NULL);
                    ''',
        )

        # setting task order
        print_logs >> get_user >> check_table_exist >> [create_table, insert_row]
        create_table >> insert_row >> query_the_table
        return dag
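check_if_table_exists is referenced by the BranchPythonOperator above but not shown in this example; a minimal sketch of such a helper, assuming it probes the table through a PostgresHook and returns the task_id to follow:

from airflow.hooks.postgres_hook import PostgresHook   # assumption: legacy import path

def check_if_table_exists(table_name, conn_id):
    """Hypothetical helper: branch to 'insert_new_row' if the table exists, else 'create_table'."""
    hook = PostgresHook(postgres_conn_id=conn_id)
    row = hook.get_first("SELECT to_regclass(%s);", parameters=(table_name,))
    return 'insert_new_row' if row and row[0] else 'create_table'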
number_of_tasks = int(number_of_tasks)
task_names = ['process_chunk_' + str(k) for k in range(0, number_of_tasks)]

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 9, 19),
    'email': [my_email_address],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(seconds=30),
}

dag = DAG(dag_name,
          catchup=False,
          default_args=default_args,
          schedule_interval=None)


# the following tasks are created by instantiating operators dynamically
def get_task(j, task_name_j):

    return SFTPToS3UploadPartOperator(
        task_id=task_name_j,
        conn_id_source=sftp_conn,
        file_source_path=source_path + filename,
        # access_key=access_key,
        # secret_key=secret_key,
        # session_token=session_token,
        upload_id=upload_id,
        bucket=bucket,
    import KubernetesPodOperator
from airflow.models import Variable

DEFAULT_ARGS = {
    'owner': 'de',
    'email': '*****@*****.**',
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2020, 3, 17)
}

IMAGE_CONFIG = Variable.get('crosslend_images_config', deserialize_json=True)
CONFIG = Variable.get('immobilienscout24_conf',
                      deserialize_json=True)

with DAG(
        'flat-data-ingestion',
        default_args=DEFAULT_ARGS,
        schedule_interval='0 0  * * *'
) as dag:
    KubernetesPodOperator(
        namespace='Crosslend_Dataengineering',
        image=IMAGE_CONFIG['flat-data-ingestion'],
        cmds=["python", "main.py",
              "--config", json.dumps(CONFIG)],
        name="flat-data-ingestion",
        task_id="flat-data-ingestion",
        in_cluster=True
    )
Example No. 50
"""
Add a Markdown description to a DAG or a task.
The description is shown in “Graph View” for DAGs, “Task Details” for tasks.
Doc: https://airflow.readthedocs.io/en/latest/concepts.html#documentation-notes
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

default_args = {
    'start_date': datetime.now()
}

dag = DAG(
    'description_markdown',
    default_args=default_args)
dag.doc_md = """
# Markdown hi
## Subheader
Here's a [url](www.airbnb.com)

My numbered list:

1. one
1. two

My bulleted list:

- first
- second
"""
Example No. 51
0
import logging
import pickle
import re
from datetime import datetime, timedelta

from airflow import DAG

default_args = {
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

# don't auto-schedule the dag
# https://airflow.readthedocs.io/en/stable/scheduler.html
dag = DAG('npmjs_static_2', default_args=default_args, schedule_interval=None)
# periodically run the dag
# dag = DAG('tutorial', default_args=default_args, schedule_interval=timedelta(days=1))

# load dep_tree for packages, relative to AIRFLOW_HOME
npmjs_dep_path = "./dags/npmjs.with_stats.dep_graph_2.pickle"
dep_tree = pickle.load(open(npmjs_dep_path, "rb"))
logging.info("loaded dep_tree with %d nodes", dep_tree.number_of_nodes())


def get_sanitized_pkgname(pkg_name):
    invalid_name = re.compile(r'[^a-zA-Z0-9_.-]')
    pkg_name = re.sub(invalid_name, '..', pkg_name)
    return pkg_name
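
# Illustration with a made-up input: every character outside [a-zA-Z0-9_.-]
# is replaced with '..', so e.g. a scoped npm name becomes a string that is
# safe to use as an Airflow task id.
assert get_sanitized_pkgname('@babel/core') == '..babel..core'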

Example No. 52
0
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators import (StageToRedshiftOperator, LoadFactOperator,
                               LoadDimensionOperator, DataQualityOperator)
from helpers import SqlQueries

# AWS_KEY = os.environ.get('AWS_KEY')
# AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'ranjith',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('data_pipeline_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data",
)

stage_songs_to_redshift = StageToRedshiftOperator(
Example No. 53
0
class BaseSensorTest(unittest.TestCase):
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }
        self.dag = DAG(TEST_DAG_ID, default_args=args)

        session = settings.Session()
        session.query(TaskReschedule).delete()
        session.query(DagRun).delete()
        session.query(TaskInstance).delete()
        session.commit()

    def _make_dag_run(self):
        return self.dag.create_dagrun(
            run_id='manual__',
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

    def _make_sensor(self, return_value, **kwargs):
        kwargs.setdefault('poke_interval', 0)
        kwargs.setdefault('timeout', 0)

        sensor = DummySensor(
            task_id=SENSOR_OP,
            return_value=return_value,
            dag=self.dag,
            **kwargs
        )

        dummy_op = DummyOperator(
            task_id=DUMMY_OP,
            dag=self.dag
        )
        dummy_op.set_upstream(sensor)
        return sensor

    @classmethod
    def _run(cls, task):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    def test_ok(self):
        sensor = self._make_sensor(True)
        dr = self._make_dag_run()

        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.SUCCESS)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_fail(self):
        sensor = self._make_sensor(False)
        dr = self._make_dag_run()

        with self.assertRaises(AirflowSensorTimeout):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.FAILED)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_soft_fail(self):
        sensor = self._make_sensor(False, soft_fail=True)
        dr = self._make_dag_run()

        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            self.assertEquals(ti.state, State.SKIPPED)

    def test_soft_fail_with_retries(self):
        sensor = self._make_sensor(
            return_value=False,
            soft_fail=True,
            retries=1,
            retry_delay=timedelta(milliseconds=1))
        dr = self._make_dag_run()

        # first run fails and the task instance is marked up for retry
        with self.assertRaises(AirflowSensorTimeout):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.UP_FOR_RETRY)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        sleep(0.001)
        # after the retry the sensor soft-fails and both task instances are skipped
        self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            self.assertEquals(ti.state, State.SKIPPED)

    def test_ok_with_reschedule(self):
        sensor = self._make_sensor(
            return_value=None,
            poke_interval=10,
            timeout=25,
            mode='reschedule')
        sensor.poke = Mock(side_effect=[False, False, True])
        dr = self._make_dag_run()

        # first poke returns False and task is re-scheduled
        date1 = timezone.utcnow()
        with freeze_time(date1):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                # verify task is re-scheduled, i.e. state set to NONE
                self.assertEquals(ti.state, State.NONE)
                # verify one row in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 1)
                self.assertEquals(task_reschedules[0].start_date, date1)
                self.assertEquals(task_reschedules[0].reschedule_date,
                                  date1 + timedelta(seconds=sensor.poke_interval))
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # second poke returns False and task is re-scheduled
        date2 = date1 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date2):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                # verify task is re-scheduled, i.e. state set to NONE
                self.assertEquals(ti.state, State.NONE)
                # verify two rows in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 2)
                self.assertEquals(task_reschedules[1].start_date, date2)
                self.assertEquals(task_reschedules[1].reschedule_date,
                                  date2 + timedelta(seconds=sensor.poke_interval))
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # third poke returns True and task succeeds
        date3 = date2 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date3):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.SUCCESS)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_fail_with_reschedule(self):
        sensor = self._make_sensor(
            return_value=False,
            poke_interval=10,
            timeout=5,
            mode='reschedule')
        dr = self._make_dag_run()

        # first poke returns False and task is re-scheduled
        date1 = timezone.utcnow()
        with freeze_time(date1):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.NONE)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # second poke returns False, timeout occurs
        date2 = date1 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date2):
            with self.assertRaises(AirflowSensorTimeout):
                self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.FAILED)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_soft_fail_with_reschedule(self):
        sensor = self._make_sensor(
            return_value=False,
            poke_interval=10,
            timeout=5,
            soft_fail=True,
            mode='reschedule')
        dr = self._make_dag_run()

        # first poke returns False and task is re-scheduled
        date1 = timezone.utcnow()
        with freeze_time(date1):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.NONE)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # second poke returns False, timeout occurs
        date2 = date1 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date2):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            self.assertEquals(ti.state, State.SKIPPED)

    def test_ok_with_reschedule_and_retry(self):
        sensor = self._make_sensor(
            return_value=None,
            poke_interval=10,
            timeout=5,
            retries=1,
            retry_delay=timedelta(seconds=10),
            mode='reschedule')
        sensor.poke = Mock(side_effect=[False, False, False, True])
        dr = self._make_dag_run()

        # first poke returns False and task is re-scheduled
        date1 = timezone.utcnow()
        with freeze_time(date1):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.NONE)
                # verify one row in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 1)
                self.assertEquals(task_reschedules[0].start_date, date1)
                self.assertEquals(task_reschedules[0].reschedule_date,
                                  date1 + timedelta(seconds=sensor.poke_interval))
                self.assertEqual(task_reschedules[0].try_number, 1)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # second poke fails and the task instance is marked up for retry
        date2 = date1 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date2):
            with self.assertRaises(AirflowSensorTimeout):
                self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.UP_FOR_RETRY)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # third poke returns False and task is rescheduled again
        date3 = date2 + timedelta(seconds=sensor.poke_interval) + sensor.retry_delay
        with freeze_time(date3):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.NONE)
                # verify one row in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 1)
                self.assertEquals(task_reschedules[0].start_date, date3)
                self.assertEquals(task_reschedules[0].reschedule_date,
                                  date3 + timedelta(seconds=sensor.poke_interval))
                self.assertEqual(task_reschedules[0].try_number, 2)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # fourth poke returns True and task succeeds
        date4 = date3 + timedelta(seconds=sensor.poke_interval)
        with freeze_time(date4):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.SUCCESS)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_should_include_ready_to_reschedule_dep(self):
        sensor = self._make_sensor(True)
        deps = sensor.deps
        self.assertTrue(ReadyToRescheduleDep() in deps)

    def test_invalid_mode(self):
        with self.assertRaises(AirflowException):
            self._make_sensor(
                return_value=True,
                mode='foo')

    def test_ok_with_custom_reschedule_exception(self):
        sensor = self._make_sensor(
            return_value=None,
            mode='reschedule')
        date1 = timezone.utcnow()
        date2 = date1 + timedelta(seconds=60)
        date3 = date1 + timedelta(seconds=120)
        sensor.poke = Mock(side_effect=[
            AirflowRescheduleException(date2),
            AirflowRescheduleException(date3),
            True,
        ])
        dr = self._make_dag_run()

        # first poke returns False and task is re-scheduled
        with freeze_time(date1):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                # verify task is re-scheduled, i.e. state set to NONE
                self.assertEquals(ti.state, State.NONE)
                # verify one row in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 1)
                self.assertEquals(task_reschedules[0].start_date, date1)
                self.assertEquals(task_reschedules[0].reschedule_date, date2)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # second poke returns False and task is re-scheduled
        with freeze_time(date2):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                # verify task is re-scheduled, i.e. state set to NONE
                self.assertEquals(ti.state, State.NONE)
                # verify two rows in task_reschedule table
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 2)
                self.assertEquals(task_reschedules[1].start_date, date2)
                self.assertEquals(task_reschedules[1].reschedule_date, date3)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

        # third poke returns True and task succeeds
        with freeze_time(date3):
            self._run(sensor)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                self.assertEquals(ti.state, State.SUCCESS)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)

    def test_reschedule_with_test_mode(self):
        sensor = self._make_sensor(
            return_value=None,
            poke_interval=10,
            timeout=25,
            mode='reschedule')
        sensor.poke = Mock(side_effect=[False])
        dr = self._make_dag_run()

        # poke returns False and AirflowRescheduleException is raised
        date1 = timezone.utcnow()
        with freeze_time(date1):
            for dt in self.dag.date_range(DEFAULT_DATE, end_date=DEFAULT_DATE):
                TaskInstance(sensor, dt).run(
                    ignore_ti_state=True,
                    test_mode=True)
        tis = dr.get_task_instances()
        self.assertEquals(len(tis), 2)
        for ti in tis:
            if ti.task_id == SENSOR_OP:
                # in test mode state is not modified
                self.assertEquals(ti.state, State.NONE)
                # in test mode no reschedule request is recorded
                task_reschedules = TaskReschedule.find_for_task_instance(ti)
                self.assertEquals(len(task_reschedules), 0)
            if ti.task_id == DUMMY_OP:
                self.assertEquals(ti.state, State.NONE)
Example No. 54
0
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 6, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('slacker',
          default_args=default_args,
          schedule_interval=timedelta(minutes=1))

t1 = BashOperator(task_id='hive',
                  bash_command='hive -f /home/maria_dev/slackbot/slackbot.sql',
                  dag=dag)

t2 = BashOperator(task_id='ok', bash_command='echo "ok!"', dag=dag)

t2.set_upstream(t1)
Example No. 55
0
            GROUP BY city
        );
        COMMIT;
    """)


def log_oldest():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Oldest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

load_and_analyze = PythonOperator(
    task_id='load_and_analyze',
    dag=dag,
    python_callable=load_and_analyze,
    provide_context=True,
)

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
Example No. 56
0
from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.utils.dates import days_ago

with DAG(dag_id="dag_referenced_task_dag_id_exists_fail",
         schedule_interval=None,
         start_date=days_ago(1)) as dag:
    TriggerDagRunOperator(task_id="test_trigger", trigger_dag_id="nonexistent")
    ExternalTaskSensor(task_id="test_sensor_dag",
                       external_dag_id="nonexistent")
    ExternalTaskSensor(task_id="test_sensor_task",
                       external_dag_id="nonexistent",
                       external_task_id="non-task")