def test_schedule_dag_fake_scheduled_previous(self): """ Test scheduling a dag where there is a prior DagRun which has the same run_id as the next run should have """ delta = timedelta(hours=1) dag = DAG(TEST_DAG_ID+'test_schedule_dag_fake_scheduled_previous', schedule_interval=delta, start_date=DEFAULT_DATE) dag.tasks = [models.BaseOperator(task_id="faketastic", owner='Also fake', start_date=DEFAULT_DATE)] scheduler = jobs.SchedulerJob(test_mode=True) trigger = models.DagRun( dag_id=dag.dag_id, run_id=models.DagRun.id_for_date(DEFAULT_DATE), execution_date=DEFAULT_DATE, state=utils.State.SUCCESS, external_trigger=True) settings.Session().add(trigger) settings.Session().commit() dag_run = scheduler.schedule_dag(dag) assert dag_run is not None assert dag_run.dag_id == dag.dag_id assert dag_run.run_id is not None assert dag_run.run_id != '' assert dag_run.execution_date == DEFAULT_DATE+delta, ( 'dag_run.execution_date did not match expectation: {0}' .format(dag_run.execution_date)) assert dag_run.state == models.State.RUNNING assert dag_run.external_trigger == False
def setUp(self): configuration.test_mode() utils.initdb() args = {'owner': 'airflow', 'start_date': datetime(2015, 1, 1)} dag = DAG(TEST_DAG_ID, default_args=args) dag.clear(start_date=DEFAULT_DATE, end_date=datetime.now()) self.dag = dag
class BranchOperatorTest(unittest.TestCase): def setUp(self): self.dag = DAG('branch_operator_test', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE}, schedule_interval=INTERVAL) self.branch_op = BranchPythonOperator(task_id='make_choice', dag=self.dag, python_callable=lambda: 'branch_1') self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag) self.branch_1.set_upstream(self.branch_op) self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag) self.branch_2.set_upstream(self.branch_op) self.dag.clear() def test_without_dag_run(self): """This checks the defensive against non existent tasks in a dag run""" self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) session = Session() tis = session.query(TI).filter( TI.dag_id == self.dag.dag_id, TI.execution_date == DEFAULT_DATE ) session.close() for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1': # should exist with state None self.assertEquals(ti.state, State.NONE) elif ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise def test_with_dag_run(self): dr = self.dag.create_dagrun( run_id="manual__", start_date=datetime.datetime.now(), execution_date=DEFAULT_DATE, state=State.RUNNING ) self.branch_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1': self.assertEquals(ti.state, State.NONE) elif ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise
class BashOperatorTestCase(unittest.TestCase): def test_echo_env_variables(self): """ Test that env variables are exported correctly to the task bash environment. """ now = datetime.utcnow() now = now.replace(tzinfo=timezone.utc) self.dag = DAG( dag_id='bash_op_test', default_args={ 'owner': 'airflow', 'retries': 100, 'start_date': DEFAULT_DATE }, schedule_interval='@daily', dagrun_timeout=timedelta(minutes=60)) self.dag.create_dagrun( run_id='manual__' + DEFAULT_DATE.isoformat(), execution_date=DEFAULT_DATE, start_date=now, state=State.RUNNING, external_trigger=False, ) import tempfile with tempfile.NamedTemporaryFile() as f: fname = f.name t = BashOperator( task_id='echo_env_vars', dag=self.dag, bash_command='echo $AIRFLOW_HOME>> {0};' 'echo $PYTHONPATH>> {0};' 'echo $AIRFLOW_CTX_DAG_ID >> {0};' 'echo $AIRFLOW_CTX_TASK_ID>> {0};' 'echo $AIRFLOW_CTX_EXECUTION_DATE>> {0};' 'echo $AIRFLOW_CTX_DAG_RUN_ID>> {0};'.format(fname) ) original_AIRFLOW_HOME = os.environ['AIRFLOW_HOME'] os.environ['AIRFLOW_HOME'] = 'MY_PATH_TO_AIRFLOW_HOME' t.run(DEFAULT_DATE, DEFAULT_DATE, ignore_first_depends_on_past=True, ignore_ti_state=True) with open(fname, 'r') as fr: output = ''.join(fr.readlines()) self.assertIn('MY_PATH_TO_AIRFLOW_HOME', output) # exported in run_unit_tests.sh as part of PYTHONPATH self.assertIn('tests/test_utils', output) self.assertIn('bash_op_test', output) self.assertIn('echo_env_vars', output) self.assertIn(DEFAULT_DATE.isoformat(), output) self.assertIn('manual__' + DEFAULT_DATE.isoformat(), output) os.environ['AIRFLOW_HOME'] = original_AIRFLOW_HOME
def setUp(self): configuration.load_test_config() from airflow.contrib.hooks.fs_hook import FSHook hook = FSHook() args = {"owner": "airflow", "start_date": DEFAULT_DATE, "provide_context": True} dag = DAG(TEST_DAG_ID + "test_schedule_dag_once", default_args=args) dag.schedule_interval = "@once" self.hook = hook self.dag = dag
def test_without_dag_run(self): """This checks the defensive against non existent tasks in a dag run""" value = False dag = DAG('shortcircuit_operator_test_without_dag_run', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE }, schedule_interval=INTERVAL) short_op = ShortCircuitOperator(task_id='make_choice', dag=dag, python_callable=lambda: value) branch_1 = DummyOperator(task_id='branch_1', dag=dag) branch_1.set_upstream(short_op) branch_2 = DummyOperator(task_id='branch_2', dag=dag) branch_2.set_upstream(branch_1) upstream = DummyOperator(task_id='upstream', dag=dag) upstream.set_downstream(short_op) dag.clear() short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) session = Session() tis = session.query(TI).filter( TI.dag_id == dag.dag_id, TI.execution_date == DEFAULT_DATE ) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': # should not exist raise elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise value = True dag.clear() short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': # should not exist raise elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.NONE) else: raise session.close()
def setUp(self): configuration.load_test_config() from airflow.contrib.hooks.fs_hook import FSHook hook = FSHook() args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.hook = hook self.dag = dag
def setUp(self): configuration.test_mode() from airflow.contrib.hooks.ssh_hook import SSHHook hook = SSHHook() hook.no_host_key_check = True args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.hook = hook self.dag = dag
def test_schedule_dag_once(self): """ Tests scheduling a dag scheduled for @once - should be scheduled the first time it is called, and not scheduled the second. """ dag = DAG(TEST_DAG_ID+'test_schedule_dag_once') dag.schedule_interval = '@once' dag.tasks = [models.BaseOperator(task_id="faketastic", owner='Also fake', start_date=datetime(2015, 1, 2, 0, 0))] dag_run = jobs.SchedulerJob(test_mode=True).schedule_dag(dag) dag_run2 = jobs.SchedulerJob(test_mode=True).schedule_dag(dag) assert dag_run is not None assert dag_run2 is None
def test_schedule_dag_no_previous_runs(self): """ Tests scheduling a dag with no previous runs """ dag = DAG(TEST_DAG_ID+'test_schedule_dag_no_previous_runs') dag.tasks = [models.BaseOperator(task_id="faketastic", owner='Also fake', start_date=datetime(2015, 1, 2, 0, 0))] dag_run = jobs.SchedulerJob(test_mode=True).schedule_dag(dag) assert dag_run is not None assert dag_run.dag_id == dag.dag_id assert dag_run.run_id is not None assert dag_run.run_id != '' assert dag_run.execution_date == datetime(2015, 1, 2, 0, 0), ( 'dag_run.execution_date did not match expectation: {0}' .format(dag_run.execution_date)) assert dag_run.state == models.State.RUNNING assert dag_run.external_trigger == False
def setUp(self): configuration.load_test_config() args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE } self.dag = DAG(TEST_DAG_ID, default_args=args) session = settings.Session() session.query(DagRun).delete() session.query(TaskInstance).delete() session.commit()
def setUp(self): if sys.version_info[0] == 3: raise unittest.SkipTest('SSHExecuteOperatorTest won\'t work with ' 'python3. No need to test anything here') configuration.load_test_config() from airflow.contrib.hooks.ssh_hook import SSHHook hook = mock.MagicMock(spec=SSHHook) hook.no_host_key_check = True hook.Popen.return_value.stdout = StringIO(u'stdout') hook.Popen.return_value.returncode = False args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID+'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.hook = hook self.dag = dag
def test_with_dag_run(self): value = False dag = DAG('shortcircuit_operator_test_with_dag_run', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE }, schedule_interval=INTERVAL) short_op = ShortCircuitOperator(task_id='make_choice', dag=dag, python_callable=lambda: value) branch_1 = DummyOperator(task_id='branch_1', dag=dag) branch_1.set_upstream(short_op) branch_2 = DummyOperator(task_id='branch_2', dag=dag) branch_2.set_upstream(branch_1) upstream = DummyOperator(task_id='upstream', dag=dag) upstream.set_downstream(short_op) dag.clear() logging.error("Tasks {}".format(dag.tasks)) dr = dag.create_dagrun( run_id="manual__", start_date=datetime.datetime.now(), execution_date=DEFAULT_DATE, state=State.RUNNING ) upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() self.assertEqual(len(tis), 4) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise value = True dag.clear() dr.verify_integrity() upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() self.assertEqual(len(tis), 4) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.NONE) else: raise
def setUp(self): super().setUp() configuration.load_test_config() self.dag = DAG( 'test_dag', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE}, schedule_interval=INTERVAL) self.addCleanup(self.dag.clear) freezer = freeze_time(FROZEN_NOW) freezer.start() self.addCleanup(freezer.stop)
def test_external_dag_sensor(self): other_dag = DAG( 'other_dag', default_args=self.args, end_date=DEFAULT_DATE, schedule_interval='@once') other_dag.create_dagrun( run_id='test', start_date=DEFAULT_DATE, execution_date=DEFAULT_DATE, state=State.SUCCESS) t = ExternalTaskSensor( task_id='test_external_dag_sensor_check', external_dag_id='other_dag', external_task_id=None, dag=self.dag ) t.run( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True )
def setUp(self): configuration.load_test_config() args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE, 'provide_context': True } dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args) dag.schedule_interval = '@once' self.dag = dag self.sensor = gcs_sensor.GoogleCloudStorageUploadSessionCompleteSensor( task_id='sensor', bucket='test-bucket', prefix='test-prefix/path', inactivity_period=12, poke_interval=10, min_objects=1, allow_delete=False, previous_num_objects=0, dag=self.dag ) self.last_mocked_date = datetime(2019, 4, 24, 0, 0, 0)
def setUp(self): self.dag = DAG('branch_operator_test', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE}, schedule_interval=INTERVAL) self.branch_op = BranchPythonOperator(task_id='make_choice', dag=self.dag, python_callable=lambda: 'branch_1') self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag) self.branch_1.set_upstream(self.branch_op) self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag) self.branch_2.set_upstream(self.branch_op) self.dag.clear()
def setUp(self): self.dag = DAG('shortcircuit_operator_test', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE}, schedule_interval=INTERVAL) self.short_op = ShortCircuitOperator(task_id='make_choice', dag=self.dag, python_callable=lambda: self.value) self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag) self.branch_1.set_upstream(self.short_op) self.upstream = DummyOperator(task_id='upstream', dag=self.dag) self.upstream.set_downstream(self.short_op) self.dag.clear() self.value = True
def addition():
    logging.info(f"2 + 2 = {2 + 2}")


def subtraction():
    logging.info(f"6 - 2 = {6 - 2}")


def division():
    logging.info(f"10 / 2 = {int(10 / 2)}")


dag = DAG(
    "lesson1.exercise3",
    schedule_interval='@hourly',
    start_date=datetime.datetime.now() - datetime.timedelta(days=1))

hello_world_task = PythonOperator(
    task_id="hello_world",
    python_callable=hello_world,
    dag=dag)

#
# Define an addition task that calls the `addition` function above
#
addition_task = PythonOperator(
    task_id='addition',
    python_callable=addition,
    dag=dag)
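# Following the pattern of addition_task above, the remaining callables could be
# wired in as tasks as well; a minimal sketch assuming task ids 'subtraction' and
# 'division' and a simple linear ordering, none of which appear in the original
# exercise snippet.
subtraction_task = PythonOperator(
    task_id='subtraction',
    python_callable=subtraction,
    dag=dag)

division_task = PythonOperator(
    task_id='division',
    python_callable=division,
    dag=dag)

# One possible ordering; the exercise may chain these differently.
hello_world_task >> addition_task >> subtraction_task >> division_task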
class LatestOnlyOperatorTest(unittest.TestCase):

    def setUp(self):
        super().setUp()
        configuration.load_test_config()
        self.dag = DAG(
            'test_dag',
            default_args={
                'owner': 'airflow',
                'start_date': DEFAULT_DATE},
            schedule_interval=INTERVAL)
        self.addCleanup(self.dag.clear)
        freezer = freeze_time(FROZEN_NOW)
        freezer.start()
        self.addCleanup(freezer.stop)

    def test_run(self):
        task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
import pprint as pp import airflow.utils.dates from airflow import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.dummy_operator import DummyOperator from datetime import datetime, timedelta default_args = { "owner": "airflow", "start_date": airflow.utils.dates.days_ago(1) } with DAG(dag_id="sleep_dag", default_args=default_args, schedule_interval="@daily") as dag: t1 = DummyOperator(task_id="t1") t2 = BashOperator(task_id="t2", bash_command="sleep 30") t1 >> t2
import datetime import airflow from airflow import DAG from airflow.operators.hive_operator import HiveOperator from datetime import timedelta default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': airflow.utils.dates.days_ago(0, hour=0, minute=0, second=1), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), 'provide_context': True } dag = DAG('mapr_hive_task_dag', default_args=default_args, description='MapR single task DAG', schedule_interval=timedelta(minutes=15)) insert_current_datetime = HiveOperator( task_id='insert_current_datetime_task', hql="insert into table datetimes values ('" + datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y") + "');", dag=dag) dag.doc_md = __doc__
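# The hql above interpolates datetime.datetime.now() at DAG-parse time, so the
# inserted value is fixed whenever the scheduler parses the file. A hedged
# alternative that resolves the timestamp at execution time instead, using the
# templated {{ ts }} macro (the task id here is illustrative only):
insert_execution_datetime = HiveOperator(
    task_id='insert_execution_datetime_task',
    hql="insert into table datetimes values ('{{ ts }}');",
    dag=dag)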
from datetime import datetime, timedelta

# DAG and BaseHook are used below, so import them as well.
from airflow import DAG
from airflow.hooks.base_hook import BaseHook
from operators.emr_spark_operator import EMRSparkOperator

default_args = {
    'owner': '*****@*****.**',
    'depends_on_past': False,
    'start_date': datetime(2016, 8, 18),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=30),
}

dag = DAG('bugzilla_dataset', default_args=default_args, schedule_interval='@daily')

connection_details = BaseHook.get_connection('bugzilla_db')
env = {
    "DATABASE_USER": connection_details.login,
    "DATABASE_PASSWORD": connection_details.password,
    "DATABASE_HOST": connection_details.host,
    "DATABASE_PORT": connection_details.port,
    "DATABASE_NAME": connection_details.schema,
}

t0 = EMRSparkOperator(
    task_id="update_bugs",
    job_name="Bugzilla Dataset Update",
class BaseSensorTest(unittest.TestCase): def setUp(self): configuration.load_test_config() args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE } self.dag = DAG(TEST_DAG_ID, default_args=args) session = settings.Session() session.query(DagRun).delete() session.query(TaskInstance).delete() session.commit() def _make_dag_run(self): return self.dag.create_dagrun( run_id='manual__', start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING ) def _make_sensor(self, return_value, **kwargs): poke_interval = 'poke_interval' timeout = 'timeout' if poke_interval not in kwargs: kwargs[poke_interval] = 0 if timeout not in kwargs: kwargs[timeout] = 0 sensor = DummySensor( task_id=SENSOR_OP, return_value=return_value, dag=self.dag, **kwargs ) dummy_op = DummyOperator( task_id=DUMMY_OP, dag=self.dag ) dummy_op.set_upstream(sensor) return sensor @classmethod def _run(cls, task): task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_ok(self): sensor = self._make_sensor(True) dr = self._make_dag_run() self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.SUCCESS) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_fail(self): sensor = self._make_sensor(False) dr = self._make_dag_run() with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.FAILED) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_soft_fail(self): sensor = self._make_sensor(False, soft_fail=True) dr = self._make_dag_run() self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: self.assertEquals(ti.state, State.SKIPPED) def test_soft_fail_with_retries(self): sensor = self._make_sensor( return_value=False, soft_fail=True, retries=1, retry_delay=timedelta(milliseconds=1)) dr = self._make_dag_run() # first run fails and task instance is marked up to retry with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.UP_FOR_RETRY) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) sleep(0.001) # after retry DAG run is skipped self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: self.assertEquals(ti.state, State.SKIPPED)
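# The BaseSensorTest above depends on a DummySensor helper that is not shown in
# this snippet. A minimal sketch of what it might look like, assuming the
# Airflow 1.10 import paths: a sensor whose poke() simply returns an injected
# fixed value.
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class DummySensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, return_value=False, **kwargs):
        super(DummySensor, self).__init__(**kwargs)
        self.return_value = return_value

    def poke(self, context):
        # Succeed or keep waiting depending on the injected value
        return self.return_value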
default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2017, 8, 17), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), 'catchup': False } timestamp = """{{ ts }}""" dag = DAG('salesforce_data_processing', default_args=default_args, schedule_interval=None, catchup=False) @provide_session def get_conn(conn_id, session=None): conn = (session.query(Connection).filter( Connection.conn_id == conn_id).first()) return conn ## Requires a slack connection stored with a token under ## the extra section with the format of {"token":"TOKEN_HERE"} ## The Conn Type can be left blank.
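# The comments above describe a Slack connection whose token is stored in the
# Extra field as {"token": "TOKEN_HERE"}. A minimal sketch of reading it back
# with the get_conn() helper defined above; the connection id 'slack' is an
# assumption, not something this snippet defines.
def get_slack_token(conn_id='slack'):
    conn = get_conn(conn_id)
    # extra_dejson parses the JSON stored in the connection's Extra field
    return conn.extra_dejson.get('token')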
from datetime import timedelta from airflow import DAG from airflow.operators.bash import BashOperator from airflow.operators.dummy_operator import DummyOperator from airflow.utils.dates import days_ago args = { 'owner': 'airflow', } dag = DAG( dag_id='example_bash_operator', default_args=args, schedule_interval='0 0 * * *', start_date=days_ago(2), dagrun_timeout=timedelta(minutes=60), tags=['example', 'example2'], params={"example_key": "example_value"}, ) run_this_last = DummyOperator( task_id='run_this_last', dag=dag, ) # [START howto_operator_bash] run_this = BashOperator( task_id='run_after_loop', bash_command='echo 1', dag=dag,
def setUp(self): args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE } self.dag = DAG(TEST_DAG_ID, default_args=args)
from airflow.operators.bash_operator import BashOperator from airflow import DAG from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'start_date': datetime.now() - timedelta(minutes=20) } dag = DAG(dag_id='run_telus_pipeline', default_args=default_args) t1_bash = """gsutil -m mv gs://telus_poc_input/sample_geo_data* gs://telus_poc_ready""" t2_bash = """ ts=$(date +%s) run_job='gcloud dataflow jobs run telus_dataflow_$ts --gcs-location gs://telus_poc_ready/dataflow --format="value(id)"' jobid=`eval $run_job` echo "SUBMITTED DATAFLOW JOB: $jobid" done=0 max=100 i=0 while : ; do if [[ $i -gt $max ]];then echo "Max wait exceeded for step, exiting..." exit -1 fi echo "Checking status..." check_status='gcloud dataflow jobs show '$jobid' --format="value(state)"' status=`eval $check_status` echo "DATAFLOW JOB with id $jobid is $status" if [[ $status == 'Done' ]]; then echo "Dataflow job done ... moving on" break
default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': dt.datetime(2017, 6, 1), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG('etl_social', default_args=default_args, schedule_interval=None) extract = BashOperator( task_id='extract', bash_command= 'cd ~/ETL_twiiter; export PYTHONPATH=.; python etl/task/extract.py -i ~/twitter_dados/input_twitter/ -o ~/twitter_dados/clean_tweets/', dag=dag) tag_sentiment = BashOperator( task_id='tag_sentiment', bash_command= 'cd ~/ETL_twiiter; export PYTHONPATH=.; python etl/task/tag_sentiments.py -i ~/twitter_dados/clean_tweets/ -o ~/twitter_dados/tag_sentiments/ -cl etl/data/class_nb.bin', dag=dag) indexes = BashOperator( task_id='indexer',
from airflow.contrib.operators.ecs_operator import ECSOperator from airflow.operators.sensors import TimeSensor # in Airflow 2.0 should be "from airflow.sensors ..." default_args = { 'owner': XXX, 'depends_on_past': False, 'start_date': datetime(2018, 5, 1), 'email': XXX, 'email_on_failure': True, 'email_on_retry': True, 'retries': 2, 'retry_delay': timedelta(minutes=5), } with DAG('etl_adwords', default_args=default_args, schedule_interval='@daily', max_active_runs=1) as dag: (TimeSensor(task_id='hold_on', target_time=time(hour=3), dag=dag) >> ECSOperator( task_id='run_ecs', task_definition='airflow-etl-adwords', cluster='ecs', overrides={ 'containerOverrides': [{ 'name': 'app', 'environment': [{ 'name': 'EXECUTION_DATE', 'value': '{{ ds }}' }, { 'name': 'RDS_ENDPOINT',
class OOBashSensor(BaseSensorOperator): def poke(self, context): retcode = subprocess.call(['sudo', '--non-interactive', '/usr/local/bin/docker-trampoline', self.task_id, context['ds'], context['execution_date'].isoformat(), (context['execution_date'] + context['dag'].schedule_interval).isoformat()] + self.params.get('argv', [])) if retcode == 42: return True elif retcode == 13: return False else: raise AirflowException('Unexpected exit code: {:d}'.format(retcode)) dag = DAG( dag_id='hist_canning', schedule_interval=timedelta(days=1), start_date=datetime(2012, 12, 5), #end_date=datetime(2017, 7, 7), # NB: end_date is included default_args={ 'retries': 1, }) # NB: removing an Operator from DAG leaves some trash in the database tracking # old state of that operator, but it seems to trigger no issues with 1.8.0 OOBashSensor(task_id='reports_raw_sensor', poke_interval=5*60, timeout=12*3600, retries=0, dag=dag) BashOperator(pool='datacollector_disk_io', task_id='canning', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='tar_reports_raw', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_s3_sync', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_s3_ls', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='reports_tgz_cleanup', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='canned_s3_sync', bash_command='shovel_jump.sh', dag=dag) BashOperator(pool='datacollector_disk_io', task_id='canned_s3_ls', bash_command='shovel_jump.sh', dag=dag)
from datetime import datetime from airflow import DAG from airflow.operators.python_operator import PythonOperator from airflow.operators.bash_operator import BashOperator from airflow.operators.postgres_operator import PostgresOperator from airflow.hooks.postgres_hook import PostgresHook dag = DAG('Metadata', description='Create metadata for fact table', schedule_interval='@daily', start_date=datetime(2019, 8, 1), catchup=False) count_facts = PostgresOperator(task_id='count_facts', sql=""" CREATE TABLE IF NOT EXISTS metadata ( checkpoint_date date, fact varchar, nrows integer ); INSERT INTO metadata (checkpoint_date, fact, nrows) ( select current_date, fact_type, count(fact_type) from fact group by fact_type ) ; """, postgres_conn_id='datawarehouse', autocommit=True, database='dwh', dag=dag)
default_args = { 'owner': 'Airflow', 'depends_on_past': False, 'start_date': datetime(2019, 11, 23), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 0, 'retry_delay': timedelta(minutes=2), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG('simple-dataflow', default_args=default_args) def csv_pop(data, *args, **kwargs): # print(args) # print(kwargs) # print(os.getcwd()) print("data: {}".format(data)) path = "data/csv/aapl.csv" df = pd.read_csv(path) if df.size > 0: item = df.iloc[0].to_dict() df.drop(0, inplace=True) df.to_csv(path, index=False)
from airflow.utils import TriggerRule today = datetime.today() default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime.combine(today, time(13, 00, 0)) - timedelta(days=1), 'email': ['*****@*****.**'], 'email_on_failure': True, 'retries': 3, 'retry_delay': timedelta(minutes=5), } TR = TriggerRule dag = DAG('ods_objectrocket', default_args=default_args, schedule_interval=timedelta(days=1)) script_folder = DAGS_FOLDER + '/../scripts/' t0 = BashOperator(task_id='ods_load_batch_0', bash_command=script_folder + 'ods_objectrocket/ods_load_batch_0.sh;', dag=dag) t1 = BashOperator(task_id='ods_load_batch_1', bash_command=script_folder + 'ods_objectrocket/ods_load_batch_1.sh;', dag=dag) t5 = BashOperator(task_id='verify_load', bash_command=script_folder + 'ods_archiving/checkDailyLoad.sh ods_objectrocket;', dag=dag, trigger_rule=TR.ALL_DONE)
class ShortCircuitOperatorTest(unittest.TestCase): def setUp(self): self.dag = DAG('shortcircuit_operator_test', default_args={ 'owner': 'airflow', 'start_date': DEFAULT_DATE}, schedule_interval=INTERVAL) self.short_op = ShortCircuitOperator(task_id='make_choice', dag=self.dag, python_callable=lambda: self.value) self.branch_1 = DummyOperator(task_id='branch_1', dag=self.dag) self.branch_1.set_upstream(self.short_op) self.branch_2 = DummyOperator(task_id='branch_2', dag=self.dag) self.branch_2.set_upstream(self.branch_1) self.upstream = DummyOperator(task_id='upstream', dag=self.dag) self.upstream.set_downstream(self.short_op) self.dag.clear() self.value = True def test_without_dag_run(self): """This checks the defensive against non existent tasks in a dag run""" self.value = False self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) session = Session() tis = session.query(TI).filter( TI.dag_id == self.dag.dag_id, TI.execution_date == DEFAULT_DATE ) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': # should not exist raise elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise self.value = True self.dag.clear() self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': # should not exist raise elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.NONE) else: raise session.close() def test_with_dag_run(self): self.value = False logging.error("Tasks {}".format(self.dag.tasks)) dr = self.dag.create_dagrun( run_id="manual__", start_date=datetime.datetime.now(), execution_date=DEFAULT_DATE, state=State.RUNNING ) self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() self.assertEqual(len(tis), 4) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.SKIPPED) else: raise self.value = True self.dag.clear() dr.verify_integrity() self.upstream.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) self.short_op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) tis = dr.get_task_instances() self.assertEqual(len(tis), 4) for ti in tis: if ti.task_id == 'make_choice': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'upstream': self.assertEquals(ti.state, State.SUCCESS) elif ti.task_id == 'branch_1' or ti.task_id == 'branch_2': self.assertEquals(ti.state, State.NONE) else: raise
    );
    COMMIT;
    """)

    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f'Youngest rider was born in {records[0][0]}')


#####################
# Initializing the DAG
#####################

dag = DAG(
    'lesson.demo1',
    start_date = datetime.datetime.now() - datetime.timedelta(days = 60),
    schedule_interval = "@monthly")

###########
# Operators
###########

create_table = PostgresOperator(
    task_id = 'create_table',
    postgres_conn_id = 'redshift',
    sql = sql.CREATE_STATIONS_TABLE_SQL,
    dag = dag)

copy_task = PythonOperator(
    task_id = 'copy_to_redshift',
    python_callable = load_data_to_redshift,
    compressed_sample_filename = compress_sample(result_filename, config)
    generate_tabix(compressed_sample_filename, config)
    copy_result(compressed_sample_filename, sample_id, config)


default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime(2020, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("filter-vcf", default_args=default_args, schedule_interval=None,
          concurrency=20000, max_active_runs=20000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

filter_task = PythonOperator(
    task_id="filter_variants",
    python_callable=filter_variants,
    provide_context=True,
    dag=dag)
import datetime

import airflow
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.hive_operator import HiveOperator

DAG_ARGS = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(0, hour=1),
}

PROJECT_DAG = DAG(dag_id='final_project', default_args=DAG_ARGS,
                  schedule_interval=datetime.timedelta(hours=1))

SENTINEL_START = DummyOperator(task_id='sentinel_start', dag=PROJECT_DAG)

MIMIC_ETL_FILES = [
    'admissions',
    'callout',
    'caregivers',
    'chartevents',
    'cptevents',
    'datetimeevents',
    'diagnoses_icd',
    'drgcodes',
    'd_cpt',
    'd_icd_diagnoses',
import json
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.sensors.http_sensor import HttpSensor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator', default_args=default_args)
dag.doc_md = __doc__

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = SimpleHttpOperator(
    task_id='post_op',
    endpoint='api/v1.0/nodes',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: True if len(response.json()) == 0 else False,
    dag=dag)

t5 = SimpleHttpOperator(
    task_id='post_op_formenc',
    endpoint='nodes/url',
from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'description': 'Wikipedia assistant', 'depends_on_past': False, 'start_date': datetime(2020, 7, 28), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5) } dag = DAG('Wiki_Analyzer', default_args=default_args, schedule_interval=timedelta(days=20)) def validate_connections(): print("Connections validation code goes here") validate_connectivity = PythonOperator( task_id='validate_connectivity', provide_context=True, python_callable=validate_connections, dag=dag, ) extract_wiki_data = BashOperator(
retailer_name = 'portland' store_group_id = 1900020001181 category = 'Baseline Forecasting' version = 'v1' family = 'F1' args = { 'owner': 'Jonas', 'email': ['*****@*****.**'], 'depends_on_past': False, 'start_date': airflow.utils.dates.days_ago(0) } dag = DAG(dag_id='Portland-1900020001181', default_args=args, tags=['Baseline-Forecast', 'Portland']) # # You can also access the DagRun object in templates #baseline-denver-500020000113 #define cluster to use new_cluster = { 'spark_version': '7.0.x-scala2.12', 'node_type_id': 'Standard_F16s', 'driver_node_type_id': 'Standard_D16s_v3', 'num_workers': 10 } #define cluster to use
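# The comment above notes that the DagRun object can be accessed in templates.
# A minimal sketch, assuming a BashOperator import that is not part of this
# snippet; {{ dag_run }} and {{ ds }} are standard Airflow template variables.
from airflow.operators.bash_operator import BashOperator

show_dag_run = BashOperator(
    task_id='show_dag_run',
    bash_command='echo "run_id={{ dag_run.run_id }} ds={{ ds }}"',
    dag=dag)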
import json
from datetime import timedelta

from airflow import DAG
from airflow.providers.http.operators.http import SimpleHttpOperator
from airflow.providers.http.sensors.http import HttpSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator', default_args=default_args, tags=['example'],
          start_date=days_ago(2))
dag.doc_md = __doc__

# task_post_op, task_get_op and task_put_op are examples of tasks created by instantiating operators
# [START howto_operator_http_task_post_op]
task_post_op = SimpleHttpOperator(
    task_id='post_op',
    endpoint='post',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: response.json()['json']['priority'] == 5,
    dag=dag,
)
# [END howto_operator_http_task_post_op]
default_args = { 'owner': 'Guto - IGTI', 'depends_on_past': False, 'start_date': datetime(2021, 3, 13, 20, 40), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1) } # Definição da DAG - Fluxo dag = DAG( "treino-04", description="Paralelismos", default_args=default_args, #schedule_interval=timedelta(minutes=2) schedule_interval="*/10 * * * *" ) start_preprocessing = BashOperator( task_id='start_preprocessing', bash_command = 'echo "Start Preprocessing! Vai!"', dag=dag ) get_data = BashOperator( task_id='get_data', bash_command = 'curl http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip -o /usr/local/airflow/data/microdados_enade_2019.zip', dag=dag )
# Global variables that are set using environment varaiables GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL') GE_TUTORIAL_PROJECT_PATH = os.getenv('GE_TUTORIAL_PROJECT_PATH') default_args = { "owner": "Airflow", "start_date": airflow.utils.dates.days_ago(1) } # The DAG definition dag = DAG( dag_id='ge_tutorials_dag', default_args=default_args, schedule_interval=None, ) def load_files_into_db(ds, **kwargs): """ A method to simply load CSV files into a database using SQLAlchemy. """ engine = create_engine(GE_TUTORIAL_DB_URL) with engine.connect() as conn: conn.execute("drop table if exists npi_small cascade ") conn.execute("drop table if exists state_abbreviations cascade ")
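# One way the load_files_into_db callable above could be wired into the DAG;
# the task id and the classic PythonOperator import are assumptions made for
# illustration, not part of the original tutorial snippet.
from airflow.operators.python_operator import PythonOperator

task_load_files_into_db = PythonOperator(
    task_id='load_files_into_db',
    python_callable=load_files_into_db,
    provide_context=True,  # passes ds and the other context kwargs the callable expects
    dag=dag,
)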
"ActionOnFailure": "TERMINATE_JOB_FLOW", "HadoopJarStep": { "Jar": "command-runner.jar", "Args": [ "/usr/local/bin/processlogs", "--domain", "versioncheck.allizom.org", "--bucket", "amo-metrics-logs-stage", "--date", "{{ ds }}" ] } } ] blp_dag = DAG( 'mango_log_processing_adi', default_args=DEFAULT_ARGS, dagrun_timeout=timedelta(hours=6), schedule_interval='0 3 * * *' ) blp_logs = EmrCreateJobFlowOperator( task_id='blp_create_job_flow', job_flow_overrides={'Steps': BLP_STEPS}, aws_conn_id='aws_data_iam', emr_conn_id='emr_data_iam_mango', dag=blp_dag ) blp_job_sensor = EmrJobFlowSensor( task_id='blp_check_job_flow', job_flow_id="{{ task_instance.xcom_pull('blp_create_job_flow', key='return_value') }}", aws_conn_id='aws_data_iam',
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow import DAG
from operators import CreateTablesOperator, LoadFromCSVOperator, FetchDataFromDBOperator
from helpers import SqlQueries, pyhelpers
import os
from datetime import datetime, timedelta, date

default_args = {
    'owner': 'nani',
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}

with DAG('setup_base_data', default_args=default_args, schedule_interval='@once') as dag:
    start_task = DummyOperator(task_id='dummy_start')

    create_base_tables = CreateTablesOperator(
        task_id='create_dimension_and_fact_tables',
        sql_queries=SqlQueries.create_dim_and_fact_tables)

    get_data_for_dim_country = PythonOperator(
        task_id='get_dim_country_data',
        python_callable=pyhelpers.get_specific_columns_and_store_csv,
        op_kwargs={
            'source_file':
                os.getenv('path_to_data_folder') + "country_continent_isocodes.csv",
            'destination_file':
def create_dag(dag_id, schedule_interval, start_date): with DAG(dag_id=dag_id, schedule_interval=schedule_interval, start_date=start_date, default_args={ 'queue': 'jobs_queue', 'postgres_conn_id': postgres_connection, 'do_xcom_push': True }) as dag: # task definitions # print_log task @dag.task() def print_to_log(dag_id, database): logging.info(f'{dag_id} started processing tables in database: {database}') print_logs = print_to_log(dag_id, 'postgres') # bash task for getting user name get_user = BashOperator( task_id='get_user', bash_command='whoami', ) table_name = config[dag_id]['table_name'] # calls the function for checking table existence check_table_exist = BranchPythonOperator( task_id='check_table_exist', python_callable=check_if_table_exists, op_args=[table_name, postgres_connection], ) # inserts new row into the table insert_row = PostgresOperator( task_id='insert_new_row', sql=f''' INSERT INTO {table_name} VALUES (%s, \'{{{{ ti.xcom_pull(task_ids='get_user') }}}}\', %s); ''', parameters=[ uuid.uuid4().int % 123456789, datetime.now() ], trigger_rule=TriggerRule.NONE_FAILED, ) # fetches results from the table query_the_table = PostgreSQLCountRows( task_id='query_the_table', table_name=table_name, ) # creates a postgres table with table_name create_table = PostgresOperator( task_id='create_table', sql=f''' CREATE TABLE {table_name}( custom_id integer NOT NULL, user_name VARCHAR (50) NOT NULL, timestamp TIMESTAMP NOT NULL); ''', ) # setting task order print_logs >> get_user >> check_table_exist >> [create_table, insert_row] create_table >> insert_row >> query_the_table return dag
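# A sketch of how a factory like create_dag() is usually consumed: one DAG per
# entry in the config dict, registered in globals() so the scheduler can discover
# them. The config keys and the days_ago import are assumptions here.
from airflow.utils.dates import days_ago

for dag_id in config:
    globals()[dag_id] = create_dag(
        dag_id=dag_id,
        schedule_interval=None,
        start_date=days_ago(1),
    )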
number_of_tasks = int(number_of_tasks) task_names = ['process_chunk_' + str(k) for k in range(0, number_of_tasks)] default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2019, 9, 19), 'email': [my_email_address], 'email_on_failure': True, 'email_on_retry': True, 'retries': 1, 'retry_delay': timedelta(seconds=30), } dag = DAG(dag_name, catchup=False, default_args=default_args, schedule_interval=None) # the following tasks are created by instantiating operators dynamically def get_task(j, task_name_j): return SFTPToS3UploadPartOperator( task_id=task_name_j, conn_id_source=sftp_conn, file_source_path=source_path + filename, # access_key=access_key, # secret_key=secret_key, # session_token=session_token, upload_id=upload_id, bucket=bucket,
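# A sketch of how the dynamically named chunk tasks above might be instantiated
# and chained; running the uploads strictly in sequence is an assumption, the
# real DAG may fan them out in parallel instead.
chunk_tasks = [get_task(j, task_name_j) for j, task_name_j in enumerate(task_names)]

for previous_task, next_task in zip(chunk_tasks, chunk_tasks[1:]):
    previous_task >> next_task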
import KubernetesPodOperator from airflow.models import Variable DEFAULT_ARGS = { 'owner': 'de', 'email': '*****@*****.**', 'email_on_retry': False, 'retries': 3, 'retry_delay': timedelta(minutes=5), 'start_date': datetime(2020, 3, 17) } IMAGE_CONFIG = Variable.get('crosslend_images_config', deserialize_json=True) CONFIG = Variable.get('immobilienscout24_conf', deserialize_json=True) with DAG( 'flat-data-ingestion', default_args=DEFAULT_ARGS, schedule_interval='0 0 * * *' ) as dag: KubernetesPodOperator( namespace='Crosslend_Dataengineering', image=IMAGE_CONFIG['flat-data-ingestion'], cmds=["python", "main.py", "--config", json.dumps(CONFIG)], name="flat-data-ingestion", task_id="flat-data-ingestion", in_cluster=True )
""" Add a Markdown description to a DAG or a task. The description is shown in “Graph View” for DAGs, “Task Details” for tasks. Doc: https://airflow.readthedocs.io/en/latest/concepts.html#documentation-notes """ from airflow import DAG from airflow.operators.bash_operator import BashOperator from datetime import datetime default_args = { 'start_date': datetime.now() } dag = DAG( 'description_markdown', default_args=default_args) dag.doc_md = """ # Markdown hi ## Subheader Here's a [url](www.airbnb.com) My numbered list: 1. one 1. two My bulleted list: - first - second """
'depends_on_past': False, 'start_date': datetime(2019, 1, 1), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } # don't auto-schedule the dag # https://airflow.readthedocs.io/en/stable/scheduler.html dag = DAG('npmjs_static_2', default_args=default_args, schedule_interval=None) # periodically run the dag # dag = DAG('tutorial', default_args=default_args, schedule_interval=timedelta(days=1)) # load dep_tree for packages, relative to AIRFLOW_HOME npmjs_dep_path = "./dags/npmjs.with_stats.dep_graph_2.pickle" dep_tree = pickle.load(open(npmjs_dep_path, "rb")) logging.info("loaded dep_tree with %d nodes", dep_tree.number_of_nodes()) def get_sanitized_pkgname(pkg_name): invalid_name = re.compile(r'[^a-zA-Z0-9_.-]') pkg_name = re.sub(invalid_name, '..', pkg_name) return pkg_name
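# A sketch of how get_sanitized_pkgname() might be used to derive one task per
# package in dep_tree; the BashOperator import and the echo command are
# assumptions for illustration, the real pipeline builds different tasks.
from airflow.operators.bash_operator import BashOperator

for pkg_name in dep_tree.nodes():
    BashOperator(
        task_id='analyze_%s' % get_sanitized_pkgname(pkg_name),
        bash_command='echo "analyze %s"' % pkg_name,
        dag=dag)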
from airflow import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, LoadDimensionOperator, DataQualityOperator) from helpers import SqlQueries # AWS_KEY = os.environ.get('AWS_KEY') # AWS_SECRET = os.environ.get('AWS_SECRET') default_args = { 'owner': 'ranjith', 'start_date': datetime(2019, 1, 12), } dag = DAG('data_pipeline_dag', default_args=default_args, description='Load and transform data in Redshift with Airflow', schedule_interval='0 * * * *') start_operator = DummyOperator(task_id='Begin_execution', dag=dag) stage_events_to_redshift = StageToRedshiftOperator( task_id='Stage_events', dag=dag, redshift_conn_id="redshift", aws_credentials_id="aws_credentials", table="staging_events", s3_bucket="udacity-dend", s3_key="log_data", ) stage_songs_to_redshift = StageToRedshiftOperator(
class BaseSensorTest(unittest.TestCase): def setUp(self): configuration.load_test_config() args = { 'owner': 'airflow', 'start_date': DEFAULT_DATE } self.dag = DAG(TEST_DAG_ID, default_args=args) session = settings.Session() session.query(TaskReschedule).delete() session.query(DagRun).delete() session.query(TaskInstance).delete() session.commit() def _make_dag_run(self): return self.dag.create_dagrun( run_id='manual__', start_date=timezone.utcnow(), execution_date=DEFAULT_DATE, state=State.RUNNING ) def _make_sensor(self, return_value, **kwargs): poke_interval = 'poke_interval' timeout = 'timeout' if poke_interval not in kwargs: kwargs[poke_interval] = 0 if timeout not in kwargs: kwargs[timeout] = 0 sensor = DummySensor( task_id=SENSOR_OP, return_value=return_value, dag=self.dag, **kwargs ) dummy_op = DummyOperator( task_id=DUMMY_OP, dag=self.dag ) dummy_op.set_upstream(sensor) return sensor @classmethod def _run(cls, task): task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_ok(self): sensor = self._make_sensor(True) dr = self._make_dag_run() self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.SUCCESS) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_fail(self): sensor = self._make_sensor(False) dr = self._make_dag_run() with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.FAILED) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_soft_fail(self): sensor = self._make_sensor(False, soft_fail=True) dr = self._make_dag_run() self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: self.assertEquals(ti.state, State.SKIPPED) def test_soft_fail_with_retries(self): sensor = self._make_sensor( return_value=False, soft_fail=True, retries=1, retry_delay=timedelta(milliseconds=1)) dr = self._make_dag_run() # first run fails and task instance is marked up to retry with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.UP_FOR_RETRY) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) sleep(0.001) # after retry DAG run is skipped self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: self.assertEquals(ti.state, State.SKIPPED) def test_ok_with_reschedule(self): sensor = self._make_sensor( return_value=None, poke_interval=10, timeout=25, mode='reschedule') sensor.poke = Mock(side_effect=[False, False, True]) dr = self._make_dag_run() # first poke returns False and task is re-scheduled date1 = timezone.utcnow() with freeze_time(date1): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: # verify task is re-scheduled, i.e. 
state set to NONE self.assertEquals(ti.state, State.NONE) # verify one row in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 1) self.assertEquals(task_reschedules[0].start_date, date1) self.assertEquals(task_reschedules[0].reschedule_date, date1 + timedelta(seconds=sensor.poke_interval)) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # second poke returns False and task is re-scheduled date2 = date1 + timedelta(seconds=sensor.poke_interval) with freeze_time(date2): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: # verify task is re-scheduled, i.e. state set to NONE self.assertEquals(ti.state, State.NONE) # verify two rows in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 2) self.assertEquals(task_reschedules[1].start_date, date2) self.assertEquals(task_reschedules[1].reschedule_date, date2 + timedelta(seconds=sensor.poke_interval)) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # third poke returns True and task succeeds date3 = date2 + timedelta(seconds=sensor.poke_interval) with freeze_time(date3): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.SUCCESS) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_fail_with_reschedule(self): sensor = self._make_sensor( return_value=False, poke_interval=10, timeout=5, mode='reschedule') dr = self._make_dag_run() # first poke returns False and task is re-scheduled date1 = timezone.utcnow() with freeze_time(date1): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.NONE) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # second poke returns False, timeout occurs date2 = date1 + timedelta(seconds=sensor.poke_interval) with freeze_time(date2): with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.FAILED) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_soft_fail_with_reschedule(self): sensor = self._make_sensor( return_value=False, poke_interval=10, timeout=5, soft_fail=True, mode='reschedule') dr = self._make_dag_run() # first poke returns False and task is re-scheduled date1 = timezone.utcnow() with freeze_time(date1): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.NONE) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # second poke returns False, timeout occurs date2 = date1 + timedelta(seconds=sensor.poke_interval) with freeze_time(date2): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: self.assertEquals(ti.state, State.SKIPPED) def test_ok_with_reschedule_and_retry(self): sensor = self._make_sensor( return_value=None, poke_interval=10, timeout=5, retries=1, retry_delay=timedelta(seconds=10), mode='reschedule') sensor.poke = Mock(side_effect=[False, False, False, True]) dr = self._make_dag_run() # first poke returns False and task is re-scheduled date1 = timezone.utcnow() with 
freeze_time(date1): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.NONE) # verify one row in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 1) self.assertEquals(task_reschedules[0].start_date, date1) self.assertEquals(task_reschedules[0].reschedule_date, date1 + timedelta(seconds=sensor.poke_interval)) self.assertEqual(task_reschedules[0].try_number, 1) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # second poke fails and task instance is marked up to retry date2 = date1 + timedelta(seconds=sensor.poke_interval) with freeze_time(date2): with self.assertRaises(AirflowSensorTimeout): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.UP_FOR_RETRY) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # third poke returns False and task is rescheduled again date3 = date2 + timedelta(seconds=sensor.poke_interval) + sensor.retry_delay with freeze_time(date3): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.NONE) # verify one row in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 1) self.assertEquals(task_reschedules[0].start_date, date3) self.assertEquals(task_reschedules[0].reschedule_date, date3 + timedelta(seconds=sensor.poke_interval)) self.assertEqual(task_reschedules[0].try_number, 2) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # fourth poke return True and task succeeds date4 = date3 + timedelta(seconds=sensor.poke_interval) with freeze_time(date4): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.SUCCESS) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_should_include_ready_to_reschedule_dep(self): sensor = self._make_sensor(True) deps = sensor.deps self.assertTrue(ReadyToRescheduleDep() in deps) def test_invalid_mode(self): with self.assertRaises(AirflowException): self._make_sensor( return_value=True, mode='foo') def test_ok_with_custom_reschedule_exception(self): sensor = self._make_sensor( return_value=None, mode='reschedule') date1 = timezone.utcnow() date2 = date1 + timedelta(seconds=60) date3 = date1 + timedelta(seconds=120) sensor.poke = Mock(side_effect=[ AirflowRescheduleException(date2), AirflowRescheduleException(date3), True, ]) dr = self._make_dag_run() # first poke returns False and task is re-scheduled with freeze_time(date1): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: # verify task is re-scheduled, i.e. 
state set to NONE self.assertEquals(ti.state, State.NONE) # verify one row in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 1) self.assertEquals(task_reschedules[0].start_date, date1) self.assertEquals(task_reschedules[0].reschedule_date, date2) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # second poke returns False and task is re-scheduled with freeze_time(date2): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: # verify task is re-scheduled, i.e. state set to NONE self.assertEquals(ti.state, State.NONE) # verify two rows in task_reschedule table task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 2) self.assertEquals(task_reschedules[1].start_date, date2) self.assertEquals(task_reschedules[1].reschedule_date, date3) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) # third poke returns True and task succeeds with freeze_time(date3): self._run(sensor) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: self.assertEquals(ti.state, State.SUCCESS) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE) def test_reschedule_with_test_mode(self): sensor = self._make_sensor( return_value=None, poke_interval=10, timeout=25, mode='reschedule') sensor.poke = Mock(side_effect=[False]) dr = self._make_dag_run() # poke returns False and AirflowRescheduleException is raised date1 = timezone.utcnow() with freeze_time(date1): for dt in self.dag.date_range(DEFAULT_DATE, end_date=DEFAULT_DATE): TaskInstance(sensor, dt).run( ignore_ti_state=True, test_mode=True) tis = dr.get_task_instances() self.assertEquals(len(tis), 2) for ti in tis: if ti.task_id == SENSOR_OP: # in test mode state is not modified self.assertEquals(ti.state, State.NONE) # in test mode no reschedule request is recorded task_reschedules = TaskReschedule.find_for_task_instance(ti) self.assertEquals(len(task_reschedules), 0) if ti.task_id == DUMMY_OP: self.assertEquals(ti.state, State.NONE)
from airflow import DAG from airflow.operators.bash_operator import BashOperator from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2015, 6, 1), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG('slacker', default_args=default_args, schedule_interval=timedelta(minutes=1)) t1 = BashOperator(task_id='hive', bash_command='hive -f /home/maria_dev/slackbot/slackbot.sql', dag=dag) t2 = BashOperator(task_id='ok', bash_command='echo "ok!"', dag=dag) t2.set_upstream(t1)
GROUP BY city ); COMMIT; """) def log_oldest(): redshift_hook = PostgresHook("redshift") records = redshift_hook.get_records(""" SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1 """) if len(records) > 0 and len(records[0]) > 0: logging.info(f"Oldest rider was born in {records[0][0]}") dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow()) load_and_analyze = PythonOperator( task_id='load_and_analyze', dag=dag, python_callable=load_and_analyze, provide_context=True, ) create_oldest_task = PostgresOperator(task_id="create_oldest", dag=dag, sql=""" BEGIN; DROP TABLE IF EXISTS older_riders; CREATE TABLE older_riders AS ( SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
from airflow import DAG from airflow.operators.dagrun_operator import TriggerDagRunOperator from airflow.sensors.external_task_sensor import ExternalTaskSensor from airflow.utils.dates import days_ago with DAG(dag_id="dag_referenced_task_dag_id_exists_fail", schedule_interval=None, start_date=days_ago(1)) as dag: TriggerDagRunOperator(task_id="test_trigger", trigger_dag_id="nonexistent") ExternalTaskSensor(task_id="test_sensor_dag", external_dag_id="nonexistent") ExternalTaskSensor(task_id="test_sensor_task", external_dag_id="nonexistent", external_task_id="non-task")