def test_infer_dag(self):
        dag = DAG('dag', start_date=DEFAULT_DATE)
        dag2 = DAG('dag2', start_date=DEFAULT_DATE)

        op1 = DummyOperator(task_id='test_op_1', owner='test')
        op2 = DummyOperator(task_id='test_op_2', owner='test')
        op3 = DummyOperator(task_id='test_op_3', owner='test', dag=dag)
        op4 = DummyOperator(task_id='test_op_4', owner='test', dag=dag2)

        # double check dags
        self.assertEqual(
            [i.has_dag() for i in [op1, op2, op3, op4]],
            [False, False, True, True])

        # can't combine operators with no dags
        self.assertRaises(AirflowException, op1.set_downstream, op2)

        # op2 should infer dag from op1
        op1.dag = dag
        op1.set_downstream(op2)
        self.assertIs(op2.dag, dag)

        # can't assign across multiple DAGs
        self.assertRaises(AirflowException, op1.set_downstream, op4)
        self.assertRaises(AirflowException, op1.set_downstream, [op3, op4])

    def test_set_dag(self):
        """
        Test assigning Operators to Dags, including deferred assignment
        """
        dag = DAG('dag', start_date=DEFAULT_DATE)
        dag2 = DAG('dag2', start_date=DEFAULT_DATE)
        op = DummyOperator(task_id='op_1', owner='test')

        # no dag assigned
        self.assertFalse(op.has_dag())
        self.assertRaises(AirflowException, getattr, op, 'dag')

        # no improper assignment
        with self.assertRaises(TypeError):
            op.dag = 1

        op.dag = dag

        # no reassignment
        with self.assertRaises(AirflowException):
            op.dag = dag2

        # but assigning the same dag is ok
        op.dag = dag

        self.assertIs(op.dag, dag)
        self.assertIn(op, dag.tasks)
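
    # Note (assumption, not visible in this excerpt): test_check_task_dependencies
    # below is normally driven by a parameterization decorator that supplies the
    # trigger_rule/successes/skipped/... arguments; a hypothetical example row:
    #   ('all_success', 5, 0, 0, 0, 5, True, None, True)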
    def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                     failed, upstream_failed, done,
                                     flag_upstream_failed,
                                     expect_state, expect_completed):
        start_date = datetime.datetime(2016, 2, 1, 0, 0, 0)
        dag = models.DAG('test-dag', start_date=start_date)
        downstream = DummyOperator(task_id='downstream',
                                   dag=dag, owner='airflow',
                                   trigger_rule=trigger_rule)
        for i in range(5):
            task = DummyOperator(task_id='runme_{}'.format(i),
                                 dag=dag, owner='airflow')
            task.set_downstream(downstream)
        run_date = task.start_date + datetime.timedelta(days=5)

        ti = TI(downstream, run_date)
        completed = ti.evaluate_trigger_rule(
            successes=successes, skipped=skipped, failed=failed,
            upstream_failed=upstream_failed, done=done,
            flag_upstream_failed=flag_upstream_failed)

        self.assertEqual(completed, expect_completed)
        self.assertEqual(ti.state, expect_state)
    catchup=False)

stage_int_sql_path = os.path.join(
    JOB_ARGS["stage_sql_path"],  #stage_sql_path = adlogs/load_raw_logs
    "int")
stage_int_hourly_query = SqlUtils.load_query(stage_int_sql_path).split("---")
stage_int_hourly_job = SnowflakeOperator(task_id="stage_adlogs_int_hourly",
                                         snowflake_conn_id=SF_CONN_ID,
                                         warehouse=SF_WAREHOUSE,
                                         database=SF_DATABASE,
                                         sql=stage_int_hourly_query,
                                         params={"env": ENV},
                                         autocommit=True,
                                         dag=DAG)

stage_int_tables = DummyOperator(task_id="finish_int_rl_staging")

stage_onetag_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], "onetag")
stage_onetag_hourly_query = SqlUtils.load_query(stage_onetag_sql_path).split(
    "---")
stage_onetag_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_onetag_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_onetag_hourly_query,
    params={"env": ENV},
    autocommit=True,
    dag=DAG)

stage_onetag_table = DummyOperator(task_id="finish_onetag_rl_staging")
with DAG(
        dag_id="external_task_marker_parent",
        start_date=start_date,
        schedule_interval='10 11 * * *',
) as parent_dag:

    parent_task = ExternalTaskMarker(
        task_id="parent_task",
        external_dag_id="external_task_marker_child",
        external_task_id="child_task1",
    )

with DAG(
        dag_id="external_task_marker_child",
        start_date=start_date,
        schedule_interval='20 11 * * *',
) as child_dag:

    child_task1 = ExternalTaskSensor(
        task_id="child_task1",
        external_dag_id=parent_dag.dag_id,
        external_task_id=parent_task.task_id
        # timeout=600,
        # allowed_states=['success'],
        # failed_states=['failed', 'skipped'],
        # mode="reschedule",
    )
    # [END howto_operator_external_task_sensor]
    child_task2 = DummyOperator(task_id="child_task2")
    child_task1 >> child_task2
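
    # Note (not in the original snippet): the parent DAG runs at 11:10 and the
    # child at 11:20, so with the sensor's default same-execution_date matching
    # it would likely also need execution_delta=datetime.timedelta(minutes=10)
    # (or an execution_date_fn) to line the two schedules up.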
from airflow.operators import ShortCircuitOperator, DummyOperator
from airflow.models import DAG
import airflow.utils.helpers
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_short_circuit_operator', default_args=args)

cond_true = ShortCircuitOperator(task_id='condition_is_True',
                                 python_callable=lambda: True,
                                 dag=dag)

cond_false = ShortCircuitOperator(task_id='condition_is_False',
                                  python_callable=lambda: False,
                                  dag=dag)

ds_true = [DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]]
ds_false = [DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]]

airflow.utils.helpers.chain(cond_true, *ds_true)
airflow.utils.helpers.chain(cond_false, *ds_false)
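
# For reference (hypothetical equivalent, not in the original): each chain()
# call above wires the operators linearly, i.e.
#   cond_true >> ds_true[0] >> ds_true[1]
#   cond_false >> ds_false[0] >> ds_false[1]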
Example #7
    "transform",  #goal of DAG is to transform new log data
    default_args=DEFAULTS,
    start_date=datetime(2018, 1, 1),
    schedule_interval=JOB_ARGS["schedule_interval"],
    catchup=False)

# sensor that waits on the completion of stage_ad_logs_to_snowflake
adlogs_sensor = ExternalTaskSensor(
    task_id="wait_for_stage",  # this sensor's name
    external_dag_id="stage_ad_logs_to_snowflake",  # DAG to reference
    external_task_id="adlogs_snowflake_staging_finish",  # task to wait on
    execution_delta=timedelta(minutes=5),
    dag=DAG)

#dummy op for finish task
transform_finish = DummyOperator(task_id="finish_transforms")

# loop through the .yaml table list and create sql transform tasks for relevant tables
for table in JOB_ARGS["tables"]:
    # empty list to hold the sql queries for a given table
    query_log = []
    # loop through the processes listed in the .yaml for this table
    for process in JOB_ARGS["tables"][table]:
        # set path to the .sql file; stage_sql_path = adlogs/log_process/filename.sql
        sql_path = os.path.join(JOB_ARGS["stage_sql_path"], process)
        # each .sql file holds one or more queries separated by '---'
        sql_query = SqlUtils.load_query(sql_path).split("---")
        query_log += sql_query
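
    # Hypothetical continuation (the rest of this snippet is not shown above):
    # turn the collected queries into one SnowflakeOperator per table, wired
    # between the sensor and the finish marker, mirroring the staging jobs
    # from the earlier example.
    transform_job = SnowflakeOperator(
        task_id="transform_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,
        params={"env": ENV},
        autocommit=True,
        dag=DAG)

    adlogs_sensor >> transform_job >> transform_finish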
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
DAG designed to test what happens when a DAG with pooled tasks is run
by a BackfillJob.
Addresses issue #1225.
"""
from datetime import datetime

from airflow.models import DAG
from airflow.operators import DummyOperator

dag = DAG(dag_id='test_backfill_pooled_task_dag')
task = DummyOperator(task_id='test_backfill_pooled_task',
                     dag=dag,
                     pool='test_backfill_pooled_task_pool',
                     owner='airflow',
                     start_date=datetime(2016, 2, 1))
Example #9
from airflow import DAG
from airflow.operators import DummyOperator, EmailOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(seconds=10),
    'retries': 0
}

dag = DAG('Sales_Nov',
          default_args=default_args,
          start_date=datetime.now() - timedelta(seconds=10))

op1 = DummyOperator(task_id='File1_landing', dag=dag)
t1 = EmailOperator(task_id='Processing_File_1',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 1 started",
                   dag=dag)
op2 = DummyOperator(task_id='File2_landing', dag=dag)
t2 = EmailOperator(task_id='Processing_File_2',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 2 started",
                   dag=dag)

op3 = DummyOperator(task_id='Aggregating', dag=dag)
op4 = DummyOperator(task_id='Final_Table_Push', dag=dag)

t1.set_upstream(op1)
Example #10
                    python_callable=compare_result,
                    trigger_rule="all_done",
                    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command=
    'jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location=
    "s3://paid-qubole/PigAPIDemo/scripts/script1-hadoop-s3-small.pig",
Example #11
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function, division, unicode_literals
from airflow import DAG
from airflow.operators import DummyOperator
from datetime import datetime, timedelta

yesterday = datetime.combine(datetime.today() - timedelta(7),
                             datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
}

dag = DAG('schedule',
          default_args=default_args,
          schedule_interval=timedelta(seconds=10))

t1 = DummyOperator(task_id='task1', dag=dag)
Example #12
from airflow.operators import DummyOperator
from airflow.operators.http_operator import SimpleHttpOperator
import airflow
from airflow.sensors.http_sensor import HttpSensor

DAG_NAME = 'HTTP_OPERATOR_TEST'
args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id=DAG_NAME,
    catchup=False,
    default_args=args,
    schedule_interval='3 12 * * *',
)

start_task = DummyOperator(task_id='starting_task', dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='https_default',
                              method='GET',
                              endpoint='dog.ceo/api/breed/hound/images',
                              headers={"Content-Type": "application/json"},
                              xcom_push=True,
                              dag=dag)

t1 = SimpleHttpOperator(task_id='get_labrador',
                        method='GET',
                        http_conn_id='https_default',
                        endpoint='dog.ceo/api/breed/hound/images',
                        headers={"Content-Type": "application/json"},
                        xcom_push=True,
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)

    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""

    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
                date = '/'.join(
                    list(
                        re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )

    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from the passed url and
        return a list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )

    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """
    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
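
# Hypothetical usage of the factory above (the airport codes, schedule and
# default_args are illustrative, not taken from the source): register the
# generated DAG in globals() so the scheduler can discover it.
airpots_codes = (('EZE', 'GRU'), 'MIA')
dag_id = 'smiles_{}_{}_to_{}'.format(airpots_codes[0][0],
                                     airpots_codes[0][1],
                                     airpots_codes[1])
globals()[dag_id] = create_dag(dag_id,
                               schedule='@daily',
                               start_date=datetime(2020, 1, 1),
                               delta_sensor=10,
                               airpots_codes=airpots_codes,
                               default_args={'owner': 'airflow'})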
check_to_remove_op = BranchPythonOperator(
    task_id='check_to_remove',
    python_callable=check_to_remove,
    provide_context=True,
    dag=dag
)

check_to_update_op = BranchPythonOperator(
    task_id='check_to_update',
    python_callable=check_to_update,
    provide_context=True,
    dag=dag
)

update_scores_branch_op = DummyOperator(
    task_id='update_scores_branch',
    dag=dag
)
nothing_to_remove_op = DummyOperator(
    task_id='nothing_to_remove',
    dag=dag
)

nothing_to_update_op = DummyOperator(
    task_id='nothing_to_update',
    dag=dag
)

check_job_posting_to_be_updated_op.set_downstream(check_to_remove_op)
check_job_posting_to_be_updated_op.set_downstream(check_to_update_op)

check_work_experience_to_be_updated_op.set_downstream(check_to_remove_op)
Example #15
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime

from airflow.models import DAG
from airflow.operators import DummyOperator
DEFAULT_DATE = datetime(2100, 1, 1)

# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_start_date_scheduling', start_date=DEFAULT_DATE)
dag1_task1 = DummyOperator(task_id='dummy', dag=dag1, owner='airflow')
Example #16
def create_test_pipeline(suffix, trigger_rule, dag):

    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)

    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)

    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)

    join.set_upstream(skip_operator)
    join.set_upstream(always_true)

    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
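
# Hypothetical usage of the helper above (the DAG name and date are
# illustrative, and DAG/datetime are assumed to be imported as in the other
# examples): build one skip/trigger-rule pipeline per rule in a single DAG.
dag = DAG(dag_id='example_skip_dag', start_date=datetime(2016, 1, 1))
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)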
Example #17
remove_scores_op = PostgresOperator(
    task_id='remove_scores',
    postgres_conn_id='db1_etl',
    sql='scripts/postgres/remove_work_experience_job_post_scores.sql',
    dag=dag
)

update_scores_op = PostgresOperator(
    task_id='update_scores',
    postgres_conn_id='db1_etl',
    sql='scripts/postgres/update_work_experience_job_post_scores.sql',
    dag=dag
)

dummy_op = DummyOperator(task_id='compute_similarity_branching', dag=dag)

copy_scores_to_temp_table_op = BashOperator(
    task_id='copy_scores_to_temp_table',
    bash_command='scripts/bash/copy_scores_to_temp_table.sh',
    params={"partnum": 4},
    provide_context=True,
    dag=dag)

for option in np.arange(4):
    t = PythonOperator(
       task_id='compute_similarity_branch_%d' % option,
       python_callable=compute_similarity_score,
       params={'partnum': 4, 'partindex': option},
       provide_context=True,
       pool='high_memory_usage',
import sys

from qfl.etl.data_ingest import daily_equity_price_ingest

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)
run_this_last.set_upstream(t2)
Example #19
from airflow.operators import DummyOperator, PythonOperator
import airflow.hooks.S3_hook

default_args = {
    'owner': 'anitha',
    'start_date': datetime(2019, 1, 1),
    'retry_delay': timedelta(minutes=5)
}


def upload(filename, key, bucket_name):
    hook = airflow.hooks.S3_hook.S3Hook('anitha_s3')
    hook.load_file(filename, key, bucket_name)


# Using the context manager allows you to avoid passing dag=dag to each task
with DAG('S3', default_args=default_args, schedule_interval='@once') as dag:

    start_task = DummyOperator(task_id='dummy_start')

    upload_to_S3_task = PythonOperator(
        task_id='upload_file_to_S3',
        python_callable=upload,
        op_kwargs={
            'filename': '/home/ec2-user/airflow/dags/email.py',
            'key': 'email.py',
            'bucket_name': 'saksbucket',
        },
    )
    start_task >> upload_to_S3_task
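
    # For comparison (hypothetical): without the context manager each operator
    # above would need an explicit dag argument, e.g.
    #   start_task = DummyOperator(task_id='dummy_start', dag=dag)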
# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)

def should_run(ds, **kwargs):

    print("------------- exec dttm = {} and minute = {}".format(kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(
    task_id='condition',
    provide_context=True,
    python_callable=should_run,
    dag=dag)

oper_1 = DummyOperator(
    task_id='oper_1',
    dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(
    task_id='oper_2',
    dag=dag)
oper_2.set_upstream(cond)
Example #21
dir_extract_applist = os.path.join(project_dir, "extract_applist")
script_extract_applist = "extract-applist.sh"
dir_media_stats = os.path.join(project_dir, "media_stats")
script_media_stats = "media-daily-stats.sh"

# interval in days between now and the execution date
day_interval_between_execution_and_now = "{% set now_s = macros.time.mktime(macros.datetime.now().timetuple()) %} \
	{% set exe_s = macros.time.mktime(execution_date.timetuple()) %} \
	{% set interval_day = (now_s - exe_s)/(3600*24) %} \
	{{ interval_day|int }}"
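# Hypothetical usage (not shown in this excerpt): the template above is meant
# to be rendered by a templated operator field, e.g. appended to a
# BashOperator bash_command so the shell script receives the day offset.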

dag = DAG(
    dag_id="dsp-report-daily", default_args=args, start_date=start_date_daily_rounded, schedule_interval="0 0 * * *"
)

start_task = DummyOperator(task_id="start_now", dag=dag)
end_task = DummyOperator(task_id="end_here", dag=dag, trigger_rule="all_done")


def gen_hourly_job_sensor(
    report_name_value="your_report_name",
    task_id_value=None,
    report_time_type_value="hourly",
    report_time_day_value="1970-01-01",
    mysql_connid_value=mysql_conn_Id,
    table_value=success_job_table,
    parent_dag=dag,
):
    sql_template = "select case when count(*) >=24 \
	then 1 else 0 end  from {table} \
	where report_name = '{report_name}' \
default_args = {
    'owner': 'arnaud',
    'start_date': datetime(2020, 5, 27),
    'depends_on_past': False,
    'catchup': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# intended to run every morning (note: the '@daily' preset actually triggers at midnight)
dag = DAG('stock_current_etl_dag',
          default_args=default_args,
          schedule_interval='@daily')

start_task = DummyOperator(task_id='dummy_start', dag=dag)

upload_current_news_to_S3_task = PythonOperator(
    task_id='upload_current_news_to_S3',
    python_callable=current_news_etl,
    dag=dag)

upload_current_stock_to_S3_task = PythonOperator(
    task_id='upload_current_stock_to_S3',
    python_callable=current_stocks_etl,
    op_kwargs={
        'list_of_stocks': [
            'AAPL', 'INTC', 'TSLA', 'GILD', 'BA', 'AMZN', 'CBB', 'DAL', 'MMM',
            'MSFT'
        ],
        'ndays':
Example #23
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function, division, unicode_literals
from airflow import DAG
from airflow.operators import DummyOperator, ShortCircuitOperator
from airflow.utils import chain
from datetime import datetime, timedelta

yesterday = datetime.combine(datetime.today() - timedelta(7), datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': yesterday,
}

dag = DAG('skip', default_args=default_args)

t1 = DummyOperator(task_id='task1', dag=dag)
t2 = DummyOperator(task_id='task2', dag=dag)
t3 = DummyOperator(task_id='task3', dag=dag)

cond_true = ShortCircuitOperator(task_id='cond_t', python_callable=lambda: True, dag=dag)
cond_false = ShortCircuitOperator(task_id='cond_f', python_callable=lambda: False, dag=dag)

chain(t1, cond_true, t2)
chain(t1, cond_false, t3)
Example #24
#Defining SubDag structure
#############################################################################

default_args = {
    'owner': 'dale',
    'start_date': datetime(2019, 9, 1),
    'retry_delay': timedelta(minutes=.25)
    #,'concurrency': 22
}

one_dag = DAG(
    parent_dag_name, default_args=default_args, schedule_interval='@once'
)  #in production, need to update this to run once daily (add various dags and set variables in Airflow?)

#end dummy dag
start_task = DummyOperator(task_id='start_task', dag=one_dag)


# Dynamically creates a task that randomly selects which audit table to insert data into, distributing inserts to work around Snowflake table-locking issues
def create_dynamic_task_dist_audit(database_name, table):
    return PythonOperator(
        #provide_context=True,
        task_id='select_audit_table_' + database_name + '_' + table,
        pool='Pool_max_parallel_5',
        python_callable=distribute_audit_inserts,
        dag=one_dag)


# Creates the tasks dynamically. Each one will process one chunk of data.
def create_dynamic_task_tos3(database_name, table):
    return PythonOperator(
    True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})

print("TASKS: ", tasks)

dag = DAG('sfsc_review_new_airflow_process_tasks',
          default_args=default_args,
          schedule_interval=None)

start = DummyOperator(task_id='start', default_args=default_args, dag=dag)

process = SubDagOperator(
    task_id='process',
    subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process',
                        tasks, default_args),
    default_args=default_args,
    dag=dag,
)

start >> process
s3 = boto3.resource('s3')
s3_filename = 'train.csv'
local_filename = '/home/jennie/workspace/titanic/train.csv'
bucket_name = 'airflow-demo-09092019'

default_args = {
    'owner': 'Jennie',
    'start_date': datetime(2019, 1, 1),
    # 'retry_delay': timedelta(minutes=5)
}

dag = DAG('abbbbbbbbb_titanic_analysis', default_args=default_args, schedule_interval='@once')

# task 1: dummy =====================
dummy_task = DummyOperator(
    task_id='dummy_start',
    dag=dag
)

# task 2: Upload file =====================
upload_task = PythonOperator(
    task_id='upload_file_to_s3',
    python_callable=upload_file_to_s3,
    op_kwargs={
        's3': s3,
        'filename': local_filename,
        'key': s3_filename,
        'bucket_name': bucket_name
    },
    dag=dag
)
DAG_NAME = 'example_subdag_operator'

args = {
    'owner': 'airflow',
    'start_date': datetime(2016, 1, 1),
}

dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2017, 12, 19)
}

# Schedule this DAG to run once.
dag = DAG('ah_ftp_operator',
          description='FTPs with FTPOperator',
          schedule_interval='@once',
          start_date=datetime(2017, 12, 18),
          default_args=default_args)


with dag:
    # Dummy start DAG.
    kick_off_dag = DummyOperator(task_id='kick_off_dag')

    # Call the functions

    ftp_to_s3 = FTPToS3Operator(
        task_id='download_file',
        ftp_conn_id='astro_ftp',
        ftp_directory='/files/test_ah/sample_data.csv',
        local_path='test_data.csv',
        s3_conn_id='astronomer-s3',
        s3_bucket='astronomer-worflows-dev',
        s3_key='test_data.csv',
    )

    # A task won't start until the one before it does.
    # e.g. the upload won't start until the download task finishes.
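
    # Hypothetical wiring implied by the comment above (the original snippet is
    # truncated here): the dummy start task runs before the FTP-to-S3 transfer.
    kick_off_dag >> ftp_to_s3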
Example #29
from builtins import range
from airflow.operators import BashOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator', default_args=args)

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop', bash_command='echo 1', dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_'+i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
Example #30
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """

    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        max_active_runs=LANDSAT8.max_active_runs,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(
            CFG.templates_base_dir, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )

    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    download_task_ids = (task.task_id for task in download_tasks)
    create_original_package_task = PythonOperator(task_id="create_original_package",
                                  python_callable=create_original_package,
                                  op_kwargs={
                                      'get_inputs_from': {
                                          "search_task_id"  : search_task.task_id,
                                          "download_task_ids" : download_task_ids,
                                      }
                                      ,
                                      'out_dir' : LANDSAT8.process_dir
                                  },
                                  dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need gdalinfo output on one of the granules
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    upload_task_ids = (task.task_id for task in upload_tasks)
    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id"  : search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids" : upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(CFG.templates_base_dir, "metadata.xml"),
        dag=dag
    )

    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # curl -vvv -u evoadmin:\! -XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
    publish_task = PythonOperator(task_id="publish_product_task",
                                  python_callable=publish_product,
                                  op_kwargs={
                                      'geoserver_username': CFG.geoserver_username,
                                      'geoserver_password': CFG.geoserver_password,
                                      'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format(
                                          CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection),
                                      'get_inputs_from': product_zip_task.task_id,
                                  },
                                  dag=dag)

    if CFG.eoxserver_rest_url:
        publish_eox_task = PythonOperator(task_id="publish_product_eox_task",
                                          python_callable=publish_product,
                                          op_kwargs={
                                              'geoserver_username': CFG.eoxserver_username,
                                              'geoserver_password': CFG.eoxserver_password,
                                              'geoserver_rest_endpoint': CFG.eoxserver_rest_url,
                                              'get_inputs_from': product_zip_task.task_id,
                                          },
                                          dag=dag)

    download_thumbnail.set_upstream(search_task)
    download_metadata.set_upstream(search_task)
    for tid in download_tasks:
        create_original_package_task.set_upstream(tid)
    upload_original_package_task.set_upstream(create_original_package_task)
    generate_metadata.set_upstream(join_task)
    generate_metadata.set_upstream(download_metadata)
    generate_metadata.set_upstream(upload_original_package_task)
    generate_thumbnail.set_upstream(download_thumbnail)
    generate_html_description.set_upstream(search_task)
    product_zip_task.set_upstream(generate_html_description)
    product_zip_task.set_upstream(generate_metadata)
    product_zip_task.set_upstream(generate_thumbnail)
    publish_task.set_upstream(upload_original_package_task)
    publish_task.set_upstream(product_zip_task)

    if CFG.eoxserver_rest_url:
        publish_eox_task.set_upstream(publish_task)

    return dag
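
# Hypothetical invocation of the factory above (the AREAS iterable, DOWNLOAD_DIR
# and DEFAULT_ARGS names are assumptions, not from the source): register one DAG
# per configured Landsat8 area so the scheduler can discover them.
for area in AREAS:
    area_dag = generate_dag(area, DOWNLOAD_DIR, default_args=DEFAULT_ARGS)
    globals()[area_dag.dag_id] = area_dag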
Example #31
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('simple', default_args=default_args)
t1 = DummyOperator(
    task_id='testairflow_1',
    # bash_command here suggests a BashOperator was intended; kept as written
    bash_command=r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task1_1.py',
    dag=dag)
t2 = DummyOperator(
    task_id='testairflow_2',
    bash_command=r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task1_2.py;'
                 r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task1_3.py;',
    dag=dag)
t3 = DummyOperator(
    task_id='testairflow_3',
    bash_command=r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task2_1.py;'
                 r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task2_2.py;'
                 r'python C:\Users\Lansrod\Desktop\truata project\pyspark_introduction\src\task2_3.py;',
    dag=dag)
DAG_NAME = 'example_subdag_operator'

args = {
    'owner': 'airflow',
    'start_date': datetime(2016, 1, 1),
}

dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)
# DAG tests depends_on_past dependencies
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    trigger_rule=TriggerRule.ALL_FAILED
)
Example #34
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)

run_this_last.set_upstream(t2)
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):

    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)
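
# Hypothetical continuation (the snippet is truncated above): each branch task
# would typically be created and wired into the join, mirroring the later
# example_branch_operator snippet in this collection.
for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    t.set_downstream(join)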
Example #37
from datetime import datetime, timedelta

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='dag2',
    default_args=args,
    schedule_interval="30 17 * * *"  # 这里可以填crontab时间格式
)

task0 = DummyOperator(task_id='task0', dag=dag)

cmd = 'ls -l'
task1 = BashOperator(task_id='task1', bash_command=cmd, dag=dag)

task0.set_downstream(task1)

task2 = DummyOperator(trigger_rule='all_done',
                      task_id='task2',
                      dag=dag,
                      depends_on_past=True)

task2.set_upstream(task1)

task3 = DummyOperator(trigger_rule='all_done',
                      depends_on_past=True,
from airflow.operators import PythonOperator, DummyOperator
# task 1: dummy =====================
# 3 types:

# sensor: keep running until a certain criteria is met
# HdfsSensor: Waits for a file or folder to land in HDFS

# operator: trigger a certain action (eg. call a function)
# PythonOperator

# transfer: move data from one location to another
# S3ToRedshiftTransfer: load files from s3 to Redshift

dummy_task = DummyOperator(
    task_id='dummy_start',
    # dag container
    dag=dag)

# task 2: Upload file =====================
def mua_may_quay_cuong(bai_hat):
    # demo callable: print which song is being played
    print('Playing song', bai_hat)


di_bar = PythonOperator(
    task_id='di_bar',
    # function that is invoked
    python_callable=mua_may_quay_cuong,
    # function arguments
    op_kwargs={'bai_hat': 'Vinahouse'},
    dag=dag)
Example #39
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
Example #40
dag2 = DAG(dag_id='test_depends_on_past', start_date=DEFAULT_DATE)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
    owner='airflow')

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', start_date=DEFAULT_DATE)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    owner='airflow',
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
    owner='airflow')
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', start_date=DEFAULT_DATE)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    owner='airflow',
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    owner='airflow',