Example #1
def create_test_pipeline(suffix, trigger_rule, dag):

    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)

    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)

    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)

    join.set_upstream(skip_operator)
    join.set_upstream(always_true)

    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
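
DummySkipOperator is not a stock Airflow operator; snippets like this one usually define a small helper that always ends in the SKIPPED state. A minimal sketch, assuming Airflow 1.x import paths:

from airflow.exceptions import AirflowSkipException
from airflow.operators.dummy_operator import DummyOperator


class DummySkipOperator(DummyOperator):
    """A no-op operator that always finishes as SKIPPED."""

    def execute(self, context):
        raise AirflowSkipException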
Example #2
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(task_id='test_dagrun_fail',
                            dag=dag3,
                            python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(task_id='test_dagrun_succeed',
                           dag=dag4,
                           trigger_rule=TriggerRule.ALL_FAILED)
dag4_task2.set_upstream(dag4_task1)

# DAG tests that a Dag run that completes but has a root failure is marked failed
dag5 = DAG(dag_id='test_dagrun_states_root_fail', default_args=default_args)
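
The python_callable=fail references above assume a helper that raises unconditionally, so the task always ends in the FAILED state. A minimal sketch:

def fail():
    # fail on purpose so the resulting DagRun state can be asserted
    raise ValueError('Expected failure.')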
Example #3
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'
run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
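
One caveat with this pattern: every branch not chosen by BranchPythonOperator is skipped, so a fan-in task wired after the follow tasks would itself be skipped under the default all_success trigger rule. A hedged sketch of the usual fix (the join task below is an addition, not part of the original example):

# fan-in after the branches; 'one_success' keeps the join from being
# skipped along with the branches that were not taken
join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)
for option in options:
    join.set_upstream(dag.get_task('follow_' + option))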
Example #4
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """

    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(
            TEMPLATES_PATH, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )

    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    # materialise the ids as a list: a generator would be exhausted after a
    # single read and does not survive being passed around in op_kwargs
    download_task_ids = [task.task_id for task in download_tasks]
    create_original_package_task = PythonOperator(
        task_id="create_original_package",
        python_callable=create_original_package,
        op_kwargs={
            'get_inputs_from': {
                "search_task_id": search_task.task_id,
                "download_task_ids": download_task_ids,
            },
            'out_dir': LANDSAT8.process_dir
        },
        dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need the gdalinfo output for one of the bands
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    # again, a list rather than a single-use generator
    upload_task_ids = [task.task_id for task in upload_tasks]
    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id": search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids": upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(TEMPLATES_PATH, "metadata.xml"),
        dag=dag
    )

    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # curl -vvv -u evoadmin:\! -XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
    publish_task = PythonOperator(task_id="publish_product_task",
                                  python_callable=publish_product,
                                  op_kwargs={
                                      'geoserver_username': CFG.geoserver_username,
                                      'geoserver_password': CFG.geoserver_password,
                                      'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format(
                                          CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection),
                                      'get_inputs_from': product_zip_task.task_id,
                                  },
                                  dag=dag)

    download_thumbnail.set_upstream(search_task)
    download_metadata.set_upstream(search_task)
    for task in download_tasks:
        create_original_package_task.set_upstream(task)
    upload_original_package_task.set_upstream(create_original_package_task)
    generate_metadata.set_upstream(join_task)
    generate_metadata.set_upstream(download_metadata)
    generate_metadata.set_upstream(upload_original_package_task)
    generate_thumbnail.set_upstream(download_thumbnail)
    generate_html_description.set_upstream(search_task)
    product_zip_task.set_upstream(generate_html_description)
    product_zip_task.set_upstream(generate_metadata)
    product_zip_task.set_upstream(generate_thumbnail)
    publish_task.set_upstream(upload_original_package_task)
    publish_task.set_upstream(product_zip_task)

    return dag
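
A factory like this is normally invoked once per area at module import time so the scheduler can discover every generated DAG. A sketch of such a driver, where AREAS, DOWNLOAD_DIR and DEFAULT_ARGS are assumed to be defined elsewhere in the project:

# hypothetical driver loop; mirrors the globals() trick shown in Example #7
for area in AREAS:
    generated = generate_dag(area, DOWNLOAD_DIR, DEFAULT_ARGS)
    globals()[generated.dag_id] = generated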
Example #5
# take the 'source_count' branch only on the first Sunday of the month
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: ('source_count'
                             if datetime.now().day <= 7
                             and datetime.today().weekday() == 6
                             else 'ignore_not_sunday'),
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(task_id='email_success_' + dag.dag_id,
                      to=email_addr,
                      subject=dag.dag_id + ' [success] on ' +
                      datetime.now().strftime('%Y-%m-%d'),
                      html_content='Congratulations!',
                      trigger_rule='all_success',
                      dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
    dag=dag)

source_count.set_upstream(branching)
esucc.set_upstream(source_count)

ignore_not_sunday = DummyOperator(task_id='ignore_not_sunday', dag=dag)
ignore_not_sunday.set_upstream(branching)

join = DummyOperator(task_id='join', trigger_rule='all_success', dag=dag)
join << ignore_not_sunday
join << esucc
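
Because the lambda calls datetime.now(), the branch decision follows the wall clock rather than the schedule, so backfills of past dates branch on today's date. A variant pinned to the logical run date, written in the context-passing style used later on this page (a sketch, meant to replace the lambda version above):

def pick_branch(ds, **kwargs):
    # branch on the logical execution date instead of the wall clock
    exec_date = kwargs['execution_date']
    if exec_date.day <= 7 and exec_date.weekday() == 6:
        return 'source_count'
    return 'ignore_not_sunday'

branching = BranchPythonOperator(task_id='branching',
                                 provide_context=True,
                                 python_callable=pick_branch,
                                 dag=dag)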
Example #6
from qfl.etl.data_ingest import daily_equity_price_ingest

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)

run_this_last.set_upstream(t2)
Example #7
    # the excerpt begins mid-loop; a plausible reconstruction of the truncated
    # opener, assuming one HivePartitionSensor per declared dependency
    dependency_list = []
    for dependency in source['dependencies']:
        wait_for = HivePartitionSensor(
            task_id='wait_for_{}_{}'.format(dependency["db"], dependency["table"]),
            dag=dag,
            table='{}.{}'.format(dependency["db"], dependency["table"]),
            partition=dependency["partition"],
        )

        dependency_list.append(wait_for)

    # create the full path to the HQL file
    hql_file_path = os.path.join(os.path.dirname(__file__), source['hql'])
    print(hql_file_path)
    with open(hql_file_path, 'r') as hql_file:
        hql_query = hql_file.read()
    run_hive_query = HiveOperator(task_id='run_hive_query',
                                  dag=dag,
                                  hql="{{ local_hive_settings }}\n" + hql_query)

    # dummy task
    all_tasks = DummyOperator(task_id='all_tasks',
                              dag=dag,
                              on_success_callback=send_task_success)

    # mark dependencies
    for dependency in dependency_list:
        dependency.set_downstream(run_hive_query)

    all_tasks.set_upstream(run_hive_query)

    # So that multiple dags can be created
    # https://airflow.incubator.apache.org/faq.html#how-can-i-create-dags-dynamically
    globals()[dag_id] = dag
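
The excerpt above is the body of a per-source factory: dag, dag_id and source are bound by an enclosing loop that the excerpt omits. A hedged sketch of what that loop might look like (SOURCES and the id scheme are assumptions):

# hypothetical enclosing loop; each source dict carries the 'hql' and
# 'dependencies' keys used in the excerpt above
for source in SOURCES:
    dag_id = 'hive_etl_{}'.format(source['name'])
    dag = DAG(dag_id, default_args=default_args)
    # ... the excerpt above builds the sensors and the HiveOperator here ...
    globals()[dag_id] = dag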
Example #8
    t.set_upstream(dummy_op)
    t.set_downstream(create_temp_scores_table_op)

archive_trained_models_op = BashOperator(
    task_id='archive_trained_models',
    bash_command='scripts/bash/archive_trained_models.sh',
    dag=dag
)

notify_processing_completion_op = SlackAPIPostOperator(
    task_id='notify_processing_completion',
    token=Variable.get('slack_token'),
    channel='#engineering-commits',
    username='******',
    icon_url=Variable.get('tia_slack_icon_url'),
    text='*user_work_experience_job_posting_similarity_scores* has been refreshed on {{ts}}',
    dag=dag
)

create_temp_scores_table_op.set_downstream(copy_scores_to_temp_table_op)
copy_scores_to_temp_table_op.set_downstream(remove_scores_op)
copy_scores_to_temp_table_op.set_downstream(update_scores_op)
delete_temp_scores_table_op.set_upstream(remove_scores_op)
delete_temp_scores_table_op.set_upstream(update_scores_op)
delete_temp_scores_table_op.set_downstream(notify_processing_completion_op)

dummy_op.set_upstream(compute_title_feature_op)
dummy_op.set_upstream(compute_skill_feature_op)
dummy_op.set_upstream(compute_description_feature_op)
dummy_op.set_downstream(archive_trained_models_op)
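
The explicit set_upstream / set_downstream calls used here and the bitshift operators used in Example #5 are interchangeable ways to declare the same dependency; for instance:

# the following two lines declare the same edge
dummy_op.set_downstream(archive_trained_models_op)
dummy_op >> archive_trained_models_op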
Example #10
default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(seconds=10),
    'retries': 0
}

dag = DAG('Sales_Nov',
          default_args=default_args,
          start_date=datetime.now() - timedelta(seconds=10))

op1 = DummyOperator(task_id='File1_landing', dag=dag)
t1 = EmailOperator(task_id='Processing_File_1',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 1 started",
                   dag=dag)
op2 = DummyOperator(task_id='File2_landing', dag=dag)
t2 = EmailOperator(task_id='Processing_File_2',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 2 started",
                   dag=dag)

op3 = DummyOperator(task_id='Aggregating', dag=dag)
op4 = DummyOperator(task_id='Final_Table_Push', dag=dag)

t1.set_upstream(op1)
t2.set_upstream(op2)
op3.set_upstream(t1)
op3.set_upstream(t2)
op4.set_upstream(op3)
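
One caution about this example: a moving start_date such as datetime.now() - timedelta(seconds=10) changes on every parse of the file and is a well-known Airflow pitfall. A steadier variant, where the specific date is an arbitrary placeholder:

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2015, 11, 1),  # fixed date in the past
    'retries': 0
}

dag = DAG('Sales_Nov', default_args=default_args)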
Example #11
    schedule_interval="30 17 * * *"  # 这里可以填crontab时间格式
)

task0 = DummyOperator(task_id='task0', dag=dag)

cmd = 'ls -l'
task1 = BashOperator(task_id='task1', bash_command=cmd, dag=dag)

task0.set_downstream(task1)

task2 = DummyOperator(trigger_rule='all_done',
                      task_id='task2',
                      dag=dag,
                      depends_on_past=True)

task2.set_upstream(task1)

task3 = DummyOperator(trigger_rule='all_done',
                      depends_on_past=True,
                      task_id='task3',
                      dag=dag)

task3.set_upstream(task2)

# task4 runs an intentionally bogus command, so it always fails; task5 still
# runs because its trigger rule is 'all_done'
task4 = BashOperator(task_id='task4', bash_command='lsfds-ljss', dag=dag)

task5 = DummyOperator(trigger_rule='all_done', task_id='task5', dag=dag)

task5.set_upstream(task4)
task5.set_upstream(task3)
Example #12
args = {
    'owner': 'airflow',
    # reconstructed opener: the original snippet was truncated here;
    # depends_on_past matches the behaviour described in the comment below,
    # and the start date is an arbitrary placeholder
    'depends_on_past': True,
    'start_date': datetime(2016, 1, 1),
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):

    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
Example #15
nothing_to_remove_op = DummyOperator(
    task_id='nothing_to_remove',
    dag=dag
)

nothing_to_update_op = DummyOperator(
    task_id='nothing_to_update',
    dag=dag
)

check_job_posting_to_be_updated_op.set_downstream(check_to_remove_op)
check_job_posting_to_be_updated_op.set_downstream(check_to_update_op)

check_work_experience_to_be_updated_op.set_downstream(check_to_remove_op)
check_work_experience_to_be_updated_op.set_downstream(check_to_update_op)

update_scores_branch_op.set_upstream(check_to_update_op)
remove_scores_op.set_upstream(check_to_remove_op)
nothing_to_remove_op.set_upstream(check_to_remove_op)
nothing_to_update_op.set_upstream(check_to_update_op)

notify_processing_completion_op.set_upstream(nothing_to_remove_op)
notify_processing_completion_op.set_upstream(nothing_to_update_op)

update_scores_branch_op.set_downstream(compute_title_feature_op)
update_scores_branch_op.set_downstream(compute_skill_feature_op)
update_scores_branch_op.set_downstream(compute_description_feature_op)

compute_similarity_op.set_upstream(compute_title_feature_op)
compute_similarity_op.set_upstream(compute_skill_feature_op)
compute_similarity_op.set_upstream(compute_description_feature_op)
compute_similarity_op.set_downstream(update_scores_op)
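
Note that check_to_remove_op and check_to_update_op act as branch points here: only one of remove_scores_op / nothing_to_remove_op (and likewise for the update pair) actually runs, so a join such as notify_processing_completion_op would be skipped under the default all_success rule. A hedged sketch of the usual adjustment, reusing the names above:

# 'none_failed' (Airflow 1.10.2+) waits for every upstream task but
# tolerates skipped ones, so the notifier fires whichever branch ran;
# the trigger rule can also be passed to the constructor instead
notify_processing_completion_op.trigger_rule = 'none_failed'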
Example #16
t6 = PythonOperator(task_id='daily_optionworks_ingest',
                    python_callable=DailyOptionWorksIngest.launch,
                    dag=dag,
                    provide_context=True)

t7 = PythonOperator(task_id='daily_generic_index_price_ingest',
                    python_callable=DailyGenericIndexPriceIngest.launch,
                    dag=dag,
                    provide_context=True)

night_task_waiter = TimeSensor(task_id='night_task_2000_waiter',
                               target_time=dt.time(hour=20, minute=0),
                               dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

# t1.set_upstream(night_task_waiter)
t2.set_upstream(t1)
# t3.set_upstream(night_task_waiter)
# t5.set_upstream(night_task_waiter)
t2.set_upstream(t7)

run_this_last.set_upstream(t1)
run_this_last.set_upstream(t2)
run_this_last.set_upstream(t3)
run_this_last.set_upstream(t4)
run_this_last.set_upstream(t5)
run_this_last.set_upstream(t6)
run_this_last.set_upstream(t7)