Example #1
    bash_command='date',
    executor_config={"KubernetesExecutor": {
        "image": "ubuntu:latest"
    }},
    dag=dag,
)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml`, which get
rendered in the UI's Task Instance Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
    bash_command='sleep 5',
    executor_config={"KubernetesExecutor": {
        "image": "ubuntu:latest"
    }},
    dag=dag,
)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
Example #2
from airflow.sensors.http_sensor import HttpSensor

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('example_http_operator', default_args=default_args)

dag.doc_md = __doc__

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = SimpleHttpOperator(
    task_id='post_op',
    endpoint='api/v1.0/nodes',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    response_check=lambda response: len(response.json()) == 0,
    dag=dag)

t5 = SimpleHttpOperator(
    task_id='post_op_formenc',
    endpoint='nodes/url',
    data="name=Joe",
    headers={"Content-Type": "application/x-www-form-urlencoded"},
Example #3
    task_id='get_listings',
    python_callable=get_listings,
    dag=dag,
)

t2 = PythonOperator(
    task_id='send_email',
    provide_context=True,
    python_callable=send_email,
    dag=dag,
)

# noinspection PyStatementEffect
t1 >> t2

# Documentation
dag.doc_md = f"""
#### DAG Documentation
{dag.description}
"""

t1.doc_md = """
#### Task Documentation
Retrieves and stores Zoopla data
"""

t2.doc_md = """
#### Task Documentation
Sends email notification when new data is available
"""
Example #4
    'retry_delay': timedelta(minutes=2),
    'catchup': False,
    'email_on_retry': False
}

# DAG object creation
# Scheduler: https://airflow.apache.org/docs/stable/scheduler.html

dag = DAG(
    'arXiv_Redshift_dag',
    default_args=default_args,
    description='Load and transform data from S3 in Redshift with Airflow',
    schedule_interval='@once',
    catchup=True,
    max_active_runs=1)
dag.doc_md = """
### DAG Summary
This DAG describes the ETL process for ArXiv data from S3 to Redshift

### Points of Contact
Email: [email protected]
"""

############################
# Task Operators
############################

start_operator = DummyOperator(task_id='begin_execution', dag=dag)
start_operator.doc_md = """
# Dummy operator: Start of DAG
"""
Example #5
    dag=dag_main,
)

wrap_up = BashOperator(
    task_id='wrap_up',
    bash_command=f'python3 {work_dir}/src/update_run_time.py',
    dag=dag_main,
)
# ======================End of main pipeline=====================

# Ending Task
t_end = BashOperator(
    task_id='running_end',
    bash_command='echo Running End! Time: $(date +"%T")',
    trigger_rule='one_success',  # runs when at least one upstream branch succeeds
    dag=dag_main,
)

# Assemble the main running pipeline
dag_main >> t_start >> monitor >> t1 >> t2 >> t3 >> t4 >> t5 >> wrap_up >> t_end
# Branch if nothing to execute (no updates)
monitor >> check >> t_end

# DAG docs:
dag_main.doc_md = __doc__

t1.doc_md = """\
#### Task Documentation
Data ingestion of files. It ingests two parts.
"""
Example #6
from att_service_instance_funcs import *


args = {
    'start_date': datetime.utcnow(),
    'owner': 'ATT',
}

dag_att = DAG(
    dag_id='att_workflow_onu',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None,
)
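
# Sketch, not part of the original example: with schedule_interval=None this DAG
# never runs on a schedule and must be triggered externally, e.g. via the
# Airflow 1.x CLI (the --conf payload below is hypothetical):
#   airflow trigger_dag att_workflow_onu --conf '{"serial": "BRCM1234"}'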

dag_att.doc_md = __doc__


def ONU_event(model_accessor, message, **kwargs):
    #context = kwargs
    #run_id = context['dag_run'].run_id

    logging.info('onu.events: received event', message=message)

    si = find_or_create_att_si(model_accessor, logging, message)
    if message['status'] == 'activated':
        logging.info('onu.events: activated onu', message=message)
        si.no_sync = False
        si.uni_port_id = int(message['portNumber'])
        si.of_dpid = message['deviceId']
        si.oper_onu_status = 'ENABLED'
Example #7
                    ext_task = dag.get_task('wait_for_{}_{}'.format(
                        ext_dag_id, ext_task_id))
                    dummy = dag.get_task('{}_{}_finish'.format(
                        ext_dag_id, ext_task_id))
                except:
                    ext_task = ExternalTaskSensor(
                        task_id='wait_for_{}_{}'.format(
                            ext_dag_id, ext_task_id),
                        external_dag_id=ext_dag_id,
                        external_task_id=ext_task_id,
                        execution_delta=datetime.timedelta(
                            minutes=int(execution_delta)),
                        dag=dag)
                    dummy = DummyOperator(task_id='{}_{}_finish'.format(
                        ext_dag_id, ext_task_id),
                                          dag=dag)
                ext_task >> dummy >> dag.get_task(
                    job_dict.get(str(row.job_num)))
            else:
                dag.get_task(job_dict.get(str(dep_job_id))) >> dag.get_task(
                    job_dict.get(str(row.job_num)))

    dag.doc_md = md

    for task in filter(lambda x: x.task_id not in ('start', 'finish'),
                       dag.tasks):
        if not task.upstream_list:
            start >> task
        if not task.downstream_list:
            task >> finish
Example #8
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime, timedelta
from unicorn.airflow.util.unicorn_airflow_util import load_yaml

dag_id = "unicorn_get_ip_dag"
dir_path = os.path.dirname(os.path.realpath(__file__))
dag_config = load_yaml(os.path.join(dir_path, dag_id + ".yml"))

default_args = dag_config['default_args']
default_args['start_date'] = datetime.now()

dag = DAG(dag_id,
          default_args=dag_config["default_args"],
          schedule_interval=dag_config["schedule_interval"])

dag.doc_md = dag_config['doc_md']

task1 = BashOperator(task_id='TaskStart',
                     bash_command="echo {{params}}",
                     params={'cmd':dag_config["task1_cmd"]},
                     dag=dag)

task2 = BashOperator(task_id='UnicornGetIp',
                     depends_on_past=False,
                     bash_command=dag_config["task1_cmd"],
                     dag=dag)

task3 = DummyOperator(
    task_id='TaskFinish',
    dag=dag
)
Example #9
"""
Add a Markdown description to a DAG or a task.
The description is shown in “Graph View” for DAGs, “Task Details” for tasks.
Doc: https://airflow.readthedocs.io/en/latest/concepts.html#documentation-notes
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime

default_args = {
    'start_date': datetime.now()
}

dag = DAG(
    'description_markdown',
    default_args=default_args)
dag.doc_md = """
# Markdown hi
## Subheader
Here's a [url](www.airbnb.com)

My numbered list:

1. one
1. two

My bulleted list:

- first
- second
"""
Example #10
                               task_id='pod2',
                               is_delete_operator_pod=True,
                               hostnetwork=False,
                               )

    t3 = KubernetesPodOperator(namespace='airflow',
                               image="ubuntu:16.04",
                               cmds=["bash", "-cx"],
                               arguments=["echo", "hello world"],
                               labels={'runner': 'airflow'},
                               name="pod3",
                               task_id='pod3',
                               is_delete_operator_pod=True,
                               hostnetwork=False,
                               )

    t4 = KubernetesPodOperator(namespace='airflow',
                               image="ubuntu:16.04",
                               cmds=["bash", "-cx"],
                               arguments=["echo", "hello world"],
                               labels={'runner': 'airflow'},
                               name="pod4",
                               task_id='pod4',
                               is_delete_operator_pod=True,
                               hostnetwork=False,
                               )

    company_onboarding.doc_md = __doc__

    t1 >> [t2, t3] >> t4
Example #11
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)

    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""

    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
                date = '/'.join(
                    list(
                        re.compile("([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )

    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )

    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """
    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag
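
# Sketch, not part of the original example: the factory above only takes effect once it
# is called at import time so the scheduler can discover the returned DAG; every argument
# below (route, dates, default_args) is hypothetical:
dag = create_dag(
    dag_id='smiles_EZE_MAD',
    schedule='@daily',
    start_date=datetime(2021, 1, 1),
    delta_sensor=5,
    airpots_codes=(('EZE', 'AEP'), 'MAD'),
    default_args={'owner': 'airflow'},
)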
Example #12
def create_basiskaart_dag(is_first: bool, table_name: str,
                          select_statement: str) -> DAG:
    """Generates a DAG for each table.

    The table_name is the target table in the masterDB where the data will be inserted. The
    select_statement is one of the imported SQL query selects (see above) that will be executed
    on the source DB.
    """
    # start time first DAG
    # Note: the basiskaartimport task in Jenkins runs at an arbitrary but invariant time between
    # 3 and 5 a.m. Because of this, the first DAG starts running at 7 a.m.
    schedule_start_hour = 7

    dag = DAG(
        f"{dag_id}_{table_name}",
        default_args={
            "owner": owner,
            **default_args
        },
        # the first DAG will have the is_first boolean set to True
        # the other DAG's will be triggered to start when the previous DAG is finished
        # (estafette run / relay run)
        schedule_interval=f"0 {schedule_start_hour} * * *"
        if is_first else None,
        description="""
        basisregistratie grootschalige topografie (BGT) and kleinschalige basiskaart (KBK10 and 50).
        The basiskaart data is collected from the basiskaart DB.""",
        tags=["basiskaart"],
    )

    with dag:

        # 1. Post info message on slack
        slack_at_start = MessageOperator(
            task_id="slack_at_start",
            http_conn_id="slack",
            webhook_token=slack_webhook_token,
            message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})",
            username="******",
        )

        # 2. Create temp and target table
        create_tables = PostgresOperator(
            task_id="create_tables",
            sql=CREATE_TABLES,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 3. Copy data into temp table
        copy_data = PythonOperator(
            task_id="insert_data",
            python_callable=create_tables_from_basiskaartdb_to_masterdb,
            op_kwargs={
                "source_connection": source_connection,
                "source_select_statement": globals()[select_statement],
                "target_base_table": f"{dag_id}_{table_name}_temp",
            },
            dag=dag,
        )

        # 4. Check for changes in temp table to merge in target table
        change_data_capture = PgComparatorCDCOperator(
            task_id="change_data_capture",
            source_table=f"{dag_id}_{table_name}_temp",
            target_table=f"{dag_id}_{table_name}",
        )

        # 5. Create mviews for T-REX tile server
        create_mviews = PostgresOperator(
            task_id="create_mviews",
            sql=CREATE_MVIEWS,
            params=dict(base_table=table_name, dag_id=dag_id),
        )

        # 6. Rename COLUMNS based on Provenance
        provenance_translation = ProvenanceRenameOperator(
            task_id="rename_columns",
            dataset_name=dag_id,
            prefix_table_name=f"{dag_id}_",
            rename_indexes=False,
            pg_schema="public",
        )

        # 7. Drop temp table
        clean_up = PostgresOperator(
            task_id="drop_temp_table",
            sql=[
                f"DROP TABLE IF EXISTS {dag_id}_{table_name}_temp CASCADE",
            ],
        )

        # 8. Trigger next DAG to run (estafette)
        trigger_next_dag = TriggerDynamicDagRunOperator(
            task_id="trigger_next_dag",
            dag_id_prefix=f"{dag_id}_",
            trigger_rule="all_done",
        )

        # 9. Grant database permissions
        grant_db_permissions = PostgresPermissionsOperator(task_id="grants",
                                                           dag_name=dag_id)

    # Flow
    (slack_at_start >> create_tables >> copy_data >> change_data_capture >>
     create_mviews >> provenance_translation >> clean_up >> trigger_next_dag >>
     grant_db_permissions)

    dag.doc_md = """
    #### DAG summary
    This DAG contains BGT (basisregistratie grootschalige topografie)
    and KBK10 (kleinschalige basiskaart 10)
    and KBK50 (kleinschalige basiskaart 50) data
    #### Mission Critical
    Classified as 2 (availability [range: 1,2,3])
    #### On Failure Actions
    Fix issues
    and rerun dag on working days
    #### Point of Contact
    Inform the businessowner at [businessowner]@amsterdam.nl
    #### Business Use Case / process / origin
    NA
    #### Prerequisites/Dependencies/Resourcing
    https://api.data.amsterdam.nl/v1/docs/datasets/basiskaart.html
    Note: The basiskaart data is collected from the GOB objectstore
    and processed in the basiskaart DB
    => which is the source for this DAG.
    """

    return dag
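
# Sketch, not part of the original example: in this estafette (relay) setup the factory is
# typically called once per (table, select-statement) pair at import time and each DAG is
# exposed at module level for the scheduler; the table and statement names below are hypothetical:
for i, (table, select) in enumerate(
        [("bgt", "SELECT_BGT"), ("kbk10", "SELECT_KBK10"), ("kbk50", "SELECT_KBK50")]):
    globals()[f"{dag_id}_{table}"] = create_basiskaart_dag(
        is_first=(i == 0), table_name=table, select_statement=select)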
Example #13
from airflow.sensors.cord_workflow_plugin import CORDEventSensor, CORDModelSensor
from airflow.operators.cord_workflow_plugin import CORDModelOperator

log = logging.getLogger(__name__)
args = {
    # hard coded date
    'start_date': datetime(2019, 1, 1),
    'owner': 'iychoi'
}

dag_parallel_cord = DAG(
    dag_id='parallel_cord_workflow',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None)
dag_parallel_cord.doc_md = __doc__

dag_parallel_cord_admin = DAG(
    dag_id='parallel_cord_workflow_admin',
    default_args=args,
    # this dag will be triggered by external systems
    schedule_interval=None)
dag_parallel_cord_admin.doc_md = __doc__


def on_onu_event(model_accessor, message, **kwargs):
    log.info('onu.events: received an event - %s' % message)


def on_auth_event(model_accessor, message, **kwargs):
    log.info('authentication.events: received an event - %s' % message)