from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

# Midnight of the day seven days before today; used as the DAG start date.
seven_days_ago = datetime.combine(
    datetime.today() - timedelta(days=7),
    datetime.min.time(),
)

# Defaults handed to the DAG via default_args.
args = dict(owner='airflow', start_date=seven_days_ago)

dag = DAG(default_args=args, dag_id='example_branch_operator')

cmd = 'ls -l'
run_this_first = DummyOperator(dag=dag, task_id='run_this_first')

# Task ids the branching task can choose between.
options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

# Picks one of the option task ids at random each time the callable runs.
branching = BranchPythonOperator(
    dag=dag,
    task_id='branching',
    python_callable=lambda: random.choice(options),
)
run_this_first.set_downstream(branching)

# For every option: branching -> <option> -> follow_<option>.
for option in options:
    t = DummyOperator(dag=dag, task_id=option)
    branching.set_downstream(t)
    dummy_follow = DummyOperator(dag=dag, task_id='follow_' + option)
    dummy_follow.set_upstream(t)
# Example #2
# 0
# Runs compare_result once t1 and t2 are both done (trigger rule "all_done").
t3 = PythonOperator(
    dag=dag,
    task_id='compare_result',
    python_callable=compare_result,
    provide_context=True,
    trigger_rule="all_done",
)

t1.set_downstream(t3)
t2.set_downstream(t3)

# Task ids the branching task can choose between.
options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

# Picks one of the option task ids at random each time the callable runs.
branching = BranchPythonOperator(
    dag=dag,
    task_id='branching',
    python_callable=lambda: random.choice(options),
)
t3.set_downstream(branching)

# Declared with trigger rule 'one_success'.
join = DummyOperator(dag=dag, task_id='join', trigger_rule='one_success')

# Hadoop streaming job submitted through Qubole on the 'default' cluster.
t4 = QuboleOperator(
    dag=dag,
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


# Wraps `subdag` as a single task; declared with trigger rule 'one_success'.
sub = SubDagOperator(
    dag=dag,
    task_id='insert_and_id_pop',
    subdag=subdag,
    trigger_rule='one_success',
)


# Shell task that removes latest_links.txt under RAW_TWEET_DIR.
clear_latest = BashOperator(
    dag=dag,
    task_id='clear_latest',
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
)


fill_search_terms.set_downstream(gen_search_terms)

# One search task per term: gen_search_terms -> search_<term>_twitter -> sub.
for term in SEARCH_TERMS:
    # Only word characters of the term are kept for the task id.
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        dag=dag,
        task_id='search_{}_twitter'.format(term_without_punctuation),
        python_callable=search_twitter,
        provide_context=True,
        params={'query': term},
    )
    gen_search_terms.set_downstream(simple_search)
    sub.set_upstream(simple_search)

# sub -> email_links -> clear_latest
email_links.set_upstream(sub)
clear_latest.set_upstream(email_links)
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

# t3 runs downstream of both t1 and t2.
t1.set_downstream(t3)
t2.set_downstream(t3)

# Task ids the branching task can choose between.
options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

# Picks one of the option task ids at random each time the callable runs.
branching = BranchPythonOperator(
    dag=dag,
    python_callable=lambda: random.choice(options),
    task_id='branching',
)
t3.set_downstream(branching)


# Declared with trigger rule 'one_success'.
join = DummyOperator(dag=dag, trigger_rule='one_success', task_id='join')


t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
Exemple #5
0
    subject='Latest popular links',
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

# Wraps `subdag` as a single task; declared with trigger rule 'one_success'.
sub = SubDagOperator(
    task_id='insert_and_id_pop',
    trigger_rule='one_success',
    subdag=subdag,
    dag=dag,
)

# Shell task that removes latest_links.txt under RAW_TWEET_DIR.
clear_latest = BashOperator(
    task_id='clear_latest',
    dag=dag,
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
)

fill_search_terms.set_downstream(gen_search_terms)

# Fan out one Twitter search task per search term, all feeding into `sub`.
for term in SEARCH_TERMS:
    # The task id is built from the term with non-word characters removed.
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        python_callable=search_twitter,
        params={'query': term},
        provide_context=True,
        dag=dag,
    )
    gen_search_terms.set_downstream(simple_search)
    sub.set_upstream(simple_search)

# sub -> email_links -> clear_latest
email_links.set_upstream(sub)
clear_latest.set_upstream(email_links)
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):
    """Build the DAG that fetches, transforms and stores smiles.com.ar data.

    Task layout::

        start -> generate_url_dates (branch) -> get_data_* (one per date)
              -> transform_data -> insert_data

    Args:
        dag_id: Identifier passed to ``DAG``.
        schedule: Passed to ``DAG`` as ``schedule_interval``.
        start_date: Passed to ``DAG`` as ``start_date``.
        delta_sensor: Currently unused; kept so existing callers keep
            working (it was only referenced by a disabled TimeDeltaSensor).
        airpots_codes: Airport codes. ``[0][0]`` and ``[0][1]`` fill the
            first and second origin code of the URL and ``[1]`` fills both
            destination codes. Also passed to ``read_scraped_date``.
        default_args: Passed to ``DAG`` as ``default_args``.

    Returns:
        The configured ``DAG`` instance.
    """
    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)

    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### procesing and dumping on postgresql
    """

    # A TimeDeltaSensor gate driven by `delta_sensor` used to be sketched
    # here; the DAG currently starts from a plain DummyOperator instead.
    start = DummyOperator(task_id="start", dag=dag)

    # Task ids of every get_data_* task; filled in by the loop further down
    # and read lazily (at task run time) by the two callables below.
    branches = []

    def return_dates_branches(**kwargs):
        """Return the task ids of all get_data_* tasks."""
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        """Normalise the rows pulled from the get_data_* tasks.

        For every row: the date token in ``row[1]`` becomes ``YYYY-MM-DD``,
        the ``H:M`` value in ``row[4]`` becomes ``str(timedelta)``,
        ``row[5]`` and ``row[6]`` lose their ``.`` characters and become
        ints, ``row[8]`` keeps only its last space-separated token, and
        today's date is prepended.

        Returns:
            A list of tuples, or ``None`` when nothing was pulled.
        """
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        logging.info(raw_data)
        if raw_data is None:
            print('No se recibio datos')
            return None

        # Compiled once per run instead of once per row; a raw string
        # avoids the invalid "\d" escape of a plain string literal.
        # add À-ÿ for spanish accents
        date_pattern = re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)")

        data = []
        for sublist in raw_data:
            # Skip entries for which no value was pulled (presumably tasks
            # that pushed no XCom — verify against the Airflow version in
            # use); iterating None would raise TypeError.
            if sublist is None:
                continue
            for row in sublist:
                row = list(row)
                # Elements 2 and 3 of the split are the digit group and the
                # letter group that follows it, joined as "<digits>/<letters>".
                date = '/'.join(date_pattern.split(row[1])[2:4])
                date = dateparser.parse(
                    date,
                    languages=['pt', 'es'],
                    date_formats=['%d/%b']).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
        return data

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )

    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # One date per day, starting at the last scraped date and covering
    # AMOUNT_DAYS days.
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range((date_end - date_start).days)
    ]

    for i, date in enumerate(date_generated):
        # First 8 characters of the epoch-seconds value padded with five
        # zeros, i.e. a 13-character millisecond-style timestamp
        # (presumably what the site expects — TODO confirm).
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])

        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )

    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """
    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)

    return dag