True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})

print("TASKS: ", tasks)

dag = DAG('sfsc_review_new_airflow_process_tasks',
          default_args=default_args,
          schedule_interval=None)

start = DummyOperator(task_id='start', default_args=default_args, dag=dag)

process = SubDagOperator(
    task_id='process',
    subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process',
                        tasks, default_args),
    default_args=default_args,
    dag=dag,
)

start >> process
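The `subdag_tasks` factory is not shown in this excerpt. A hypothetical sketch of the contract it has to satisfy: return a DAG whose dag_id is the parent dag_id plus `.` plus the SubDagOperator's task_id, with one task per entry in `tasks` (DummyOperator stands in for the real per-task work):

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator


def subdag_tasks(parent_dag_name, child_dag_name, tasks, args):
    sub = DAG(dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
              default_args=args,
              schedule_interval=None)
    for i, _task in enumerate(tasks):
        # stand-in for whatever per-task processing the real factory builds
        DummyOperator(task_id='task_{}'.format(i), default_args=args, dag=sub)
    return sub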
gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)


email_links = EmailOperator(task_id='email_best_links',
                            to='*****@*****.**',
                            subject='Latest popular links',
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)


clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)


gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag)  # assumed closing argument; the source example is truncated here
Example 3
            "export_data_hql": export_table2_to_gcp_hql,
            "add_dataproc_partition_hql": add_dataproc_table2_partition_hql,
            "drop_tmp_table_hql": drop_tmp_table2_hql
        }
    }
}

export_table_params = {
    'gcp_warehouse_path': persist_cfg['gcp_warehouse']['casesci_prd'],
    'gcp_keyfile': persist_cfg['gcp_keyfile']['casesci_prd']
}

export_table_dataproc_config = {
    "dataproc_cluster": persist_cfg["gcp_prod_project"]["dataproc_cluster"],
    "region": persist_cfg["gcp_prod_project"]["region"],
    "gcp_conn_id": persist_cfg["gcp_prod_project"]["gcp_conn_id"]
}

for op_name in export_table_dict:
    export_table_dict[op_name]["operator"] = SubDagOperator(
        subdag=export_to_gcp_dag("crmdata_bids_bounds." + op_name,
                                 sync_data2GCP_dag.schedule_interval,
                                 persist_cfg['queue'],
                                 sync_data2GCP_dag.default_args,
                                 export_table_dict[op_name]["hql_dict"],
                                 export_table_params,
                                 export_table_dataproc_config),
        task_id=op_name,
        queue=persist_cfg['queue'],
        dag=sync_data2GCP_dag)
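`export_to_gcp_dag` is also not shown. A hypothetical sketch of its shape, consistent with the arguments passed above: it returns a DAG named `<parent>.<task_id>` and chains one task per HQL entry (DummyOperator stands in for the real Hive/Dataproc operators):

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator


def export_to_gcp_dag(dag_id, schedule_interval, queue, default_args,
                      hql_dict, export_params, dataproc_config):
    sub = DAG(dag_id=dag_id,
              schedule_interval=schedule_interval,
              default_args=default_args)
    prev = None
    for name in hql_dict:  # export_data_hql, add_dataproc_partition_hql, ...
        task = DummyOperator(task_id=name, queue=queue, dag=sub)
        if prev is not None:
            prev >> task  # run the HQL steps in sequence
        prev = task
    return sub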
Example 4
gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)

email_links = EmailOperator(
    task_id='email_best_links',
    to='*****@*****.**',
    subject='Latest popular links',
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag)  # assumed closing argument; the source example is truncated here

Example 5
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
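For reference, the `subdag` factory used by `section-1` and `section-2` above follows the pattern of Airflow's own example_subdag_operator module; a sketch modeled on it:

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator


def subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval='@daily',
    )
    for i in range(5):
        DummyOperator(
            task_id='{}-task-{}'.format(child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag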
Example 6
# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(
    dag_id='test_scheduled_queued_tasks',
    start_date=DEFAULT_DATE,
    end_date=DEFAULT_DATE,
    default_args=default_args)
dag8_task1 = PythonOperator(
    # use delayed_fail because otherwise LocalExecutor will have a chance to
    # complete the task
    python_callable=delayed_fail,
    task_id='test_queued_task',
    dag=dag8)  # assumed closing argument; the source example is truncated here

Example 7
        start_task >> dt_s3
        dt_s3 >> dt_sf
        dt_sf >> end

    return one_dag


#############################################################################
# Defining main DAG structure
#############################################################################

main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once',
    # schedule_interval=timedelta(minutes=5),
    # max_active_runs=1,
    concurrency=10)

database_list = get_database_list(database_include_patterns)

# Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, i,
                                                     '@once'),
                             task_id=i,
                             dag=main_dag,
                             pool='Pool_max_parallel_5',
                             executor=LocalExecutor())
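The `database_sub_dag` factory is only partially visible (its tail, `start_task >> dt_s3 >> dt_sf >> end` followed by `return one_dag`, appears before the main DAG). A hypothetical sketch consistent with that wiring, with DummyOperator standing in for the real S3 and Snowflake transfer tasks and the task_ids assumed:

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator


def database_sub_dag(parent_dag_name, database_name, schedule_interval):
    one_dag = DAG(dag_id='{}.{}'.format(parent_dag_name, database_name),
                  default_args=default_args,
                  schedule_interval=schedule_interval)

    start_task = DummyOperator(task_id='start', dag=one_dag)
    dt_s3 = DummyOperator(task_id='dump_to_s3', dag=one_dag)         # stand-in: dump tables to S3
    dt_sf = DummyOperator(task_id='load_to_snowflake', dag=one_dag)  # stand-in: load S3 data into Snowflake
    end = DummyOperator(task_id='end', dag=one_dag)

    start_task >> dt_s3
    dt_s3 >> dt_sf
    dt_sf >> end

    return one_dag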