            ],
            'name': f"airflow-{air_env}-oaiharvester",
        }]
    },
    network_configuration=network_config)

# Monitor oaiharvester fargate task
monitor_harvest = ECSTaskSensor(task_id='harvest_step_2',
                                dag=dag,
                                cluster=cluster,
                                ecs_task_id='harvest_step_1')

check_records = BranchPythonOperator(
    task_id='records_check',
    dag=dag,
    provide_context=True,
    python_callable=check_if_records,
)

no_records_to_harvest = DummyOperator(task_id='no_records_to_harvest')

# Ingest with mario fargate task
ingest = ECSOperator(
    task_id='harvest_step_3',
    dag=dag,
    cluster=cluster,
    task_definition=f"airflow-{air_env}-mario",
    overrides={
        'containerOverrides': [{
            'command': [
                f"--url={es_url}",
orientation="TB", tags=['DECRYPT'], dagrun_timeout=timedelta(hours=1) ) as dag: t_pipeline_begin = PythonOperator( task_id="begin_pipeline", python_callable=begin_pipeline, provide_context=True, dag=dag, ) t_check_pipeline = BranchPythonOperator( task_id="check_pipeline", python_callable=pipeline_enable_check, provide_context=True, dag=dag, ) t_pipeline_check_passed = PythonOperator( task_id="pipeline_check_passed", python_callable=pipeline_check_passed, provide_context=True, dag=dag, ) t_pipeline_check_skipped = PythonOperator( task_id="pipeline_check_skipped", python_callable=pipeline_check_skipped, provide_context=True,
def extract_2g_externals(parent_dag_name, child_dag_name, start_date, schedule_interval):
    """
    Extract Huawei 2G externals from the live network.

    :param parent_dag_name:
    :param child_dag_name:
    :param start_date:
    :param schedule_interval:
    :return:
    """
    dag = DAG(
        '%s.%s' % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
    )

    branch_start = BranchPythonOperator(task_id='branch_huawei_2g_externals',
                                        python_callable=pre_clean_up,
                                        dag=dag)

    def extract_huawei_2g_externals():
        huawei_cm.extract_live_network_2g_externals_on_2g()
        huawei_cm.extract_live_network_3g_externals_on_2g()
        huawei_cm.extract_live_network_4g_externals_on_2g()

    t66 = PythonOperator(task_id='extract_huawei_2g_externals',
                         python_callable=extract_huawei_2g_externals,
                         dag=dag)

    t29 = BashOperator(
        task_id='run_huawei_4g_xml_nbi_parser',
        bash_command='java -jar /mediation/bin/boda-huaweinbixmlparser.jar '
                     '/mediation/data/cm/huawei/raw/nbi_lte '
                     '/mediation/data/cm/huawei/parsed/nbi_lte '
                     '/mediation/conf/cm/huawei_nbi_lte_parser.cfg',
        dag=dag)

    t29_2 = BashOperator(
        task_id='run_huawei_4g_mml_parser',
        bash_command='java -jar /mediation/bin/boda-huaweimmlparser.jar '
                     '/mediation/data/cm/huawei/raw/mml_lte '
                     '/mediation/data/cm/huawei/parsed/mml_lte '
                     '/mediation/conf/cm/huawei_mml_lte_parser.cfg',
        dag=dag)

    run_huawei_2g_xml_gexport_parser = BashOperator(
        task_id='run_huawei_4g_xml_gexport_parser',
        bash_command='java -jar /mediation/bin/boda-huaweicmobjectparser.jar '
                     '/mediation/data/cm/huawei/raw/gexport_lte '
                     '/mediation/data/cm/huawei/parsed/gexport_lte '
                     '/mediation/conf/cm/huawei_gexport_lte_parser.cfg',
        dag=dag)

    t_join = DummyOperator(
        task_id='join_huawei_2g_externals',
        dag=dag,
    )

    # The dependency names must reference task ids that exist in this DAG.
    dag.set_dependency('branch_huawei_2g_externals', 'run_huawei_4g_mml_parser')
    dag.set_dependency('branch_huawei_2g_externals', 'run_huawei_4g_xml_nbi_parser')
    dag.set_dependency('branch_huawei_2g_externals', 'run_huawei_4g_xml_gexport_parser')
    dag.set_dependency('run_huawei_4g_mml_parser', 'join_huawei_2g_externals')
    dag.set_dependency('run_huawei_4g_xml_nbi_parser', 'join_huawei_2g_externals')
    dag.set_dependency('run_huawei_4g_xml_gexport_parser', 'join_huawei_2g_externals')

    return dag
def book_hotel_do(**kwargs):
    # Get hotel connection string
    conn = os.environ["AIRFLOW_CONN_HOTEL_SERVICE"]
    # Send request to service
    r = requests.get(conn + "?tid=" + kwargs['ti'].xcom_pull(task_ids='init_transaction'))
    if r.status_code == 200:
        return 'Book_Flight'
    else:
        return 'hotel_booking_failed'


book_hotel = BranchPythonOperator(task_id='Book_Hotel',
                                  retries=0,
                                  python_callable=book_hotel_do,
                                  provide_context=True,
                                  dag=dag)

compensate_book_hotel = SimpleHttpOperator(
    task_id='compensate_book_hotel',
    method='GET',
    http_conn_id='HOTEL_SERVICE',
    endpoint='/compensate',
    trigger_rule='none_skipped',
    data={"tid": "{{ti.xcom_pull(task_ids='init_transaction')}}"},
    headers={},
    dag=dag,
)

compensate_book_flight_and_hotel1 = SimpleHttpOperator(
        }})

model_name = 'MnistModel'
serve = 'tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name={} \
--model_base_path=/root/airflow/runtime/models/{}'.format(model_name, model_name)


def model_exist():
    if os.path.isdir('/root/airflow/runtime/{}'.format(model_name)):
        return 'update_version_or_not_serve'
    else:
        return 'serve_model'


branch = BranchPythonOperator(task_id="serve_or_not",
                              python_callable=model_exist,
                              dag=dag)

t2 = BashOperator(
    task_id="serve_model",
    bash_command=serve,
    dag=dag,
    executor_config={"KubernetesExecutor": {
        "image": "tfserving:airflow"
    }})

t3 = DummyOperator(task_id="update_version_or_not_serve", dag=dag)

t1.set_downstream(branch)
escolhe_h_m = PythonOperator(
    task_id='escolhe-h-m',
    python_callable=sorteia_h_m,
    dag=dag
)


def MouF(**context):
    value = context['task_instance'].xcom_pull(task_ids='escolhe-h-m')
    if value == 'male':
        return 'branch_homem'
    else:
        return 'branch_mulher'


male_female = BranchPythonOperator(
    task_id='condicional',
    python_callable=MouF,
    provide_context=True,
    dag=dag
)


def mean_homem():
    df = pd.read_csv('~/train.csv')
    df = df.loc[df.Sex == 'male']
    print(f"Mean age of the men on the Titanic: {df.Age.mean()}")


branch_homem = PythonOperator(
    task_id='branch_homem',
    python_callable=mean_homem,
    dag=dag
)


def mean_mulher():
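# The callable `sorteia_h_m` referenced by the `escolhe-h-m` task is not shown in this
# snippet. A minimal sketch, assuming it simply draws 'male' or 'female' at random and
# returns the value so the downstream branch can pull it from XCom:
import random


def sorteia_h_m():
    # The return value is pushed to XCom automatically by PythonOperator.
    return random.choice(['male', 'female'])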
    description="This DAG shows branching: if there is no data for a particular partition it is skipped, otherwise a word count is run",
    start_date=datetime(2018, 2, 1),
    catchup=False,
    schedule_interval=timedelta(days=1))


def check_data_exists():
    import requests
    r = requests.get(Variable.get('data_base_url'))
    if r.status_code == 200:
        return 'process_data'
    else:
        return 'log_no_data'


does_data_exist = BranchPythonOperator(task_id='does_data_exist',
                                       python_callable=check_data_exists,
                                       dag=dag)

process_data = BashOperator(task_id='process_data',
                            dag=dag,
                            bash_command='echo Processing data')

log_no_data = BashOperator(task_id='log_no_data',
                           dag=dag,
                           bash_command='echo No data found!')

process_data.set_upstream(does_data_exist)
log_no_data.set_upstream(does_data_exist)
for (dirpath, dirnames, filenames) in os.walk(input_path):
    for ifile in filenames:
        target = "%s/%s" % (dirpath, ifile)
        output = "%s/%s" % (output_path, ifile)

        ### File SUBDAG
        subdag_name = "process_%s" % ifile
        subdag = DAG("%s.%s" % (DAG_NAME, subdag_name), default_args=args)
        subdagop = SubDagOperator(task_id=subdag_name, subdag=subdag, dag=dag)
        init >> subdagop >> end

        ## File SUBDAG TASKS
        init_process = DummyOperator(task_id='init_processing', dag=subdag)

        # Check file exists
        input_files_check = BranchPythonOperator(task_id='input_files_check',
                                                 python_callable=check_if_file_exists,
                                                 op_kwargs={'target': target},
                                                 dag=subdag)

        # File not found dummy task for branching
        file_not_found = DummyOperator(task_id='file_not_found', dag=subdag)

        # Method selection
        ingest_method_selector = BranchPythonOperator(task_id='ingest_method_selector',
                                                      python_callable=branch_ingest_method,
                                                      op_kwargs={'target': target},
                                                      dag=subdag)

        # Call admin to inform about an error
        call_admin = BashOperator(task_id='call_admin',
                                  bash_command="echo \"INGESTION CANNOT BE PERFORMED\"",
                                  dag=subdag)

        # Ending SUBDAG
        end_process = DummyOperator(task_id='end_processing', trigger_rule='one_success', dag=subdag)

        ### SUBSUBDAG CSV
        csv_subdag_name = "csv_processor"
        csv_subdag = DAG("%s.%s.%s" % (DAG_NAME, subdag_name, csv_subdag_name), default_args=args)
        return '{}'.format(cont_task)
    else:
        return '{}'.format(stop_task)


start_op = BashOperator(task_id='start_task',
                        bash_command="echo False",
                        xcom_push=True,
                        dag=dag)

start_py = PythonOperator(
    task_id='start_py',
    python_callable=compare,
    # op_kwargs={'connection_name': 'redshift', 'schema': 'aoi', 'table_name': 'order_details', 'column': 'order_date'},
    dag=dag,
)

branch_op = BranchPythonOperator(task_id='branch_task',
                                 provide_context=True,
                                 python_callable=branch_func,
                                 op_kwargs={
                                     'input_task': 'start_py',
                                     'cont_task': 'continue_task',
                                     'stop_task': 'stop_task'
                                 },
                                 dag=dag)

continue_op = DummyOperator(task_id='continue_task', dag=dag)
stop_op = DummyOperator(task_id='stop_task', dag=dag)

start_py >> branch_op >> [continue_op, stop_op]
                                        task_ids='check_comic_info')
    print("export the message to a file")


with DAG('comic_pusher', default_args=default_args) as dag:

    get_read_history = PythonOperator(task_id='get_read_history',
                                      python_callable=process_metadata,
                                      op_args=['read'])

    check_comic_info = PythonOperator(task_id='check_comic_info',
                                      python_callable=check_comic_info,
                                      provide_context=True)

    decide_what_to_do = BranchPythonOperator(task_id='new_comic_available',
                                             python_callable=decide_what_to_do,
                                             provide_context=True)

    update_read_history = PythonOperator(task_id='update_read_history',
                                         python_callable=process_metadata,
                                         op_args=['write'],
                                         provide_context=True)

    generate_notification = PythonOperator(task_id='yes_generate_notification',
                                           python_callable=generate_message,
                                           provide_context=True)

    send_notification = SlackAPIPostOperator(
        task_id='send_notification',
        token="YOUR_SLACK_TOKEN",
        channel='#comic-notification',
    dag=dag,
    retries=1)

check_s3_for_key = S3KeySensor(task_id='check_s3_for_key',
                               bucket_key=OUTPUT_FILE_KEY,
                               wildcard_match=True,
                               bucket_name=BUCKET_NAME,
                               s3_conn_id='aws_default',
                               timeout=20,
                               poke_interval=5,
                               dag=dag)

t_check_dataset_group = BranchPythonOperator(
    task_id='check_dataset_group',
    provide_context=True,
    python_callable=check_dataset_group,
    retries=1,
    dag=dag,
)

t_init_personalize = DummyOperator(
    task_id="init_personalize",
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag,
)

t_create_dataset_group = PythonOperator(
    task_id='create_dataset_group',
    provide_context=True,
    python_callable=create_dataset_group,
    retries=1,
finish = DummyOperator(task_id='finish', dag=dag)


def decide_which_path():
    now = datetime.now(timezone('Africa/Nairobi'))
    print('Current Hour in Africa/Nairobi')
    print(now.hour)
    if now.hour >= 5 and now.hour <= 21:
        return "rerun_trigger"
    else:
        return "sleep_trigger"


branch = BranchPythonOperator(task_id='branch',
                              python_callable=decide_which_path,
                              trigger_rule="all_done",
                              dag=dag)

rerun_trigger = TriggerDagRunOperator(task_id='rerun_trigger',
                                      trigger_dag_id=DAG_ID,
                                      dag=dag)

sleep_trigger = TriggerDagRunOperator(task_id='sleep_trigger',
                                      trigger_dag_id=SLEEP_DAG_ID,
                                      dag=dag)

pause_replication >> wait_for_replication_pause
wait_for_replication_pause >> update_flat_obs
wait_for_replication_pause >> update_flat_orders
wait_for_replication_pause >> update_flat_lab_obs
""" Example DAG demonstrating a workflow with nested branching. The join tasks are created with ``none_failed_or_skipped`` trigger rule such that they are skipped whenever their corresponding ``BranchPythonOperator`` are skipped. """ from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.python_operator import BranchPythonOperator from airflow.utils.dates import days_ago with DAG(dag_id="join_dag", start_date=days_ago(2), schedule_interval="@daily") as dag: branch_1 = BranchPythonOperator(task_id="branch_1", python_callable=lambda: "true_1") join_1 = DummyOperator(task_id="join_1", trigger_rule="one_success") true_1 = DummyOperator(task_id="true_1") false_1 = DummyOperator(task_id="false_1") branch_2 = BranchPythonOperator(task_id="branch_2", python_callable=lambda: "true_2") join_2 = DummyOperator(task_id="join_2", trigger_rule="one_success") true_2 = DummyOperator(task_id="true_2") false_2 = DummyOperator(task_id="false_2") false_3 = DummyOperator(task_id="false_3") branch_1 >> true_1 >> join_1 branch_1 >> false_1 >> branch_2 >> [true_2, false_2] >> join_2 >> false_3 >> join_1
    'owner': 'air',
    'provide_context': True,
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='copyBatteryLogs',
    default_args=args,
    schedule_interval='@daily',
    tags=['gemeni'],
    catchup=False
)

connect = BranchPythonOperator(task_id='connect',
                               python_callable=connect_to_server,
                               dag=dag)

mount = BashOperator(task_id='mount',
                     bash_command='./gemeni/mountDC.sh',
                     # xcom_push=True,
                     dag=dag)

list_source_folder = PythonOperator(task_id='read_source',
                                    python_callable=read_source_folder,
                                    provide_context=True,
                                    dag=dag)

list_dist_folder = PythonOperator(task_id='read_dist',
                                  python_callable=read_dist_folder,
                                  provide_context=True,
    cluster_name='spark-cluster-{{ ds_nodash }}',
    num_workers=2,
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    image_version='1.3.89-debian10',
    storage_bucket='fsp-logistics-spark-bucket',
    region='europe-central2'
)
create_cluster.doc_md = """## Create Dataproc cluster
This task creates a Dataproc cluster in your project.
"""

weekday_or_weekend = BranchPythonOperator(
    task_id='weekday_or_weekend',
    python_callable=assess_day,
    op_kwargs={'execution_date': '{{ ds }}'}
)

weekend_analytics = DataProcPySparkOperator(
    task_id='weekend_analytics',
    main='gs://fsp-logistics-spark-bucket/pyspark/weekend/gas_composition_count.py',
    cluster_name='spark-cluster-{{ ds_nodash }}',
    region='europe-central2',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
)

weekday_analytics = SubDagOperator(
    task_id='weekday_analytics',
    subdag=weekday_subdag(
        parent_dag='bigquery_data_analytics',
5: "Saturday​", 6: "Sunday​", } def get_weekday(execution_date, **kwargs): print("Today it is: {}".format(weekdays[execution_date.weekday()])) print_execution_time = PythonOperator(task_id="print_weekday", dag=dag, python_callable=get_weekday, provide_context=True) def get_on_call(execution_date, **kwargs): return weekday_person_to_email[execution_date.weekday()] branching = BranchPythonOperator(task_id="branching", dag=dag, python_callable=get_on_call, provide_context=True) print_execution_time >> branching final_task = DummyOperator(task_id="final_task", dag=dag) for person in set(weekday_person_to_email.values()): branching >> DummyOperator(task_id=person, dag=dag) >> final_task
model23 = PythonOperator(
    task_id='model23',
    provide_context=True,
    python_callable=model23.run_model23,
    params={'file_path': '/home/hasitha/airflow/dags/files/model1.txt'},
    dag=dag)

model3 = PythonOperator(
    task_id='model3',
    provide_context=True,
    python_callable=model_final.run_model,
    params={'file_path': '/home/hasitha/airflow/dags/files/output.txt'},
    dag=dag)

branch1 = BranchPythonOperator(task_id='branch1',
                               python_callable=branch1.check_branch1_condition,
                               dag=dag)

branch2 = BranchPythonOperator(task_id='branch2',
                               python_callable=branch2.check_branch2_condition,
                               dag=dag)

model3.set_upstream(model21)
model3.set_upstream(model22)
model3.set_upstream(model23)
model21.set_upstream(branch2)
model22.set_upstream(branch2)
model23.set_upstream(branch2)
branch2.set_upstream(model11)
branch2.set_upstream(model12)
model11.set_upstream(branch1)
email_subject = """ Email report for {{ params.department }} on {{ ds_nodash }} """ email_report_task = EmailOperator( task_id='email_report_task', to='*****@*****.**', subject=email_subject, html_content='', params={'department': 'Data subscription services'}, dag=dag) no_email_task = DummyOperator(task_id='no_email_task', dag=dag) def check_weekend(**kwargs): dt = datetime.strptime(kwargs['execution_date'], '%Y-%m-%d') #If dt.weekday() is 0-4, it's Mon-Fri. If 5-6, it's Sat/Sun if (dt.weekday() < 5): return 'email_report_task' else: return 'no_email_task' branch_task = BranchPythonOperator(task_id='check_if_weekend', python_callable=check_weekend, provide_context=True, dag=dag) sensor >> bash_task >> python_task python_task >> branch_task >> [email_report_task, no_email_task]
    notify=True,
    tags=['tag1', 'tag2'],
    # If the script at s3 location has any qubole specific macros to be replaced
    # macros='[{"date": "{{ ds }}"}, {"name" : "abc"}]',
    trigger_rule="all_done")

t3 = PythonOperator(task_id='compare_result',
                    provide_context=True,
                    python_callable=compare_result,
                    trigger_rule="all_done")

t3 << [t1, t2]

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options))

branching << t3

join = DummyOperator(task_id='join', trigger_rule='one_success')

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/'
                'jars/hadoop-0.20.1-dev-streaming.jar '
                '-mapper wc '
                '-numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/'
                'data/3.tsv -output '
                's3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='{{ params.cluster_label }}',
def source_to_use(**kwargs):
    ti = kwargs['ti']
    source = ti.xcom_pull(task_ids='hook_task')
    print("source fetched from XCOM: {}".format(source))
    return source


def check_for_activated_source(**kwargs):
    ti = kwargs['ti']
    return ti.xcom_pull(task_ids='xcom_task').lower()


with DAG('branch_dag', default_args=default_args, schedule_interval='@once') as dag:

    start_task = DummyOperator(task_id='start_task')

    hook_task = PythonOperator(task_id='hook_task',
                               python_callable=get_activated_sources)

    xcom_task = PythonOperator(task_id='xcom_task',
                               python_callable=source_to_use,
                               provide_context=True)

    branch_task = BranchPythonOperator(task_id='branch_task',
                                       python_callable=check_for_activated_source,
                                       provide_context=True)

    mysql_task = BashOperator(task_id='mysql', bash_command='echo "MYSQL is activated"')
    postgresql_task = BashOperator(task_id='postgresql', bash_command='echo "PostgreSQL is activated"')
    s3_task = BashOperator(task_id='s3', bash_command='echo "S3 is activated"')
    mongo_task = BashOperator(task_id='mongo', bash_command='echo "Mongo is activated"')

    start_task >> hook_task >> xcom_task >> branch_task
    branch_task >> mysql_task
    branch_task >> postgresql_task
    branch_task >> s3_task
    branch_task >> mongo_task
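# `get_activated_sources` is imported from elsewhere in this project. A minimal sketch of
# what it might do, assuming it looks the activated source up in an Airflow Variable and
# returns one of the downstream task ids ('mysql', 'postgresql', 's3' or 'mongo'); the
# Variable name is a hypothetical placeholder:
from airflow.models import Variable


def get_activated_sources():
    # The return value is pushed to XCom, then lowercased by check_for_activated_source.
    return Variable.get("activated_source", default_var="mysql")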
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args,
          tags=['example'])


def should_run(**kwargs):
    print('------------- exec dttm = {} and minute = {}'.format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "dummy_task_1"
    else:
        return "dummy_task_2"


cond = BranchPythonOperator(
    task_id='condition',
    provide_context=True,
    python_callable=should_run,
    dag=dag,
)

dummy_task_1 = DummyOperator(task_id='dummy_task_1', dag=dag)
dummy_task_2 = DummyOperator(task_id='dummy_task_2', dag=dag)
cond >> [dummy_task_1, dummy_task_2]
start = DummyOperator(task_id='start', dag=dag)


def print_all(*args, **kwargs):
    logger.info('print all')
    logger.info('args: %s', args)
    logger.info('kwargs: %s', kwargs)


def check_callable(*args, **kwargs):
    return 'foo'


check = BranchPythonOperator(task_id='check',
                             python_callable=check_callable,
                             dag=dag)

foo = PythonOperator(task_id='foo',
                     python_callable=print_all,
                     provide_context=True,
                     dag=dag)

bar = PythonOperator(task_id='bar',
                     python_callable=print_all,
                     provide_context=True,
                     dag=dag)

start >> check
check >> foo
check >> bar
def _pick_a_branch(execution_date, **context):
    weekday = execution_date.weekday()
    return job_map[weekday_person_to_email[weekday]]


def _print_exec_date(execution_date, **context):
    print(execution_date)


print_date = PythonOperator(
    task_id="print_branching_date",
    python_callable=_print_exec_date,
    provide_context=True,
    dag=dag)

branching = BranchPythonOperator(task_id='branching_operator',
                                 python_callable=_pick_a_branch,
                                 provide_context=True,
                                 dag=dag)

final_task = DummyOperator(task_id='final_task',
                           dag=dag,
                           trigger_rule=TriggerRule.ONE_SUCCESS)

for key, value in job_map.items():
    branching >> DummyOperator(task_id=value, dag=dag) >> final_task

print_date >> branching
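# Neither `weekday_person_to_email` nor `job_map` appears in this snippet. A minimal
# sketch of what they are assumed to look like: the first maps a weekday index to a
# person, the second maps that person to the task_id of their branch (all names below
# are hypothetical placeholders).
weekday_person_to_email = {
    0: "bob", 1: "bob", 2: "alice", 3: "alice", 4: "joe", 5: "joe", 6: "joe",
}
job_map = {
    "bob": "email_bob",
    "alice": "email_alice",
    "joe": "email_joe",
}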
endpoint_name = 'class_attendance'
get_enpdpoints_task_id = "get_{0}_dl_endpoint".format(endpoint_name)
branch_task_id = "branch_row_count_{0}_dl".format(endpoint_name)
file_to_gcs_task_id = "{0}_to_gcs".format(endpoint_name)
zero_branch_task_id = "{0}_zero_row".format(endpoint_name)

t2 = PythonOperator(task_id=get_enpdpoints_task_id,
                    python_callable=get_class_attendance,
                    op_args=[SAVE_PATH, BASE_URL, API_KEYS],
                    templates_dict=ep_template)

t_branch = BranchPythonOperator(task_id=branch_task_id,
                                python_callable=row_count_branch,
                                op_args=[
                                    get_enpdpoints_task_id,
                                    file_to_gcs_task_id,
                                    zero_branch_task_id
                                ],
                                trigger_rule="all_done")

t_gcs = FileToGoogleCloudStorageOperator(
    task_id=file_to_gcs_task_id,
    google_cloud_storage_conn_id='gcs_silo',
    bucket="deanslist",
    src="{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_path') }}",
    dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
    dag=dag)

t_zero_row = DummyOperator(task_id=zero_branch_task_id)
    Print the payload "message" passed to the DagRun conf attribute.

    :param context: The execution context
    :type context: dict
    """
    print("Remotely received value of {}".format(str(context["dag_run"].conf)))


########################################################################################################################
#                                                TASKS DEFINITIONS                                                    #
########################################################################################################################

network_init = BashOperator(task_id='network_init',
                            bash_command='ansible --version -vvv',
                            dag=dag)

teams_message = PythonOperator(task_id='send_teams_message',
                               python_callable=send_teams_message,
                               trigger_rule='one_success',
                               dag=dag)

deploy_playbook_sbc = BashOperator(
    task_id='deploy_sbc',
    bash_command='cd ' + PLAYBOOKS + ' && ansible-playbook -i ' + PLAYBOOKS + '/inventory ' + PLAYBOOKS + '/sbc_audiocodes_telnet.yaml',
    dag=dag)

deploy_validation = BranchPythonOperator(task_id='deploy_validation',
                                         python_callable=deploy_validation,
                                         dag=dag)

query_formio = PythonOperator(task_id='query_formio',
                              python_callable=query_formio,
                              dag=dag)

rollback = PythonOperator(task_id='rollback',
                          python_callable=rollback,
                          dag=dag)

write_influx_result = PythonOperator(task_id='write_influx_result',
                                     python_callable=write_influx_result,
                                     trigger_rule='one_success',
                                     dag=dag)

network_end = BashOperator(task_id='network_end',
                           bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
                           trigger_rule='one_success',
                           dag=dag)

backup_sbc = PythonOperator(task_id='backup_sbc',
                            python_callable=backup_sbc,
                            dag=dag)

context_data = PythonOperator(task_id='dag_context',
                              python_callable=context_data,
                              dag=dag)

init_teams_message = PythonOperator(task_id='init_teams_message',
                                    python_callable=send_teams_init,
                                    trigger_rule='one_success',
                                    dag=dag)


########################################################################################################################
#                                                 TASKS WORKFLOW                                                      #
########################################################################################################################

network_init >> query_formio
    'email': [os.environ['email']],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 4,
    'retry_delay': timedelta(minutes=15),
}

with DAG('update_data_if_not_already_up_to_date',
         catchup=False,
         default_args=default_args,
         schedule_interval="@daily",
         ) as dag:

    cond = BranchPythonOperator(
        task_id='check_if_data_is_up_to_date',
        python_callable=should_run,
        dag=dag,
    )

    data_is_already_up_to_date = DummyOperator(task_id='data_is_already_up_to_date')

    start_spark_cluster = BashOperator(
        task_id='start_spark_cluster',
        bash_command='/usr/local/spark/sbin/start-all.sh ')

    process_reddit_data = BashOperator(
        task_id='process_data',
        bash_command='/usr/local/spark/sbin/spark-submit --master spark://10.0.0.16:7077 --packages org.apache.hadoop:hadoop-aws:2.7.3 --packages org.postgresql:postgresql:42.2.5 /identifying_trending_topics_on_social_media/process_data/process_reddit_comments.py ')

    stop_spark_cluster = BashOperator(
        task_id='stop_spark_cluster',
        bash_command='/usr/local/spark/sbin/stop-all.sh ')

    cond >> start_spark_cluster >> process_reddit_data >> stop_spark_cluster
from airflow.models import Variable


def demo_check():
    demo = Variable.get("Demo_Flag")
    if demo and demo == '1':
        return "send_sns"
    return "end_task"


skip_operator = DummyOperator(task_id='skip_message',
                              trigger_rule='one_success',
                              dag=dag)

end_operator = DummyOperator(task_id='end_task',
                             trigger_rule='one_success',
                             dag=dag)

demo_check_operator = BranchPythonOperator(
    task_id='demo_check',
    python_callable=demo_check,
    trigger_rule='one_success',
    dag=dag
)

sns_operator = SnsPublishOperator(
    task_id='send_sns',
    target_arn='arn:aws:sns:us-west-2:356032320829:airflow',
    message='Airflow fingerprinting done',
    aws_conn_id='S3_conn',
    trigger_rule='one_success',
    dag=dag
)
task_id=f"download_file_if_changed_{identifier}", python_callable=download_file_if_changed, op_kwargs={ "url": original_url_exp, "target": zip_path_exp }) calc_hash = CalculateHash(task_id=f'calc_hash_{identifier}', path=zip_path_exp) check_if_is_already_up = BranchPythonOperator( task_id=f"branching_{identifier}", provide_context=True, python_callable=check_if_is_already_processed, op_kwargs={ 'pull_hash_from': f'calc_hash_{identifier}', 'data_set': '{{ params.data_set }}', 'proceed_path': f"proceed_to_insert_{identifier}", 'db_name': 'db', 'already_processed_path': "success", }) upload_to_ftp_step = PythonOperator( task_id=f"upload_to_ftp_{identifier}", python_callable=upload_to_ftp, op_kwargs={ 'con_id': 'ftp_data.controlciudadano.org.py', 'remote_path': ftp_target_path, 'local_path': zip_path_exp })
)

t_pipeline_exec_cwl2 = BashOperator(
    task_id='pipeline_exec_cwl2',
    bash_command=""" \
    tmp_dir={{tmp_dir_path(run_id)}} ; \
    cd ${tmp_dir}/cwl_out ; \
    {{ti.xcom_pull(task_ids='build_cmd2')}} >> $tmp_dir/session.log 2>&1 ; \
    echo $?
    """)

# next_op if true, bail_op if false. test_op returns value for testing.
t_maybe_keep_cwl2 = BranchPythonOperator(
    task_id='maybe_keep_cwl2',
    python_callable=utils.pythonop_maybe_keep,
    provide_context=True,
    op_kwargs={
        'next_op': 'move_data',
        'bail_op': 'set_dataset_error',
        'test_op': 'pipeline_exec_cwl2'
    })

# Others
t_send_create_dataset = PythonOperator(
    task_id='send_create_dataset',
    python_callable=utils.pythonop_send_create_dataset,
    provide_context=True,
    op_kwargs={
        'parent_dataset_uuid_callable': get_parent_dataset_uuid,
        'http_conn_id': 'ingest_api_connection',
        'endpoint': '/datasets/derived',
        'dataset_name_callable': build_dataset_name,
else: return "email_joe" args = { "owner": "airflow", "start_date": datetime(2019, 12, 1), } with DAG(dag_id='sixth_dag', default_args=args, schedule_interval='@daily') as dag: print_execution_date = PythonOperator(task_id='print_execution_date', python_callable=_get_execution_date, provide_context=True) branching = BranchPythonOperator(task_id='branching', python_callable=_get_branch, provide_context=True) email_bob = DummyOperator(task_id='email_bob') email_alice = DummyOperator(task_id='email_alice') email_joe = DummyOperator(task_id='email_joe') final_task = BashOperator(task_id='final_task', bash_command="echo final task", trigger_rule="none_failed") print_execution_date >> branching >> [email_bob, email_alice, email_joe ] >> final_task