seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='example_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
task.set_downstream(run_this_last)
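# The explicit set_downstream/set_upstream calls above can equivalently be
# written with Airflow's bitshift dependency syntax. The two lines below are
# only an illustration of that equivalence; they restate dependencies already
# declared and are not part of the original example.
run_this >> run_this_last   # same as run_this.set_downstream(run_this_last)
task >> run_this_last       # same as task.set_downstream(run_this_last)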
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('dag', default_args=default_args)

# requirements, data, clean and lint are examples of tasks created by
# instantiating operators
requirements = BashOperator(
    task_id='requirements',
    bash_command='pip install -r requirements.txt',
    dag=dag)

data = BashOperator(
    task_id='data',
    bash_command='python src/make_dataset.py',
    dag=dag)

clean = BashOperator(
    task_id='clean',
    bash_command=r'find . -name "*.pyc" -exec rm {} \;',
    dag=dag)

lint = BashOperator(
    task_id='flake8',
    bash_command='flake8 .',
    dag=dag)

data.set_upstream(requirements)
    default_args={'owner': 'airflow', 'provide_context': True})

valid_chars = '-_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'


def sanitize(text):
    return ''.join(c for c in text if c in valid_chars)


# this is where pipeline-generated bash commands come in...
bash_commands = ('echo "hi russ"', 'echo "hello again"')
conclusion_command = 'echo "all done"'

conclusion = BashOperator(task_id='conclude',
                          bash_command=conclusion_command,
                          dag=dag)

for cmd in bash_commands:
    cmd = cmd.rstrip()  # strings are immutable, so keep the stripped result
    run_this = BashOperator(
        task_id=sanitize(cmd),
        bash_command=cmd,
        dag=dag)
    run_this.set_downstream(conclusion)

# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
#     # pushes an XCom without a specific target, just by returning it
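# The commented-out helpers above refer to Airflow's XCom mechanism. Below is a
# minimal, hypothetical sketch of a matching push/pull pair using the classic
# PythonOperator API; the pushed value and the task ids are placeholders, not
# part of the original pipeline.
from airflow.operators.python_operator import PythonOperator


def push(**kwargs):
    # push an XCom under an explicit key
    kwargs['ti'].xcom_push(key='value from pusher 1', value=42)


def pull(**kwargs):
    # pull the XCom pushed by the 'push_task' task
    pulled = kwargs['ti'].xcom_pull(key='value from pusher 1',
                                    task_ids='push_task')
    print(pulled)


push_task = PythonOperator(task_id='push_task', python_callable=push,
                           provide_context=True, dag=dag)
pull_task = PythonOperator(task_id='pull_task', python_callable=pull,
                           provide_context=True, dag=dag)
push_task.set_downstream(pull_task)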
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'docker_sample',
    default_args=default_args,
    schedule_interval=timedelta(minutes=10))

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

t3 = DockerOperator(
    api_version='1.19',
    docker_url='tcp://localhost:2375',  # set your Docker daemon URL
    command='/bin/sleep 30',
    image='centos:latest',
    network_mode='bridge',
    task_id='docker_op_tester',
    dag=dag)
)

make_klusta_dir_task = BashOperator(
    task_id='make_klusta_dir',
    bash_command=make_klustadir_cmd,
    params={'klustadir': KLUSTA_DIR},
    on_success_callback=lambda c: set_perms(c['params']['klustadir'],
                                            default_args['owner']),
    dag=dag)

make_kwd_task = BashOperator(
    task_id='make_kwd',
    # pool='RAM',
    bash_command=make_kwd_command,
    env={'PATH': ANACONDA_PATH},
    params={'klustadir': KLUSTA_DIR,
            'matfiledir': MATFILE_DIR,
            'probe': PROBE,
            'rig': RIG,
            'omit': OMIT},
    on_failure_callback=lambda c: clean_dir(c['params']['klustadir']),
    on_success_callback=lambda c: set_perms(c['params']['klustadir'],
                                            default_args['owner']),
    dag=dag)

phy_task = BashOperator(
    task_id='phy_spikesort',
    # pool='CPU',
    env={'PATH': PHY_PATH},
    bash_command=sort_spikes_command,
    params={'klustadir': KLUSTA_DIR, 'matfiledir': MATFILE_DIR},
    on_failure_callback=lambda c: [clean_dir(c['params']['klustadir'], filt)
                                   for filt in ('*.kwik', '*.kwx')],
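# set_perms and clean_dir are helpers defined elsewhere in this project; the
# excerpt never shows them. A rough sketch of what such callback helpers could
# look like follows; the bodies are assumptions for illustration only.
import glob
import os
import pwd


def set_perms(path, owner):
    # hand ownership of the output directory back to the DAG owner
    uid = pwd.getpwnam(owner).pw_uid
    for root, dirs, files in os.walk(path):
        for name in dirs + files:
            os.chown(os.path.join(root, name), uid, -1)


def clean_dir(path, pattern='*'):
    # remove (possibly partial) outputs matching pattern after a failed task
    for f in glob.glob(os.path.join(path, pattern)):
        os.remove(f)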
default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is your cron expression; you can write any cron
# expression, just like in unix cron.
dag = DAG('airflow_task_with_hdfs_sensor',
          default_args=default_args,
          schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_hdfs_sensor',
    bash_command='echo "HDFS sensor would only be enabled after I am done!"',
    dag=dag)

# The sensor operator takes "filepath" to check whether this file is present
# in HDFS or not. "hdfs_conn_id" is configured in the UI under
# Admin --> Connections.
hdfs_sensor_task = HdfsSensor(
    task_id='hdfs_sensor_task',
    filepath='/user/samarthg/input2',
    hdfs_conn_id='webhdfs_default',
    dag=dag)

post_hdfs_sensor_task = BashOperator(
    task_id='post_hdfs_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for the defined tasks.
hdfs_sensor_task.set_upstream(bash_task)
post_hdfs_sensor_task.set_upstream(hdfs_sensor_task)
default_args = {
    'owner': 'anjana',
    'depends_on_past': False,
    'start_date': datetime(2020, 6, 21),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('Helloworld', default_args=default_args) as dag:
    t1 = BashOperator(
        task_id='task_1',
        bash_command='echo "Hello World from Task 1"',
        dag=dag)

    t2 = BashOperator(
        task_id='task_2',
        bash_command='echo "Hello World from Task 2"',
        dag=dag)

    t3 = BashOperator(
        task_id='task_3',
        bash_command='echo "Hello World from Task 3"',
        dag=dag)

    t4 = BashOperator(
        task_id='task_4',
        bash_command='echo "Hello World from Task 4"',
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello world" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-spite',
    s3_conn_id='aws_default',
    timeout=18 * 60 * 60,
    poke_interval=120,
    dag=dag)

t1.set_upstream(sensor)
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with DAG(dag_id='monitor_dag',
         schedule_interval=None,
         default_args=default_dag_args) as dag:

    bash_prerequisites_install_cmd = """sudo apt install -y python-pip"""
    bash_prerequisites_install = BashOperator(
        task_id='bash_prerequisites_install',
        bash_command=bash_prerequisites_install_cmd)

    bash_pip_install_cmd = """sudo pip install pandas google-colab google-cloud-bigquery google-cloud-bigquery-storage pyarrow pyTelegramBotAPI"""
    bash_pip_install = BashOperator(
        task_id='bash_pip_install',
        bash_command=bash_pip_install_cmd)

    bash_colab_export_script_cmd = """python /home/omid/gs_dags/script.py"""
    bash_colab_export_scriptTask = BashOperator(
        task_id='bash_colab_export_script',
        bash_command=bash_colab_export_script_cmd)

    bash_prerequisites_install >> bash_pip_install >> bash_colab_export_scriptTask
# @author: uday sharma
from airflow import DAG
from airflow.operators import BashOperator, HiveOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
exec ./scripts/sqoop_incremental.sh
"""

# Import the data from the MySQL table into HDFS
task1 = BashOperator(
    task_id='sqoop_import',
    bash_command=sqoop_job,
    dag=dag)

# Insert the data from the Hive external table into the target table
task2 = HiveOperator(
    task_id='hive_insert',
    hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name, last_name, item_code, order_date FROM orders_stg;',
    depends_on_past=True,
    dag=dag)

# Define the job dependency
task2.set_upstream(task1)
pipeline_args = {
    'owner': 'pipeliner',
    'depends_on_past': False,
    'start_date': datetime(2016, 10, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag_id = 'pipeline_sample_dag'
dag = DAG(dag_id, default_args=pipeline_args, schedule_interval='0 0 * * 6')
globals()[dag_id] = dag

# Create sample task operators
task_1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
task_2 = BashOperator(task_id='sleep', bash_command='sleep 60', retries=3, dag=dag)
task_3 = BashOperator(task_id='print_date_again', bash_command='date', dag=dag)

task_2.set_upstream(task_1)
task_3.set_upstream(task_2)
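# The globals()[dag_id] = dag line registers the DAG under a module-level name
# so the scheduler can discover it. That pattern mostly pays off when several
# DAGs are generated programmatically. A rough sketch of that idea; the dag ids
# below are hypothetical and not part of the original pipeline.
for generated_dag_id in ('pipeline_sample_dag_eu', 'pipeline_sample_dag_us'):
    generated = DAG(generated_dag_id,
                    default_args=pipeline_args,
                    schedule_interval='0 0 * * 6')
    BashOperator(task_id='print_date', bash_command='date', dag=generated)
    globals()[generated_dag_id] = generated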
from __future__ import absolute_import, unicode_literals

import os

from airflow.operators import BashOperator
from airflow.models import DAG
from datetime import datetime, timedelta

args = {
    'owner': 'recursive_schedule_interval',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
}

dag = DAG(
    dag_id='recursive_schedule_interval',
    default_args=args,
    schedule_interval="*/5 * * * 1,2,3,4,5",
    catchup=False
)

# bash command to run
CMD = 'echo Job executado em: $(date +"%d/%m/%Y %k:%M:%S")'

run_this = BashOperator(
    task_id='bash_operator',
    bash_command=CMD,
    dag=dag
)
Documentation that goes along with the Airflow tutorial located
[here](http://pythonhosted.org/airflow/tutorial.html)
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta

from settings import default_args

dag = DAG('reddit_comments', default_args=default_args,
          schedule_interval=timedelta(2))

pre = BashOperator(
    task_id='setup',
    bash_command='tasks/setup_dirs.sh',
    depends_on_past=False,
    dag=dag)

t1 = BashOperator(
    task_id='ngrams_batch',
    bash_command='tasks/run_ngrams_batch.sh',
    depends_on_past=False,
    dag=dag)

t2 = BashOperator(
    task_id='ngrams_optimize',
    depends_on_past=True,
    bash_command='tasks/optimize_ngrams.sh',
    dag=dag)

t1.set_upstream(pre)
t2.set_upstream(t1)
    'retries': 0,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('bharat_sheep_download', default_args=default_args,
          schedule_interval="@once")

# The tasks below are examples of tasks created using operators
dl_cmd = """hive.c4000 --hiveconf start_date=\"\'{}\'\" --hiveconf end_date=\"\'{}\'\" -f /home/prabhakarbha01/sheep/src/data/1_insert_into_download.sql"""
dl_reset_cmd = """hive.c4000 -f /home/prabhakarbha01/sheep/src/data/1_drop_create_download.sql"""

dl_reset = BashOperator(task_id='reset_agg_table',
                        bash_command=dl_reset_cmd,
                        dag=dag)

dl_10_01 = BashOperator(task_id='dl_10_01',
                        bash_command=dl_cmd.format("2018-10-01", "2018-10-08"),
                        dag=dag)

dl_10_02 = BashOperator(task_id='dl_10_02',
                        bash_command=dl_cmd.format("2018-10-08", "2018-10-16"),
                        dag=dag)

dl_10_03 = BashOperator(task_id='dl_10_03',
                        bash_command=dl_cmd.format("2018-10-16", "2018-10-24"),
                        dag=dag)

dl_10_04 = BashOperator(task_id='dl_10_04',
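# The dl_10_0N tasks above differ only in their date window, so they could also
# be generated in a loop. A sketch using just the windows visible in this
# excerpt (the original file continues with more); the ids intentionally mirror
# the hand-written ones above for illustration, so in a real DAG you would use
# one style or the other, not both.
windows = [("2018-10-01", "2018-10-08"),
           ("2018-10-08", "2018-10-16"),
           ("2018-10-16", "2018-10-24")]
dl_tasks = []
for n, (start, end) in enumerate(windows, start=1):
    t = BashOperator(task_id='dl_10_{:02d}'.format(n),
                     bash_command=dl_cmd.format(start, end),
                     dag=dag)
    dl_tasks.append(t)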
        n = sanitize(c)
        while n in dic.keys():
            n = n + '_'
        dic[n] = c
    return dic


command_dict = dict_from_cmd_list(bash_commands)

tasks = []
for n, c in command_dict.items():  # iteritems() is Python 2 only
    task = BashOperator(task_id=n,
                        bash_command=c,
                        dag=dag,
                        pool='default')
    if len(tasks) > 0:
        task.set_upstream(tasks[-1])
    tasks.append(task)

job = DagExecutionJob(dag)

# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)
Code that goes along with the Airflow tutorial located at:
http://airflow.readthedocs.org/en/latest/tutorial.html
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 22),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 4, 24),
}

dag = DAG('spark_pi', default_args=default_args)

# t1 is an example of a task created by instantiating an operator
t1 = BashOperator(
    task_id='spark_pi',
    bash_command='spark-submit --class org.apache.spark.examples.SparkPi --master spark://127.0.0.1:7077 $SPARK_EXAMPLES_JAR 10',
    dag=dag)
from_channels = ['fromTwitter_A', 'fromTwitter_B', 'fromTwitter_C', 'fromTwitter_D']
to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D']

yesterday = date.today() - timedelta(days=1)
dt = yesterday.strftime("%Y-%m-%d")

# define where you want to store the tweets csv file in your local directory
local_dir = "/tmp/"
# define the location where you want to store it in HDFS
hdfs_dir = " /tmp/"

for channel in to_channels:
    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)
    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)
from airflow import DAG
from airflow.operators import BashOperator, TimeSensor
from datetime import datetime, timedelta, time

default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is your cron expression; you can write any cron
# expression, just like in unix cron.
dag = DAG('airflow_task_with_time_sensor',
          default_args=default_args,
          schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_sensor',
    bash_command='echo "Sensor would only be enabled after I am done!"',
    dag=dag)

# The sensor operator takes "target_time", a specific time of day irrespective
# of the date. The sensor succeeds once the target time has passed; in this
# case, after 10:55 in the morning.
sensor_task = TimeSensor(task_id='sensor_task',
                         target_time=time(10, 55, 1, 1),
                         dag=dag)

post_sensor_task = BashOperator(
    task_id='post_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for the defined tasks.
sensor_task.set_upstream(bash_task)
def my_py_command(ds, **kwargs):
    # Print out the "foo" param passed in via
    # `airflow test example_passing_params_via_test_command run_this <date>
    #  -tp '{"foo":"bar"}'`
    if kwargs["test_mode"]:
        print(" 'foo' was passed in via test={} command : "
              "kwargs[params][foo] = {}".format(kwargs["test_mode"],
                                                kwargs["params"]["foo"]))
    # Print out the value of "miff", passed in below via the PythonOperator
    print(" 'miff' was passed in via task params = {}".format(
        kwargs["params"]["miff"]))
    return 1


my_templated_command = """
echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    provide_context=True,
    python_callable=my_py_command,
    params={"miff": "agg"},
    dag=dag)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff": "agg"},
    dag=dag)
also_run_this.set_upstream(run_this)
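# Since my_py_command returns a value, Airflow stores it as an XCom under the
# 'return_value' key. A hypothetical downstream task (not in the original DAG)
# could pull it straight from a Jinja template, for example:
show_return = BashOperator(
    task_id='show_return_value',
    bash_command='echo "run_this returned '
                 '{{ ti.xcom_pull(task_ids=\'run_this\') }}"',
    dag=dag)
show_return.set_upstream(run_this)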
default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(minutes=1),
    'email': [],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('airflow_task_script_1',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

generating_the_MySql_data = BashOperator(
    task_id='generating_the_MySql_data',
    bash_command="cd ~/Documents/data/ ; python practical_exercise_data_generator.py --load_data;",
    dag=dag)

generating_the_CSV_data = BashOperator(
    task_id='generating_the_CSV_data',
    bash_command="cd ~/Documents/data/ ; python practical_exercise_data_generator.py --create_csv",
    dag=dag)

Sqoop_import_user = BashOperator(
    task_id='Sqoop_import_user',
    bash_command="""
sqoop import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/root_pwd.txt --table user -m 4 --hive-import --hive-overwrite --hive-database practical_exercise_1 --hive-table user;
if [ $? -ne 0 ]; then
    echo Failed at importing user table
    exit 1
fi
""" # Importing the incremental data from Mysql table to HDFS task1 = BashOperator( task_id= 'sqoop_incremental_import', #bash_command=sqoop_incremental_job, bash_command='./sqoop_incremental.sh', dag=dag ) # merge the data from Mysql table to HDFS task2 = BashOperator( task_id= 'sqoop_merge_import', bash_command='./sqoop_merge.sh', dag=dag ) # Inserting the data from Hive external table to the target table task3 = HiveOperator( task_id= 'hive_insert', hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;', depends_on_past=True, dag=dag ) # Inserting the data from Hive table with masked ssn external table to the target table task4 = HiveOperator( task_id= 'hive_insert_masked', hql='add jar /home/cloudera/Masking.jar;create TEMPORARY function masking as \'Masking\';INSERT OVERWRITE table employee_profile_masked SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('thirdpay', default_args=default_args, schedule_interval='0 12 * * *')

EXECUTION_DATE = "{{ ds }}"

start = BashOperator(task_id='start', bash_command='echo start ', dag=dag)

# Generate the ODS tables
ods_fin_third_pay_wxapp = BashOperator(
    task_id='ods_fin_third_pay_wxapp',
    bash_command='CheckTag -d {{ ds }} -l day -b ods.ods_fin_third_pay_wxapp',
    dag=dag)
ods_fin_third_pay_wxapp.set_upstream(start)

ods_fin_third_pay_wxgzh = BashOperator(
    task_id='ods_fin_third_pay_wxgzh',
    bash_command='CheckTag -d {{ ds }} -l day -b ods.ods_fin_third_pay_wxgzh',
    dag=dag)
ods_fin_third_pay_wxgzh.set_upstream(start)
    'email_on_retry': False
}

# Set concurrency and max_active_runs to 1, preventing more than one DAG
# instance from being created.
dag = DAG(dag_name,
          default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)

get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)

set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
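# ConfigGetter and BootStrapper are defined elsewhere in this project; passing
# instances as python_callable works because PythonOperator only needs an
# object it can call. A hypothetical sketch of such a callable class follows;
# the class name, bucket and key are illustrative, not the project's values.
class ExampleConfigGetter(object):
    def __init__(self, bucket='my-config-bucket', key='config.json'):
        self.bucket = bucket
        self.key = key

    def __call__(self):
        # fetch the config object from S3 and stage it locally for later tasks
        import boto3
        s3 = boto3.client('s3')
        s3.download_file(self.bucket, self.key, '/tmp/config.json')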
""" Code that goes along with the Airflow located at: http://airflow.readthedocs.org/en/latest/tutorial.html """ from airflow import DAG from airflow.operators import BashOperator from datetime import datetime, timedelta default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime.now(), 'email': ['*****@*****.**'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 4, 24), } dag = DAG('pyspark_submit_pi', default_args=default_args) t1 = BashOperator( task_id='spark_submit', bash_command= 'spark-submit --master spark://spark-master-2-0-1:7077 /root/volumes/source.ml/jupyterhub.ml/spark/pyspark_pi.py', dag=dag)
PROBE = "A1x16-5mm-50"
RIG = "burung16"

dag_id = USER + BLOCK
dag = DAG(dag_id,
          default_args=default_args,
          schedule_interval='@once',
          )

phy_task = BashOperator(
    task_id='phy_spikesort',
    pool='phy',
    env={'PATH': PHY_PATH},
    bash_command=sort_spikes_command,
    params={'klustadir': KLUSTA_DIR, 'matfiledir': MATFILE_DIR},
    on_failure_callback=lambda c: [clean_dir(c['params']['klustadir'], filt)
                                   for filt in ('*.kwik', '*.kwx')],
    on_success_callback=lambda c: set_perms(c['params']['klustadir'],
                                            default_args['owner']),
    dag=dag)

clear_phy_task = BashOperator(
    task_id='clear_phy',
    bash_command=clear_phy_cmd,
    params={'klustadir': KLUSTA_DIR, 'matfiledir': MATFILE_DIR},
    dag=dag)

make_kwik_bak_dir_task = BashOperator(
    task_id='make_kwik_bak_dir',
    bash_command=make_kwik_bak_dir_cmd,
from airflow.operators import BashOperator, MySqlOperator
from airflow.models import DAG
from datetime import datetime

default_args = {
    'owner': 'max',
    'start_date': datetime(2014, 9, 1),
    'mysql_dbid': 'local_mysql',
}

dag = DAG(dag_id='example_3')

run_this = BashOperator(
    task_id='also_run_this',
    bash_command='ls -l',
    **default_args)
dag.add_task(run_this)

for i in range(5):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='sleep {{ 10 + macros.random() * 10 }}',
        **default_args)
    task.set_upstream(run_this)
    dag.add_task(task)
aml_utils = load_source(
    'aml_utils',
    "{pf}/asiamiles_airflow_extensions/utils.py".format(
        pf=configuration.get('core', 'plugins_folder')))

mod_config = aml_utils.load_config(
    "{dag_folder}/pros_etl.cfg".format(
        dag_folder=dirname(realpath(__file__))))

hdfs_home = mod_config['hadoop']['hdfs_home']

copy_rsynced_files_to_hadoop = BashOperator(
    task_id="copy_rsynced_files_to_hadoop",
    bash_command="hadoop fs -put -f /data1/staging/pros/* pros",
    dag=dag)

# spark-shell --master yarn-client
update_seat_idx = BashOperator(
    task_id="update_seat_idx",
    bash_command="cat /data1/airflow/dags/pros-etl/pros_seat_index_hist_load.scala | spark-shell --master yarn-client",
    dag=dag)

update_curve = BashOperator(
    task_id="update_curve",
    bash_command="cat /data1/airflow/dags/pros-etl/pros_bid_price_hist_load.scala | spark-shell --master yarn-client",
    dag=dag)

update_seat_idx.set_upstream(copy_rsynced_files_to_hadoop)
update_curve.set_upstream(copy_rsynced_files_to_hadoop)
""" Executing tasks at a particular time of the day using sensor operator. """ from airflow import DAG from airflow.operators import BashOperator, TimeSensor from datetime import datetime, timedelta, time default_args = {"owner": "Samarth", "start_date": datetime(2016, 03, 15, 12)} # "schedule_interval" is your cron expression you can write any cron expression like unix cron. dag = DAG("airflow_task_with_time_sensor", default_args=default_args, schedule_interval="1 * * * *") bash_task = BashOperator( task_id="dependency_for_sensor", bash_command='echo "Sensor would only be enabled after I am done!"', dag=dag ) # Sensor operator takes "target_time" which is a specific time in a day irrespective of date/day. # Sensor is executed once the target time has passed. In this case after 10:55 at morning. sensor_task = TimeSensor(task_id="sensor_task", target_time=time(10, 55, 1, 1), dag=dag) post_sensor_task = BashOperator( task_id="post_sensor_task", bash_command='echo "I am done, it means sensor has done its job."', dag=dag ) # Setting up the correct dependencies for defined tasks. sensor_task.set_upstream(bash_task) post_sensor_task.set_upstream(sensor_task)
# Set concurrency and max_active_runs to 1, preventing more than one DAG
# instance from being created.
dag = DAG(dag_name,
          default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)

get_file = PythonOperator(
    task_id='get-file-from-s3',
    python_callable=FileGetter(),
    dag=dag)

hello_world_docker_write_logs = BashOperator(
    task_id='hello-world',
    bash_command=start_hello_world,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)

check_read_logs = PythonOperator(
    task_id='check_read_logs',
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
    python_callable=DataPutter(),
    dag=dag)

delete_object = PythonOperator(
    task_id='delete-object-from-s3',
}

dag = DAG(
    'financial_news',
    default_args=default_args,
    schedule_interval=timedelta(2))

# Run Camus to pull messages from Kafka into HDFS
camus_a = BashOperator(
    task_id='camus_a',
    bash_command='tasks/run_camus.sh',
    depends_on_past=True,
    dag=dag)

# Run Spark to sum all historical trades and write to Cassandra
trades_batch_a = BashOperator(
    task_id='trades_batch_a',
    bash_command='tasks/run_trades_batch.sh',
    depends_on_past=True,
    dag=dag)
# set the trades batch after the news batch to give it more memory
trades_batch_a.set_upstream(camus_a)

# Update Cassandra's stream 2 table to include counts from the batch run, with
# all the trades summed from stock_count_rts1 (the trades that came in since
# task1_camus started running)
sum_batch_a_rts2 = BashOperator(
    task_id='sum_batch_a_rts2',
    bash_command='tasks/sum_batch_rts2.sh',
    depends_on_past=True,
    dag=dag)
sum_batch_a_rts2.set_upstream(trades_batch_a)
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'schedule_interval': timedelta(1),
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('tutorial', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
rendered in the UI's Task Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
dag = DAG(dag_id,
          default_args=default_args,
          schedule_interval='@once',
          )

# ########## Post-phy cleanup and merging

make_postphy_dir_task = BashOperator(
    task_id='make_postphy_dir',
    bash_command=as_user(make_postphy_dir_cmd, USER),
    params={'postphydir': POSTPHY_DIR},
    on_success_callback=lambda c: set_perms(c['params']['postphydir'],
                                            default_args['owner']),
    dag=dag)

rsync_task = BashOperator(
    task_id='rsync',
    bash_command=as_user(rsync_command, USER),
    params={'postphydir': POSTPHY_DIR,
            'mansortdir': MANSORT_DIR,
            'mansorthost': MANSORT_HOST},
    dag=dag)

merge_events_task = BashOperator(
    task_id='merge_events',
    bash_command=merge_events_cmd,
    env={'PATH': ANACONDA_PATH},
    params={'matfiledir': MATFILE_DIR, 'postphydir': POSTPHY_DIR},
    dag=dag)

kwik2pandas_task = BashOperator(
    task_id='kwik2pandas',
    bash_command=kwik2pandas_cmd,
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'tutorial_mod',
    default_args=default_args,
    schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7) }}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
    create_disposition='CREATE_IF_NEEDED',
    skip_leading_rows=0,
    write_disposition='WRITE_TRUNCATE',  # If the table exists, overwrite it.
    max_bad_records=0)

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    # Obviously needs to match the name of the cluster created in the prior two operators.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # This will tear down the cluster even if there are failures in upstream tasks.
    trigger_rule=TriggerRule.ALL_DONE)

# Delete the GCS files in the timestamped transformed folder.
delete_transformed_files = BashOperator(
    task_id='delete_transformed_files',
    bash_command="gsutil -m rm -r gs://" + BUCKET +
                 "/{{ dag_run.conf['transformed_path'] }}/")

# If the Spark job or the BQ load fails, we rename the timestamped raw path to
# a timestamped failed path.
move_failed_files = BashOperator(
    task_id='move_failed_files',
    bash_command="gsutil mv gs://" + BUCKET + "/{{ dag_run.conf['raw_path'] }}/ " +
                 "gs://" + BUCKET + "/{{ dag_run.conf['failed_path'] }}/",
    trigger_rule=TriggerRule.ONE_FAILED)

# Set the dag property on the first operator; it will be inherited by downstream operators.
create_cluster.dag = dag
create_cluster.set_downstream(submit_pyspark)