from airflow import DAG
# BashOperator
from airflow.operators.bash_operator import BashOperator
# days ago function
from airflow.utils.dates import days_ago
from datetime import datetime as dt
from datetime import timedelta
import os

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt(2020, 12, 5),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(
    'Start_hadoop',
    description='Check if Hadoop running',
    default_args=default_args,
    schedule_interval=timedelta(days=1)
)

t1 = BashOperator(
    task_id='Start_hadoop',
    bash_command='hadoop_start',
    dag=dag)
print('POSTAMBLE ------------------------------------------------------------------------')
create_podevent('Finishing ##PHASE## workflow for POD ##UUID##, Failed')
create_podevent('State changed to: FAILED', level='STATUS')

t1 = PythonOperator(task_id='preamble',
                    provide_context=True,
                    python_callable=preamble,
                    dag=dag)

# Note the space at the end of the bash_command value is REQUIRED
t2 = BashOperator(
    task_id='maintask',
    bash_command=
    'chmod +x /workflow/##PHASE##-##WFINDEX##-##UUID##/##WFNAME##; /bin/bash /workflow/##PHASE##-##WFINDEX##-##UUID##/##WFNAME## ',
    dag=dag)

t3 = PythonOperator(task_id='postamble',
                    provide_context=True,
                    python_callable=postamble,
                    dag=dag)

t4 = PythonOperator(task_id='failure',
                    provide_context=True,
                    python_callable=failure,
                    dag=dag,
                    trigger_rule='all_failed')

t2.set_upstream(t1)
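# Why the trailing space matters (editor's note; an assumption based on BashOperator's
# documented templating behaviour, not stated in the snippet above): bash_command is
# Jinja-templated, and a value ending in ".sh" or ".bash" is treated as a template *file*
# to be loaded from the DAG folder / template_searchpath, which fails for scripts that
# only exist on the worker at runtime. A trailing space keeps the value an inline command.
# A minimal sketch of the two forms:
#
#   BashOperator(task_id='inline_cmd',  bash_command='/bin/bash /opt/scripts/run.sh ')  # run as-is
#   BashOperator(task_id='as_template', bash_command='run.sh')  # resolved as a Jinja template file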
# if a task fails, retry it once after waiting
# at least 5 minutes
#'retries': 1,
#'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'dag_cron',
    default_args=default_args,
    description='A simple tutorial DAG',
    schedule_interval="20 3 3 * *",
)

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown), `doc` (plain text),
`doc_rst`, `doc_json`, `doc_yaml`, which get rendered in the UI's Task Instance Details page
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
import json
import pathlib

import airflow
import requests
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id="download_rocket_launches",
    start_date=airflow.utils.dates.days_ago(14),
    schedule_interval=None,
)

download_launches = BashOperator(
    task_id="download_launches",
    bash_command=
    "curl -o /tmp/launches.json 'https://launchlibrary.net/1.4/launch?next=5&mode=verbose'",
    dag=dag,
    executor_config={
        'request_memory': '128Mi',
        'limit_memory': '128Mi',
        'image': 'airflow/scipy:1.1.5'
    })


def _get_pictures():
    # Ensure directory exists
    pathlib.Path("/tmp/images").mkdir(parents=True, exist_ok=True)

    # Download all pictures in launches.json
    with open("/tmp/launches.json") as f:
        launches = json.load(f)
        image_urls = [
            launch["rocket"]["imageURL"] for launch in launches["launches"]
"retries": 1, "retry_delay": timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG("test_branch", default_args=default_args, schedule_interval=timedelta(minutes=5), catchup=False) t1 = BashOperator( task_id="init", bash_command="echo lol", params={"my_param": "Parameter I passed in"}, dag=dag, ) options = ["wowww", "wowww2"] t2 = BranchPythonOperator(task_id='branching', python_callable=lambda: random.choice(options), dag=dag) t3 = BashOperator( task_id="wowww", bash_command="echo wowwww", params={"my_param": "Parameter I passed in"}, dag=dag, ) t4 = DummyOperator(task_id='wowww2', trigger_rule='one_success', dag=dag)
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=10),
}

dag = DAG(
    dag_id='aadownloaddata',
    default_args=default_args,
    description='descargadedatos',
    dagrun_timeout=timedelta(minutes=2),
    schedule_interval=timedelta(days=1),
)

CreateDir = BashOperator(task_id='create_dir',
                         depends_on_past=False,
                         bash_command='mkdir -p /tmp/airflow/p2/',
                         dag=dag)

DownloadTemperatureData = BashOperator(
    task_id='download_temperature_data',
    depends_on_past=False,
    bash_command=
    'curl -o /tmp/airflow/p2/temperature.csv.zip https://raw.githubusercontent.com/manuparra/MaterialCC2020/master/temperature.csv.zip',
    dag=dag)

DownloadHumidityData = BashOperator(
    task_id='download_humidity_data',
    depends_on_past=False,
    bash_command=
    'curl -o /tmp/airflow/p2/humidity.csv.zip https://raw.githubusercontent.com/manuparra/MaterialCC2020/master/humidity.csv.zip',
    dag=dag)
dag = DAG( "extract_secmar_s3", default_args=default_args, max_active_runs=1, concurrency=5, catchup=False, schedule_interval="50 8 * * *", ) dag.doc_md = __doc__ start = DummyOperator(task_id="start", dag=dag) end = DummyOperator(task_id="end", dag=dag) download_s3 = BashOperator( task_id="download_s3", bash_command="mkdir {base} && aws s3 sync s3://secmar {base}".format( base=BASE_PATH), dag=dag, ) download_s3.set_downstream(start) for table in SECMAR_TABLES + ["operations_valides"]: command = "awk 'NR==1{{$0=tolower($0)}} 1' {in_path} > {tmp} && mv {tmp} {out_path}".format( tmp="/tmp/lower_" + table, in_path=BASE_PATH + "/" + table + ".csv", out_path=in_path(table), ) lowercase_header = BashOperator(task_id="lowercase_header_csv_" + table, bash_command=command, dag=dag) lowercase_header.set_upstream(start) lowercase_header.set_downstream(end)
"retries": 1, "retry_delay": timedelta(minutes=1), 'provide_context': True, } dag = DAG("Stock-Exchange-V4", default_args=default_args, schedule_interval="0 13 * * 6,0-3", catchup=False) task_read_stock_exchange_xlsx_file = BashOperator( task_id='Download-Stock-Exchange-Xlsx-File', bash_command= 'curl --retry 10 --output {0} -L -H "User-Agent:Chrome/61.0" --compressed "http://members.tsetmc.com/tsev2/excel/MarketWatchPlus.aspx?d=0"' .format( path.join( EXCEL_FILE_PATH, "{0}_{1}.{2}".format(EXCEL_FILE_NAME, date.today().strftime("%Y_%m_%d"), EXCEL_FILE_EXT_XLSX))), dag=dag, ) task_waiting_file_xlsx = FileSensor( task_id="Waiting-Excel-File", fs_conn_id="fs_temp", filepath=path.join( EXCEL_FILE_PATH, "{0}_{1}.{2}".format(EXCEL_FILE_NAME, date.today().strftime("%Y_%m_%d"), EXCEL_FILE_EXT_XLSX)), poke_interval=10, # every 10 seconds,
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from pendulum import Pendulum

args = {
    'owner': 'Anton Kostyliev',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='exercise_2',
    default_args=args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60),
)


def print_execution_date(execution_date: Pendulum, **context):
    print("Execution date: " + execution_date.to_iso8601_string())


with dag as dag:
    execution_date = PythonOperator(
        task_id="print_execution_date",
        python_callable=print_execution_date,
        provide_context=True
    )

    sleep1 = BashOperator(task_id='sleep1', bash_command="sleep 1")
    sleep5 = BashOperator(task_id='sleep5', bash_command="sleep 5")
    sleep10 = BashOperator(task_id='sleep10', bash_command="sleep 10")

    end = DummyOperator(task_id="finish_task")

    execution_date >> [sleep1, sleep5, sleep10] >> end
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.models import DAG

args = {
    'owner': 'Freddy Drennan',
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'retries': 2,
    'email_on_failure': True,
    'email_on_retry': True
}

dag = DAG(dag_id='restore_postgres_from_backup',
          default_args=args,
          schedule_interval='@daily',
          concurrency=1,
          max_active_runs=1,
          catchup=False)

task_1 = BashOperator(task_id='set_up_aws',
                      bash_command='. /home/scripts/R/shell/aws_configure',
                      dag=dag)

task_2 = BashOperator(
    task_id='get_backup_from_s3',
    bash_command='. /home/scripts/R/shell/get_backup_from_s3',
    dag=dag)

task_1 >> task_2
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'royh',
    'start_date': datetime(2020, 5, 5),
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup_by_default': False,
    'email_on_retry': False
}

dag = DAG('seo_rankings',
          default_args=default_args,
          schedule_interval='@daily')

t0 = BashOperator(task_id='where_am_i',
                  bash_command='pwd && cd ~ && ls',
                  dag=dag)

t1 = BashOperator(task_id='get_coaster_rankings',
                  bash_command='cd ~ && cd dags && python etsy_coaster_seo.py',
                  dag=dag)

t2 = BashOperator(
    task_id='get_luggage_rankings',
    bash_command='cd ~ && cd dags && python etsy_luggagetag_seo.py',
    dag=dag)

t0 >> t1 >> t2
'start_date': datetime(2021, 3, 16, 18),  # year, month, day and hour
'email': ['*****@*****.**', '*****@*****.**'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1)
}

# Define the DAG - flow
dag = DAG("treino-04",
          description='Parallelism',
          default_args=default_args,
          schedule_interval="*/10 * * * *")

start_processing = BashOperator(task_id='start-processing',
                                bash_command='echo Start Preprocessing! Vai!',
                                dag=dag)

get_data = BashOperator(
    task_id='get-data',
    bash_command=
    'curl http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip -o /usr/local/airflow/data/microdados_enade_2019.zip',
    dag=dag)


def unzip_file():
    with zipfile.ZipFile('/usr/local/airflow/data/microdados_enade_2019.zip',
                         'r') as zipped:
        zipped.extractall('/usr/local/airflow/data/')
https://github.com/apache/airflow/blob/master/airflow/example_dags/tutorial.py
"""
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('test_pipeline',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

t1 = BashOperator(
    task_id='execute_casa',
    bash_command='casa --nologger --nogui -c ./src/casa_script.py',
    dag=dag)
"owner": "airflow", "depends_on_past": False, "start_date": datetime(2019, 11, 6), "email": ["*****@*****.**"], "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=5) #"access_control": {"role1": {"can_dag_read", "can_dag_edit"}} # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG("rbac_test_2", default_args=default_args, schedule_interval=timedelta(minutes=1), catchup=False, access_control={'role1': ['can_dag_read']}) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag) t3 = BashOperator(task_id="sleep2", bash_command="sleep 5", retries=3, dag=dag) t2.set_upstream(t1) t3.set_upstream(t1)
        log_url=context.get('task_instance').log_url,
    )
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id='slack',
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='******',
    )
    return failed_alert.execute(context=context)


default_args = {'owner': 'rdumas',
                'depends_on_past': False,
                'start_date': datetime(2019, 11, 22),
                'email': ['*****@*****.**'],
                'email_on_failure': False,
                'email_on_success': False,
                'retries': 0,
                'retry_delay': timedelta(minutes=5),
                'on_failure_callback': task_fail_slack_alert
                }

dag = DAG('pull_miovision',
          default_args=default_args,
          schedule_interval='0 3 * * *')

# Add 3 hours to ensure that the data are at least 2 hours old
t1 = BashOperator(
    task_id='pull_miovision',
    bash_command='/etc/airflow/data_scripts/.venv/bin/python3 /etc/airflow/data_scripts/volumes/miovision/api/intersection_tmc.py run-api --path /etc/airflow/data_scripts/volumes/miovision/api/config.cfg --dupes',
    retries=0,
    dag=dag)
# 'pool': 'backfill',
# 'priority_weight': 10,
# 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'ats_daily',
    default_args=default_args,
    schedule_interval='0 0 * * *',
    catchup=False,
    max_active_runs=1)

python_executable = '~/venv/bin/python3.7'
python_script_path = '~/PycharmProjects/TwitterStats'

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='words_trends',
    bash_command='cd {};{} words.py trends'.format(python_script_path,
                                                   python_executable),
    dag=dag)

t2 = BashOperator(
    task_id='tweeter_promotion',
    bash_command='cd {};{} tweeter_promotion.py'.format(python_script_path,
                                                        python_executable),
    dag=dag)

hu = BashOperator(
    task_id='hashtag_update',
    bash_command='cd {};{} hashtag_update.py'.format(python_script_path,
                                                     python_executable),
    dag=dag)

t4 = BashOperator(
    task_id='findbots',
    bash_command='cd {};{} findbots.py {{{{ ds }}}}'.format(python_script_path,
                                                            python_executable),
'retry_delay': timedelta(minutes=1)
}

# DAG definition - flow
dag = DAG(
    "treino-03",
    description=
    "Fetches the Titanic data and computes the mean age for men or women",
    default_args=default_args,
    schedule_interval=timedelta(minutes=2)
    #schedule_interval="*/2 * * * *"
)

get_data = BashOperator(
    task_id='get_data',
    bash_command=
    'curl https://raw.githubusercontent.com/A3Data/hermione/master/hermione/file_text/train.csv -o /usr/local/airflow/data/train.csv',
    dag=dag)


def sorteia_h_m():
    return random.choice(['male', 'female'])


escolhe_h_m = PythonOperator(task_id='escolhe-h-m',
                             python_callable=sorteia_h_m,
                             dag=dag)


def Mouf(**context):
    value = context['task_instance'].xcom_pull(task_ids='escolhe-h-m')
"start_date": datetime(2019, 1, 1), "email": ["*****@*****.**"], "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG("random-fail", default_args=default_args, schedule_interval=timedelta(1)) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id="task1", bash_command="echo task1", dag=dag) t2 = BashOperator( task_id="task2", bash_command= "rnd=$RANDOM; echo $rnd; if [ $rnd -lt 20000 ]; then exit 1; fi", retries=0, dag=dag) t3 = BashOperator(task_id="task3", bash_command="echo task3", dag=dag) t1 >> t2 t2 >> t3
'owner': 'pardha',
'depends_on_past': False,
'start_date': datetime(2017, 5, 11),
'email': ['*****@*****.**'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1),
}

dag = DAG('HelloWorld', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators
t1 = BashOperator(
    task_id='task_1',
    bash_command='echo "Hello World from Task pardha"',
    dag=dag)

t2 = BashOperator(
    task_id='task_2',
    bash_command='echo "Hello World from Task 2"',
    dag=dag)

t3 = BashOperator(
    task_id='task_3',
    bash_command='echo "Hello World from Task 3"',
    dag=dag)

t4 = BashOperator(
    task_id='task_4',
    bash_command='echo "Hello World from Task 4"',
    xcom_push=True,
    dag=dag,
)

callSjson = SimpleHttpOperator(
    task_id="pass_JSON",
    method='POST',
    endpoint='/payload',
    data=json.dumps({"channel": "UK"}),
    headers={"Content-Type": "application/json"},
    http_conn_id='cloud_run_gcp_flask',
    xcom_push=True,
    dag=dag,
)

echoS = BashOperator(
    task_id="echo_sucess",
    bash_command="echo sucess",
    dag=dag,
)

echoF = BashOperator(
    task_id="echo_failure",
    bash_command="echo failure",
    dag=dag,
)

echoS.set_upstream(callS1)
echoS.set_upstream(callS2)
echoF.set_upstream(callF1)
"retry_delay": timedelta(minutes=5), # 'queue': 'bash_queue', # 'pool': 'backfill', # 'priority_weight': 10, # 'end_date': datetime(2016, 1, 1), } dag = DAG( "tutorial", default_args=default_args, schedule_interval=timedelta(days=1), catchup=False, ) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag) templated_command = """ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """ t3 = BashOperator( task_id="templated", bash_command=templated_command, params={"my_param": "Parameter I passed in"},
default_args = {
    'owner': 'user',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc':False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG('tutorial',
          default_args=default_args,
          description='my first DAG',
          schedule_interval='10 * * * * *')

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
    except (IOError, psycopg2.Error) as error:
        print("Error inserting dataframe to DB! ", error)
    else:
        print("Data already exists in the DB! Skipping...")


srcDir = '/home/ubuntu/fault-tolerant-airflow/src/spark/'

# Command to run remote spark batch processing
cmd = 'ssh [email protected] spark-submit' + ' ' + srcDir + 'PDS.py --master ec2-18-235-191-19.compute-1.amazonaws.com --deploy-mode=cluster'

objectKey = 's3n://de-yk-bucket/PDS/XETR/DailyAverages/' + str(todays_date_str) + '.csv'

# Bash operator that synchronizes the Deutsche XETR Public Dataset with my bucket stored in S3
s3_ingest_opr = BashOperator(
    task_id='s3_ingest',
    bash_command='aws s3 sync s3://deutsche-boerse-xetra-pds s3://de-yk-bucket/PDS/XETR/ ',
    dag=dag)

# Remote batch processing operator that calculates the daily averages of stock prices
spark_batch_opr = BashOperator(task_id='spark_batch', bash_command=cmd, dag=dag)

# S3 file sensor operator that senses the temporarily created csv file in S3
s3_file_sensor_opr = S3KeySensor(
    task_id='s3_file_sensor',
    poke_interval=60,
    timeout=10,
    soft_fail=True,
    bucket_key=objectKey,
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the calculated daily average prices in PostgreSQL
'start_date': datetime(2015, 6, 1),
'email': ['*****@*****.**'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
# 'queue': 'bash_queue',
# 'pool': 'backfill',
# 'priority_weight': 10,
# 'end_date': datetime(2016, 1, 1),
}

dag = DAG('test', default_args=default_args, schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='echo_cal', bash_command='cal', dag=dag)

t2 = BashOperator(task_id='sleep',
                  bash_command='sleep 5 && echo "Hello World"',
                  retries=3,
                  dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(task_id='templated',
          default_args=args,
          schedule_interval='@daily')

d1 = PythonOperator(task_id='task1',
                    provide_context=True,
                    python_callable=tt1,
                    op_kwargs={'param': 'apple'},
                    dag=dag)

d2 = PythonOperator(task_id='task2',
                    provide_context=True,
                    python_callable=tt2,
                    op_kwargs={'param': 'apple'},
                    dag=dag)

d3 = BashOperator(
    task_id='task3',
    bash_command='source /Users/jigeonho/airflow/venv/bin/activate',
    dag=dag)

d5 = BashOperator(
    task_id='task5',
    bash_command='/bin/bash /Users/jigeonho/airflow/function/runvenv.sh ',
    dag=dag)

# d4 = PythonOperator(task_id='task4',
#                     provide_context=True,
#                     python_callable=pdf_main,
#                     # op_kwargs={'param': 'apple'},
#                     dag=dag)

d1 >> d2 >> d3
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
from airflow.operators.bash_operator import BashOperator
from airflow import DAG
from datetime import datetime, timedelta

dataflow_dag = DAG(dag_id="dataflow_pipeline",
                   start_date=datetime(2017, 2, 2),
                   schedule_interval=timedelta(seconds=15),
                   max_active_runs=1,
                   catchup=True)

print_path_task = BashOperator(dag=dataflow_dag,
                               bash_command="pwd",
                               task_id="test_upstream_task_pwd")

jar_task = DataFlowJavaOperator(dag=dataflow_dag,
                                jar="/home/airflow/gcs/dags/"
                                "jar/dataflow_pipeline-bundled-1.0.jar",
                                options={
                                    "project": "hybrid-elysium-118418",
                                    "stagingLocation":
                                    "gs://hybrid-elysium-118418/dataflow/"
                                },
                                task_id="dataflow_pipeline")

print_path_task.set_downstream(jar_task)
dag_id="atd_visionzero_reassociate_missing_locations_staging", description="This script re-processes location associations in VZD", default_args=args, schedule_interval="0 3 * * *", dagrun_timeout=timedelta(minutes=60), tags=["staging", "visionzero"], ) # # This process will find the locations for CR3 crashes that do not have one but # fall into a location and they are not mainlanes. # process_cr3 = BashOperator( task_id="process_cr3", bash_command= "python3 ~/dags/python_scripts/atd_vzd_update_cr3_locations.py", env=environment_vars, dag=dag, ) # # This process will find the locations for Non-CR3 crashes that do not have one but # fall into a location and they are not mainlanes. # process_noncr3 = BashOperator( task_id="process_noncr3", bash_command= "python3 ~/dags/python_scripts/atd_vzd_update_noncr3_locations.py", env=environment_vars, dag=dag, )
{{ params.base_path }}/{{ params.filename }} \
{{ params.base_path }}
"""

pyspark_date_bash_command = """
spark-submit --master {{ params.master }} \
{{ params.base_path }}/{{ params.filename }} \
{{ ts }} {{ params.base_path }}
"""

# Gather the training data for our classifier
extract_features_operator = BashOperator(
    task_id="pyspark_extract_features",
    bash_command=pyspark_bash_command,
    params={
        "master": "local[8]",
        "filename": "ch08/extract_features.py",
        "base_path": "{}/".format(PROJECT_HOME)
    },
    dag=training_dag)

# Train and persist the classifier model
train_classifier_model_operator = BashOperator(
    task_id="pyspark_train_classifier_model",
    bash_command=pyspark_bash_command,
    params={
        "master": "local[8]",
        "filename": "ch08/train_spark_mllib_model.py",
        "base_path": "{}/".format(PROJECT_HOME)
    },
'start_date': datetime(2020, 1, 30),
'email': ['*****@*****.**'],
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=1),
}

dag: DAG = DAG(
    'project_dag',
    default_args=default_args,
    schedule_interval=timedelta(hours=1)
)

# operators (executors) for each task we must run, named according to the BigData assignment numbers.
project_2 = BashOperator(
    task_id='project_2',
    bash_command='/home/jeff/BigData/002-NetCat/run_howto.sh ',
    dag=dag
)

project_3 = BashOperator(
    task_id='project_3',
    bash_command='/home/jeff/BigData/003-SkillsTest/run.sh ',
    dag=dag
)

project_4 = BashOperator(
    task_id='project_4',
    bash_command='/home/jeff/BigData/004-WordCount/python/run ',
    dag=dag
)

project_8 = BashOperator(
    task_id='project_8',
    bash_command='/home/jeff/BigData/008-WordCountInScala/run ',
    dag=dag
cnopts.hostkeys = None

#3. Create a pysftp connection with host="35.222.158.208", username="******", password="******", cnopts=cnopts

json_remote_file = "comments." + str(time.time()) + ".json"

#4. Do a sftp with following parameters
#   Local file (comments.json)
#   Remote file path "Upload/YourName/json_remote_file"
#   Closes the connection srv.close()

#5. Add your name to the DAG name below
with DAG('airflow_sftp_YourName',
         default_args=default_args,
         schedule_interval='*/3 * * * *',) as dag:

    download_json_task = PythonOperator(task_id='Get_comments',
                                        python_callable=download_json_comments)

    print_json_ok = BashOperator(
        task_id='Download_check',
        bash_command='echo "Json comments file downloaded successfully"')

    upload_json_sftp_task = PythonOperator(task_id='Upload_comments_sftp',
                                           python_callable=upload_sftp)

    print_upload_ok = BashOperator(
        task_id='Upload_check',
        bash_command='echo "Json comments file uploaded successfully"')

    #6. Create workflow DAG with following sequence:
    #   download_json_task, print_json_ok, upload_json_sftp_task, print_upload_ok
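    # One possible completion of step 6, inside the `with DAG(...)` block above
    # (a sketch only; the original exercise leaves this line to the reader):
    #
    #     download_json_task >> print_json_ok >> upload_json_sftp_task >> print_upload_ok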