# The contents of the obtained file_path_sp directory
file_names_sp = os.listdir(file_path_sp)

# Build the ul file path
file_path_ul = task_name_ul + '/' + option + '/bin/'

# The contents of the obtained file_path_ul directory
file_names_ul = os.listdir(file_path_ul)

# Loop over each sp file name.
# NOTE: these task_ids repeat if a directory contains more than one file;
# appending file_name_sp to the id would keep them unique.
for file_name_sp in file_names_sp:
    if os.path.isfile(file_path_sp + file_name_sp):
        task_option_path = '/usr/bin/perl ' + task_name_sp + '/' + option + '/bin/' + file_name_sp + ' '
        task_option_path_ld = '/usr/bin/perl ' + task_name_ld + '/' + option + '/bin/StructuralLoad.pl '
        t = BashOperator(task_id='LD_' + option, bash_command=task_option_path_ld, dag=dag)
        t.set_upstream(branching)
        dummy_follow = BashOperator(task_id='SP_' + option, bash_command=task_option_path, dag=dag)
        t.set_downstream(dummy_follow)
        dummy_follow.set_downstream(join)

# Loop over each ul file name
for file_name_ul in file_names_ul:
    if os.path.isfile(file_path_ul + file_name_ul):
        task_option_path = '/usr/bin/perl ' + task_name_ul + '/' + option + '/bin/' + file_name_ul + ' '
        dummy_follow = BashOperator(task_id='UL_' + option, bash_command=task_option_path, dag=dag)
        dummy_follow.set_upstream(branching)
        dummy_follow.set_downstream(join)
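# The two loops above assume `branching` and `join` tasks defined earlier in
# the file. A minimal sketch of that scaffolding, in the style of Airflow's
# branching example; the callable, options list, and trigger rule below are
# assumptions, not the original author's code:
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

options = ['optionA', 'optionB']  # hypothetical branch names

branching = BranchPythonOperator(
    task_id='branching',
    # Return the task_id of the branch that should run for this DagRun.
    python_callable=lambda: 'LD_' + options[0],
    dag=dag)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',  # resume once any surviving branch finishes
    dag=dag)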
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': min_10,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('scrape_cdc', default_args=default_args)

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

t1 = BashOperator(
    task_id='testairflow',
    bash_command=f'python {file_path}',
    dag=dag)

t1.set_downstream(run_this)

if __name__ == "__main__":
    dag.cli()
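# The PythonOperator above assumes a `print_context` callable defined earlier.
# A minimal sketch of one, after Airflow's classic example_python_operator
# (the body is an assumption, not the original author's code):
def print_context(ds, **kwargs):
    # With provide_context=True, Airflow passes the task-instance context as
    # keyword arguments and the execution date as `ds`.
    print(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'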
args = {
    'owner': 'airflow',
    # seven_days_ago is defined earlier in the source; in the stock Airflow
    # example it is datetime.combine(datetime.today() - timedelta(7),
    # datetime.min.time()).
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='example_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
task.set_downstream(run_this_last)
def get_alphanumeric_task_id(a_string):  # def line inferred from the call site below
    isalnum = a_string.isalnum()
    # print('Is String Alphanumeric :', isalnum)
    alphanumeric_filter = filter(str.isalnum, a_string)
    alphanumeric_string = "".join(alphanumeric_filter)
    # Replace / from the file path (already stripped by the filter above;
    # kept for safety)
    return alphanumeric_string.replace("/", "__")

with models.DAG(
        'import_ingestion',
        # Run the DAG once; trigger manually thereafter
        schedule_interval='@once',
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait', trigger_rule="all_done")
    end = DummyOperator(task_id='end', trigger_rule="all_done")

    for blob in blobs:
        # print(blob.name)
        print_file = BashOperator(
            task_id='print_file_' + get_alphanumeric_task_id(blob.name),
            # Concatenate the blob name outside the quoted string; the original
            # put "+blob.name" inside the shell command itself.
            bash_command='echo "hello ' + blob.name + '"',
            dag=dag)
        start.set_downstream(print_file)
        print_file.set_downstream(wait)

    wait >> end
default_dag_args = {  # assumed opener; earlier keys (e.g. 'start_date') are not shown in the source
    # To email on failure or retry, set the 'email' arg to your email and
    # enable emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # Retries are disabled here; retry_delay only applies when retries > 0.
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

from google.cloud import storage

client = storage.Client()
i = 0

with models.DAG('loop_over_gcs_bucket_files_example',
                schedule_interval=None,
                default_args=default_dag_args) as dag:
    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait', trigger_rule=TriggerRule.ONE_SUCCESS)

    for blob in client.list_blobs('myBucket', prefix='myFolder/mySubfolder'):
        # Task ids must only contain alphanumeric chars (plus -_.), so use a
        # counter rather than the blob name.
        bash_cmd = "echo " + str(blob.name)
        i = i + 1
        bash_operator = BashOperator(task_id='bash_operator' + str(i),
                                     bash_command=bash_cmd)
        start.set_downstream(bash_operator)
        bash_operator.set_downstream(wait)

    end = DummyOperator(task_id='end')
    wait >> end
dag = DAG(  # assumed opener (not shown in the source fragment)
    'docker_sample',
    default_args=default_args,
    schedule_interval=timedelta(minutes=10))

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

t3 = DockerOperator(
    api_version='1.19',
    docker_url='tcp://localhost:2375',  # set your docker daemon URL
    command='/bin/sleep 30',
    image='centos:latest',
    network_mode='bridge',
    task_id='docker_op_tester',
    dag=dag)

t4 = BashOperator(
    task_id='print_hello',
    bash_command='echo "hello world!!!"',
    dag=dag)

t1.set_downstream(t2)
t1.set_downstream(t3)
t3.set_downstream(t4)
with models.DAG(
        'search_console_with_quata',
        # Triggered manually (schedule_interval=None)
        schedule_interval=None,
        default_args=default_dag_args) as dag:

    # dummy tasks - proceed only on success
    start = DummyOperator(task_id='start')
    wait = DummyOperator(task_id='wait')
    end = DummyOperator(task_id='end')

    for single_date in daterange(start_date, end_date):
        temp_date = single_date.strftime("%Y-%m-%d")
        day_after_single_date = single_date + datetime.timedelta(days=1)
        day_after_single_date = day_after_single_date.strftime("%Y-%m-%d")

        # notice trigger_rule="all_done"
        bash_run_report_remotly_cmd = (
            'gcloud beta compute --project myProject ssh search-console '
            '--internal-ip --zone us-central1-c --command '
            '"sudo -u omid python /home/omid/search_analytics_api_sample.py '
            'sc-domain:investing.com ' + temp_date + ' ' + day_after_single_date + '"')
        run_report_remotly = BashOperator(
            task_id='run_report_remotly_' + temp_date,
            retries=2,
            retry_delay=datetime.timedelta(minutes=15),
            retry_exponential_backoff=True,
            max_retry_delay=datetime.timedelta(hours=48),
            bash_command=bash_run_report_remotly_cmd,
            trigger_rule="all_done")
        start.set_downstream(run_report_remotly)
        run_report_remotly.set_downstream(wait)

    mv_to_data_lake = BashOperator(
        task_id='mv_to_data_lake',
        bash_command='gcloud beta compute --project gap---all-sites-1245 ssh search-console '
                     '--internal-ip --zone us-central1-c --command '
                     '"sudo -u omid gsutil -m mv -r /tmp/search* gs://data_lake_ingestion_us/search_console/"',
        dag=dag)

    load = """bq --location US load --source_format CSV --replace=true --skip_leading_rows 1 --allow_quoted_newlines --quote "" DATA_LAKE_INGESTION_US.search_console_partition gs://data_lake_ingestion_us/search_console/*"""
    load_to_data_lake = BashOperator(
        task_id='load_to_data_lake',
        bash_command=load,
        dag=dag)

    wait >> mv_to_data_lake >> load_to_data_lake >> end
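# The loop above relies on a `daterange` helper and start_date/end_date values
# defined elsewhere in the source file. A minimal sketch of such a generator
# (this implementation is an assumption, not the original author's code):
import datetime

def daterange(start_date, end_date):
    # Yield one date per day over the half-open interval [start_date, end_date).
    for n in range((end_date - start_date).days):
        yield start_date + datetime.timedelta(days=n)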
    schedule_interval=None,
    start_date=datetime.now() - timedelta(minutes=1))

MysqlToHive = BashOperator(
    task_id='MysqlToHive',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/MysqlToHive.sh """,
    dag=dag)

csvToHive = BashOperator(
    task_id='csvToHive',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/csvToHive.sh """,
    dag=dag)

ReportingTables1 = BashOperator(
    task_id='ReportingTables1',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/ReportingTables1.sh """,
    dag=dag)

ReportingTables2 = BashOperator(
    task_id='ReportingTables2',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/ReportingTables2.sh """,
    dag=dag)

MysqlToHive.set_downstream(csvToHive)
csvToHive.set_downstream(ReportingTables1)
ReportingTables1.set_downstream(ReportingTables2)
    dag=dag)

import_sql_hive = BashOperator(
    task_id='import_sql_hive',
    bash_command="""sh /home/cloudera/Downloads/practical/import_sql_hive.sh -u root -p /user/cloudera/password.txt -d practical_exercise_1 """,
    dag=dag)

create_csv = BashOperator(
    task_id='create_csv',
    bash_command="""python3 /home/cloudera/Downloads/practical/practical_exercise_data_generator.py --create_csv """,
    dag=dag)

import_csv_hive = BashOperator(
    task_id='import_csv_hive',
    bash_command="""sh /home/cloudera/Downloads/practical/import_csv_hive.sh -d practical_exercise_1 """,
    dag=dag)

generate_report = BashOperator(
    task_id='generate_report',
    bash_command="""sh /home/cloudera/Downloads/practical/generate_report.sh -d practical_exercise_1 """,
    dag=dag)

create_csv.set_downstream(import_csv_hive)
load_data.set_downstream(import_sql_hive)
import_sql_hive.set_downstream(generate_report)
import_csv_hive.set_downstream(generate_report)
"%Y%m%d") + '.json' bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator( task_id='bash_api_call_GET_DESKTOP_TRAFFIC' + single_date.strftime("%Y%m%d"), bash_command=bash_cmd) bash_cmd2 = """gsutil mv /tmp/file_""" + single_date.strftime( "%Y%m%d") + '.json gs://data_lake/similar_web_desktop_traffic/' bash_gsutil_mv_files_to_ingestion = BashOperator( task_id='bash_gsutil_mv_files_to_ingestion' + single_date.strftime("%Y%m%d"), bash_command=bash_cmd2) #bash_cmd="""ls""" #bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(task_id='bash_opr_'+str(item),bash_command=bash_cmd) start.set_downstream(bash_api_call_GET_DESKTOP_TRAFFIC) bash_api_call_GET_DESKTOP_TRAFFIC.set_downstream( bash_gsutil_mv_files_to_ingestion) bash_gsutil_mv_files_to_ingestion.set_downstream(wait) load_to_bg_GET_DESKTOP_TRAFFIC = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( task_id='load_to_bg_GET_DESKTOP_TRAFFIC', source_objects=['*'], write_disposition='WRITE_TRUNCATE', #overwrite? create_disposition='CREATE_IF_NEEDED', bucket=DST_BUCKET, destination_project_dataset_table=dst_table, autodetect='true') end = DummyOperator(task_id='end') wait >> load_to_bg_GET_DESKTOP_TRAFFIC >> end
region="us-east1") bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator( task_id="bq_load_csv_profeco", bucket='gnp-storage', source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"], destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table", autodetect=True, source_format="CSV", field_delimiter=',', create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0, write_disposition="WRITE_APPEND", max_bad_records=0) delete_cluster = DataprocClusterDeleteOperator( task_id="delete_dataproc_cluster", cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", region="us-east1", trigger_rule=TriggerRule.ALL_DONE) unzip_files.dag = dag unzip_files.set_downstream(create_cluster) create_cluster.set_downstream(PythonOperator) PythonOperator.set_downstream([submit_pyspark, bq_load_profeco_data]) submit_pyspark.set_downstream(delete_cluster)
bash_command=" hadoop fs -mkdir -p /user/cloudera/workshop/process/ ", dag=dag) Create_Database = BashOperator( task_id='Create_Database', bash_command="""impala-shell -q "create database practical_exercise_1;" """, dag=dag) Sqoop_Job= BashOperator( task_id='Sqoop_Job', bash_command="sqoop job --meta-connect jdbc:hsqldb:hsql://localhost:16000/sqoop --create practical_exercise_1.activitylog -- import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/root_pwd.txt --table activitylog -m 4 --hive-import --hive-database practical_exercise_1 --hive-table activitylog --incremental append --check-column id --last-value 0 ", dag=dag) External_table = BashOperator( task_id='External_table', bash_command="""hive -e "CREATE EXTERNAL TABLE practical_exercise_1.user_upload_dump ( user_id int, file_name STRING, timestamp int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/user/cloudera/workshop/process/' tblproperties ('skip.header.line.count'='1');" """ , dag=dag) Creating_table_user_total = BashOperator( task_id='Creating_table_user_total', bash_command="""impala-shell -q "create table if not exists practical_exercise_1.user_total(time_ran timestamp, total_users bigint, users_added bigint);" """, dag=dag) Starting_Sqoop_Metajob.set_downstream(Sqoop_Job) Creating_Directories.set_downstream(External_table) Create_Database.set_downstream(External_table) Create_Database.set_downstream(Sqoop_Job) Create_Database.set_downstream(Creating_table_user_total)
default_args = {  # assumed opener (not shown in the source fragment)
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('docker_sample',
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

t2 = BashOperator(task_id='sleep', bash_command='sleep 5', retries=3, dag=dag)

t3 = DockerOperator(api_version='1.21',
                    command='/bin/sleep 30',
                    image='busybox:latest',
                    network_mode='bridge',
                    task_id='docker_op_tester',
                    dag=dag)

t4 = BashOperator(task_id='print_hello',
                  bash_command='echo "hello world!!!"',
                  dag=dag)

t1.set_downstream(t2)
t1.set_downstream(t3)
t3.set_downstream(t4)
    destination_project_dataset_table=PROJECT_ID + ".data_analysis.flights_delays",
    autodetect=True,
    source_format="AVRO",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0
)

# delete_cluster = DataprocClusterDeleteOperator(
#     task_id="delete_cluster",
#     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
#     region='asia-east1',
#     trigger_rule=TriggerRule.ALL_DONE
# )

create_cluster.dag = dag
create_cluster.set_downstream(submit_sqoop)
submit_sqoop.set_downstream(bq_load_flight_delays)
# bq_load_flight_delays.set_downstream(delete_cluster)
default_args = {  # assumed opener (not shown in the source fragment)
    'owner': 'milseiei',
    'depends_on_past': False,
    'start_date': datetime(2017, 1, 12),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Time_splitter',
          schedule_interval='0,30 * * * *',
          default_args=default_args)

t1 = BashOperator(task_id='task_1_date_data',
                  bash_command='date > /home/ubuntu/airflow/dags/data.txt',
                  dag=dag)

t2 = BashOperator(
    task_id='task_2_split_time',
    bash_command='python /home/ubuntu/airflow/dags/split_into_time.py',
    dag=dag)

t3 = BashOperator(
    task_id='task_3_split_into_mins',
    bash_command='python /home/ubuntu/airflow/dags/split_into_mins.py',
    dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
valid_chars = '-_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

def sanitize(text):
    # Keep only characters that are legal in an Airflow task_id.
    return ''.join(c for c in text if c in valid_chars)

# This is where pipeline-generated bash commands come in...
bash_commands = ('echo "hi russ"', 'echo "hello again"')
conclusion_command = 'echo "all done"'

conclusion = BashOperator(task_id='conclude',
                          bash_command=conclusion_command,
                          dag=dag)

for cmd in bash_commands:
    # Strings are immutable, so rebind; the original bare cmd.rstrip() was a no-op.
    cmd = cmd.rstrip()
    run_this = BashOperator(
        task_id=sanitize(cmd),
        bash_command=cmd,
        dag=dag)
    run_this.set_downstream(conclusion)

# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
#     # pushes an XCom without a specific target, just by returning it
#     return value_2
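# The commented-out helpers above only push XComs; the matching pull side, in
# the style of Airflow's example_xcom DAG, would look like the sketch below
# (the task_ids and the function name are assumptions, not the original
# author's code):
def puller(**kwargs):
    ti = kwargs['ti']
    # Pull the value pushed explicitly by key...
    v1 = ti.xcom_pull(key='value from pusher 1', task_ids='push')
    # ...and the value push_by_returning returned (stored under the default key).
    v2 = ti.xcom_pull(task_ids='push_by_returning')
    print(v1, v2)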
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('init_airflow_practical_exercise',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

Init_MysqlToHive = BashOperator(
    task_id='Init_MysqlToHive',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/Init_MysqlToHive.sh """,
    dag=dag)

Init_csvToHive = BashOperator(
    task_id='Init_csvToHive',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/Init_csvToHive.sh """,
    dag=dag)

Init_ReportingTables2 = BashOperator(
    task_id='Init_ReportingTables2',
    bash_command=""" sh /home/cloudera/Documents/PracticalExercise2/Init_ReportingTables2.sh """,
    dag=dag)

Init_MysqlToHive.set_downstream(Init_csvToHive)
Init_csvToHive.set_downstream(Init_ReportingTables2)
    dag=dag)

create_findisactive = BashOperator(
    task_id='create_findisactive',
    bash_command=""" impala-shell -q "drop table if exists practical_exercise_1.find_isactive; create table practical_exercise_1.find_isactive as select sums.user_id,sums.total_updates, sums.total_inserts, sums.total_deletes, sums.upload_count, d.last_active_type, d.is_active from practical_exercise_1.sums join (select c.user_id, t, activitylog.type last_active_type, c.is_active from (select activitylog.user_id, max(timestamps) t, if(unix_timestamp()-max(timestamps)<=172800,'TRUE','FALSE') is_active from practical_exercise_1.activitylog group by user_id)c right outer join practical_exercise_1.activitylog on c.t=activitylog.timestamps where c.user_id=activitylog.user_id)d on d.user_id=sums.user_id;" """,
    dag=dag)

create_userreport = BashOperator(
    task_id='create_userreport',
    bash_command=""" impala-shell -q "drop table if exists practical_exercise_1.user_report; create table practical_exercise_1.user_report as select user.id, find_isactive.total_updates, find_isactive.total_inserts, find_isactive.total_deletes,find_isactive.upload_count,find_isactive.last_active_type,find_isactive.is_active from practical_exercise_1.find_isactive right outer join practical_exercise_1.user on user.id=find_isactive.user_id;" """,
    dag=dag)

insert_usertotal = BashOperator(
    task_id='insert_usertotal',
    bash_command=""" impala-shell -q "insert into practical_exercise_1.user_total select current_timestamp(), sub1.t , case when sub2.t1 is NULL then sub1.t when sub2.t1 is not NULL then sub1.t-sub2.t1 end from (select count(distinct id) as t from practical_exercise_1.user)sub1, (select max(total_users) t1 from user_total) sub2;" """,
    dag=dag)

load_data.set_downstream(addition_data)
load_data.set_downstream(import_user)
create_csv.set_downstream(move_csv_to_hdfs)
move_csv_to_hdfs.set_downstream(create_userdump)
move_csv_to_hdfs.set_downstream(move_csv_to_archive)
addition_data.set_downstream(create_sums)
create_sums.set_downstream(create_findisactive)
create_findisactive.set_downstream(create_userreport)
import_user.set_downstream(insert_usertotal)
create_userdump.set_downstream(create_sums)
dag = DAG('initialization',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))

create_db = BashOperator(
    task_id='create_db',
    bash_command=""" impala-shell -q "create database if not exists practical_exercise_1;" """,
    dag=dag)

meta_store = BashOperator(
    task_id='meta_store',
    bash_command=""" nohup sqoop metastore & """,
    dag=dag)

sqoop_job = BashOperator(
    task_id='sqoop_job',
    bash_command=""" sqoop job --meta-connect jdbc:hsqldb:hsql://localhost:16000/sqoop --create practical_exercise_1.activitylog -- import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/pwd.txt --table activitylog -m 2 --hive-import --hive-database practical_exercise_1 --hive-table activitylog --incremental append --check-column id --last-value 0 """,
    dag=dag)

user_totaltable = BashOperator(
    task_id='user_totaltable',
    bash_command=""" impala-shell -q "create table if not exists practical_exercise_1.user_total(time_ran timestamp, total_users bigint, users_added bigint);" """,
    dag=dag)

create_db.set_downstream(sqoop_job)
meta_store.set_downstream(sqoop_job)
create_db.set_downstream(user_totaltable)
    bash_command=ACTIVATE_VENV +
                 cmd_fmt.format(cmd='build_set_mongo-hadoop_1_3_3.sh '))

pre_work = BashOperator(
    task_id='prework',
    env=options_env,
    retries=1,
    dag=dag,
    bash_command=ACTIVATE_VENV +
                 cmd_fmt.format(cmd='generate_sql_statements.sh '))

etl_work = BashOperator(
    task_id='etl_work',
    env=options_env,
    retries=3,
    dag=dag,
    bash_command=ACTIVATE_VENV +
                 cmd_fmt.format(cmd='workflow_start_resume_mongo.sh '))

csv_prepare_work = BashOperator(
    task_id='csv_prepare_work',
    env=options_env,
    retries=3,
    dag=dag,
    bash_command=ACTIVATE_VENV +
                 cmd_fmt.format(cmd='workflow_start_resume_csv.sh '))

final_work = BashOperator(
    task_id='final_work',
    env=options_env,
    retries=3,
    dag=dag,
    bash_command=ACTIVATE_VENV +
                 cmd_fmt.format(cmd='workflow_start_resume_export.sh '))

# job dependencies
pre_clean.set_downstream([repo1_scp, repo2_scp, repo3_scp])
pre_build.set_upstream([repo1_scp, repo2_scp, repo3_scp])
build.set_upstream(pre_build)
pre_work.set_upstream(build)
etl_work.set_upstream(pre_work)
csv_prepare_work.set_upstream(etl_work)
final_work.set_upstream(csv_prepare_work)

# for test purposes
username = mongodb_creds[1]
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime

default_args = {
    'owner': 'root',
    'start_date': datetime.today(),
}

dag = DAG('zaim', default_args=default_args, schedule_interval='00 00 * * *')

task1 = BashOperator(
    task_id='retrieve_zaim_data',
    bash_command='python /app/zaim_downloader.py',
    dag=dag)

task2 = BashOperator(
    task_id='update_data',
    # The trailing space is deliberate: it keeps Airflow's Jinja templating
    # from treating the .sh path as a template file to render.
    bash_command='/app/update_data.sh ',
    dag=dag)

task1.set_downstream(task2)
bash_cleanup_cmd = ('gsutil rm gs://myBucket/google/gam/example_report/'
                    '*report_example_using_service_account_with_date_range_'
                    + temp_date + '*')
bash_cleanup = BashOperator(task_id='bash_cleanup_' + temp_date,
                            retries=0,
                            bash_command=bash_cleanup_cmd,
                            trigger_rule="all_done")

# notice trigger_rule="all_done"
bash_run_report_remotly_cmd = (
    'gcloud beta compute --project myProjectName ssh scheduler2 --internal-ip '
    '--zone us-central1-a --command "sudo -u omid python3 '
    '/home/omid/gam_data_transfer/report_example_using_service_account_with_date_range.py '
    '--start ' + temp_date + ' --end ' + temp_date + '"')
run_report_remotly = BashOperator(
    task_id='run_report_remotly_' + temp_date,
    retries=0,
    bash_command=bash_run_report_remotly_cmd,
    trigger_rule="all_done")

start.set_downstream(bash_cleanup)
bash_cleanup.set_downstream(run_report_remotly)
run_report_remotly.set_downstream(wait)

# notice trigger_rule="all_done"
run_gsutil_mv = BashOperator(task_id='bash_gsutil_mv_cmd',
                             retries=0,
                             bash_command=bash_gsutil_mv_cmd,
                             trigger_rule="all_done")

load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='load_to_bq_from_gcs',
    source_objects='*',
    skip_leading_rows=1,
    create_disposition='CREATE_NEVER',
    write_disposition='WRITE_TRUNCATE',  # overwrite the destination table
    bucket='myBucket/google/gam/example_report',
task_id="hdfs_to_gcs", bash_command= "gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'", dag=dag) bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator( task_id="bq_load_flight_delays", bucket="bigdata-etl-2_flights", source_objects=["sqoop_output/part.20190515_*.avro"], destination_project_dataset_table=PROJECT_ID + ".data_flights.flights_delays", autodetect=True, source_format="AVRO", create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0, write_disposition="WRITE_APPEND", max_bad_records=0) # delete_cluster = DataprocClusterDeleteOperator( # task_id='delete_dataproc_cluster', # cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", # region='asia-east1', # trigger_rule=TriggerRule.ALL_DONE # ) create_cluster.dag = dag create_cluster.set_downstream(sqoop_inc_import) sqoop_inc_import.set_downstream(hdfs_to_gcs) hdfs_to_gcs.set_downstream(bq_load_flight_delays) # bq_load_delays_by_distance.set_downstream(delete_cluster)
Create_user_report = BashOperator(
    task_id='Create_user_report',
    bash_command=""" impala-shell -q "create table practical_exercise_1.user_report(user_id bigint, total_updates bigint, total_inserts bigint, total_deletes bigint, last_activity_type string, is_active boolean, upload_count bigint);" """,
    dag=dag)

Insert_user_report = BashOperator(
    task_id='Insert_user_report',
    bash_command=""" NOW=$(date +%s); impala-shell -q "insert into practical_exercise_1.user_report select a.user_id,COALESCE(b.co,0) as total_updates,COALESCE(c.co,0) as total_inserts, COALESCE(d.co,0) as total_deletes, e.co as last_activity_type, COALESCE(f.co,FALSE) as is_active, COALESCE(g.co,0) as upload_count from (select id as user_id from practical_exercise_1.user group by id) as a left join (select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='UPDATE' group by user_id) as b on a.user_id=b.user_id left join (select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='INSERT' group by user_id) as c on a.user_id=c.user_id left join(select user_id, count(user_id) as co from practical_exercise_1.activitylog where type='DELETE' group by user_id) as d on a.user_id=d.user_id left join (SELECT a.user_id, a.type as co FROM practical_exercise_1.activitylog a INNER JOIN (SELECT user_id, MAX(\`timestamp\`) as ti FROM practical_exercise_1.activitylog GROUP BY user_id ) AS b ON a.user_id = b.user_id AND a.\`timestamp\` = b.ti) as e on a.user_id=e.user_id left join (select user_id, if(count(*) = 0, FALSE, TRUE) as co from practical_exercise_1.activitylog where \`timestamp\` > $NOW-172800 group by user_id) as f on a.user_id=f.user_id left join (select user_id, count(user_id) as co from practical_exercise_1.user_upload_dump group by user_id) as g on a.user_id=g.user_id;" """,
    dag=dag)

Insert_user_total = BashOperator(
    task_id='Insert_user_total',
    bash_command=""" impala-shell -q "insert into practical_exercise_1.user_total select current_timestamp(), sub1.t , case when sub2.t1 is NULL then sub1.t when sub2.t1 is not NULL then sub1.t-sub2.t1 end from (select count(distinct id) as t from practical_exercise_1.user)sub1, (select max(total_users) t1 from practical_exercise_1.user_total) sub2;" """,
    dag=dag)

generating_the_MySql_data.set_downstream(Sqoop_import_user)
generating_the_MySql_data.set_downstream(Sqoop_import_activitylog)
generating_the_CSV_data.set_downstream(CSV_to_HDFS)
CSV_to_HDFS.set_downstream(Archiving)
CSV_to_HDFS.set_downstream(Drop_user_report_table)
Sqoop_import_user.set_downstream(Drop_user_report_table)
Sqoop_import_user.set_downstream(Insert_user_total)
Sqoop_import_activitylog.set_downstream(Drop_user_report_table)
Drop_user_report_table.set_downstream(Create_user_report)
Create_user_report.set_downstream(Insert_user_report)