# NOTE(review): this chunk of an Airflow QuboleOperator example DAG has had its
# original newlines stripped — the top-level statements below are fused onto one
# physical line and will not parse as-is; restore the original line breaks
# before running. Nothing but comments is changed here.
# The fragment opens mid-call: sub_command/cluster_label/fetch_logs/dag close a
# QuboleOperator whose opening (presumably `t4 = QuboleOperator(` — see
# t4.set_upstream below) lies before this chunk.
# Visible here: t5 (pig script), t6 (presto `show tables`), t7 (shell script)
# Qubole commands, and wiring branching -> t4 -> t5 -> join.
sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc', cluster_label='default', fetch_logs=True, dag=dag) t5 = QuboleOperator( task_id='pig_cmd', command_type="pigcmd", script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig", parameters="key1=value1 key2=value2", trigger_rule="all_done", dag=dag) t4.set_upstream(branching) t5.set_upstream(t4) t5.set_downstream(join) t6 = QuboleOperator( task_id='presto_cmd', command_type='prestocmd', query='show tables', dag=dag) t7 = QuboleOperator( task_id='shell_cmd', command_type="shellcmd", script_location="s3://public-qubole/qbol-library/scripts/shellx.sh", parameters="param1 param2", trigger_rule="all_done", dag=dag)
# NOTE(review): whitespace-mangled chunk — statements are fused onto one
# physical line; restore original line breaks before executing. Only comments
# are added here.
# Opens mid-call: `params={'cluster_label': 'default'})` closes an operator
# whose opening lies before this chunk. The t5/t6/t7 definitions and the
# branching/join wiring below repeat, byte-for-byte, definitions that also
# appear elsewhere in this file — likely overlapping extraction chunks of the
# same upstream example; verify against the original file and de-duplicate.
params={ 'cluster_label': 'default', } ) t5 = QuboleOperator( task_id='pig_cmd', command_type="pigcmd", script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig", parameters="key1=value1 key2=value2", trigger_rule="all_done", dag=dag) t4.set_upstream(branching) t5.set_upstream(t4) t5.set_downstream(join) t6 = QuboleOperator( task_id='presto_cmd', command_type='prestocmd', query='show tables', dag=dag) t7 = QuboleOperator( task_id='shell_cmd', command_type="shellcmd", script_location="s3://public-qubole/qbol-library/scripts/shellx.sh", parameters="param1 param2", trigger_rule="all_done", dag=dag)
# NOTE(review): whitespace-mangled chunk of an e-commerce example DAG —
# statements and original `# Task = ...` comments are fused onto one physical
# line (everything after the first `#` below is swallowed as a comment in this
# collapsed form). Restore original line breaks before running; only comments
# are added here.
# Opens mid-call (`dag=dag)` closes an operator defined before this chunk,
# presumably `cleanup` or `start`). Visible here: start -> cleanup -> t1
# (hive schema-creation script on cluster 'hadoop2'), a join1 DummyOperator,
# and the opening of t2 (dbimportcmd for ecommerce_db.categories) which is
# cut off mid-argument-list at the end of this chunk.
dag=dag) start.set_downstream(cleanup) # Task = t1 (create schema) # cleanup ---> t1 (create schemas) t1 = QuboleOperator( task_id='hive_create_schema', command_type='hivecmd', script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql", cluster_label='hadoop2', tags= 'airflow_example_run', # To attach tags to qubole command, auto attach 3 tags - dag_id, task_id, run_id qubole_conn_id= 'qubole_default', # Connection id to submit commands inside QDS, if not set "qubole_default" is used dag=dag) cleanup.set_downstream(t1) # Task = join1 join1 = DummyOperator(task_id='join1', trigger_rule="all_success", dag=dag) # Task = t2 (dbimport categories) # t1 ---> t2 (dbimport categories) ---> join1 t2 = QuboleOperator(task_id='db_import_categories', command_type='dbimportcmd', mode=1, hive_table='ecommerce_db.categories', db_table='categories', db_parallelism=2, dbtap_id="508", customer_cluster_label='hadoop2', use_customer_cluster='true',
select original_page.page_id redirect_id, original_page.page_title redirect_title, \ final_page.page_title as true_title, final_page.page_id, final_page.page_latest \ from page final_page join redirect on (redirect.page_title = final_page.page_title) \ join page original_page on (redirect.rd_from = original_page.page_id);", dag=dag) t5 = QuboleOperator( task_id='create_page_lookup', command_type='hivecmd', query= "DROP TABLE if exists page_lookup; \ CREATE TABLE page_lookup (redirect_id bigint, redirect_title STRING, true_title STRING, page_id BIGINT, page_version BIGINT); \ INSERT OVERWRITE TABLE page_lookup \ SELECT redirect_id, redirect_title, true_title, page_id, page_version \ FROM ( \ SELECT redirect_id, redirect_title, true_title, page_id, page_version \ FROM page_lookup_nonredirect \ UNION ALL \ SELECT redirect_id, redirect_title, true_title, page_id, page_version \ FROM page_lookup_redirect \ ) u;", dag=dag) t1.set_downstream(join) t2.set_downstream(join) join.set_downstream(t3) join.set_downstream(t4) t3.set_downstream(t5) t4.set_downstream(t5)
# NOTE(review): the chunk above is whitespace-mangled — it opens INSIDE an
# unterminated hive `query=` string literal (the page-redirect SELECT tail),
# so no comment could be placed before it without corrupting the string, and
# the original backslash-newline continuations inside the SQL strings have
# collapsed to literal "\ " sequences. Restore original line breaks before
# running. Visible here: t5 ('create_page_lookup', hivecmd: rebuilds
# page_lookup as UNION ALL of page_lookup_nonredirect + page_lookup_redirect)
# and wiring t1,t2 -> join -> t3,t4 -> t5.
WHERE not pvs.page_title RLIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\:(.*)' and \ pvs.page_title RLIKE '^([A-Z])(.*)' and \ not pvs.page_title RLIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and \ pvs.page_title <> '404_error/' and \ pvs.page_title <> 'Main_Page' and \ pvs.page_title <> 'Hypertext_Transfer_Protocol' and \ pvs.page_title <> 'Favicon.ico' and \ pvs.page_title <> 'Search' and \ pvs.`date` = '{{ ds }}' \ GROUP BY \ regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\s*([a-zA-Z0-9]+).*','$1');", dag=dag) t6 = QuboleOperator( task_id='populate_normalized_pagecounts', command_type="hivecmd", query="INSERT overwrite table normalized_pagecounts partition(`date`='{{ ds }}') \ SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, views, bytes_sent \ FROM page_lookup pl JOIN filtered_pagecounts fp \ ON fp.page_title = pl.redirect_title where fp.`date`='{{ ds }}';", dag=dag) t1.set_downstream(t2) t1.set_downstream(t3) t1.set_downstream(t4) t5.set_upstream(t2) t5.set_upstream(t3) t6.set_upstream(t4) t6.set_upstream(t5)
# NOTE(review): the chunk above is whitespace-mangled — it opens INSIDE an
# unterminated hive `query=` string literal (a WHERE clause filtering wiki
# page titles on an operator defined before this chunk), so no comment could
# be placed before it without corrupting the string; the backslash-newline
# continuations in the SQL have collapsed to literal "\ ". Restore original
# line breaks before running. Visible here: t6
# ('populate_normalized_pagecounts', hivecmd: joins page_lookup with
# filtered_pagecounts into the `date`='{{ ds }}' partition — '{{ ds }}' is an
# Airflow template rendered at run time) and wiring t1 -> t2,t3,t4;
# t2,t3 -> t5; t4,t5 -> t6.