task_id='hadoop_jar_cmd', command_type='hadoopcmd', sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc', cluster_label='default', fetch_logs=True, dag=dag) t5 = QuboleOperator( task_id='pig_cmd', command_type="pigcmd", script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig", parameters="key1=value1 key2=value2", trigger_rule="all_done", dag=dag) t4.set_upstream(branching) t5.set_upstream(t4) t5.set_downstream(join) t6 = QuboleOperator( task_id='presto_cmd', command_type='prestocmd', query='show tables', dag=dag) t7 = QuboleOperator( task_id='shell_cmd', command_type="shellcmd", script_location="s3://public-qubole/qbol-library/scripts/shellx.sh", parameters="param1 param2", trigger_rule="all_done",
fetch_logs=True, dag=dag, params={ 'cluster_label': 'default', } ) t5 = QuboleOperator( task_id='pig_cmd', command_type="pigcmd", script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig", parameters="key1=value1 key2=value2", trigger_rule="all_done", dag=dag) t4.set_upstream(branching) t5.set_upstream(t4) t5.set_downstream(join) t6 = QuboleOperator( task_id='presto_cmd', command_type='prestocmd', query='show tables', dag=dag) t7 = QuboleOperator( task_id='shell_cmd', command_type="shellcmd", script_location="s3://public-qubole/qbol-library/scripts/shellx.sh", parameters="param1 param2", trigger_rule="all_done",
# Join point for the two parallel dbimport tasks below; only fires when
# both succeed.
join1 = DummyOperator(
    task_id='join1',
    trigger_rule="all_success",
    dag=dag)

# t1 ---> t2 (dbimport categories) ---> join1
t2 = QuboleOperator(
    task_id='db_import_categories',
    command_type='dbimportcmd',
    mode=1,
    hive_table='ecommerce_db.categories',
    db_table='categories',
    db_parallelism=2,
    dbtap_id="508",
    customer_cluster_label='hadoop2',
    use_customer_cluster='true',
    qubole_conn_id='qubole_default',
    dag=dag)
t2.set_upstream(t1)
t2.set_downstream(join1)

# start ---> t3 (dbimport customers) ---> join1
t3 = QuboleOperator(
    task_id='db_import_customers',
    command_type='dbimportcmd',
    mode=1,
    hive_table='ecommerce_db.customers',
    db_table='customers',
    db_parallelism=2,
    dbtap_id="508",
    customer_cluster_label='hadoop2',
    use_customer_cluster='true',
    qubole_conn_id='qubole_default',
    dag=dag)
python_callable=print_context, dag=dag) qubole_task = QuboleOperator( task_id='qubole_task', command_type='shellcmd', script='ls /usr/lib/airflow', cluster_label='airflow-demo', fetch_logs= True, # If true, will fetch qubole command logs and concatenate them into corresponding airflow task logs # To attach tags to qubole command, auto attach 3 tags - dag_id, task_id, run_id qubole_conn_id= 'qubole_default', # Connection id to submit commands inside QDS, if not set "qubole_default" is used dag=dag) bash_task = BashOperator( task_id='bash_task', bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"', dag=dag) http_sensor_task = HttpSensor(task_id='http_sensor_task', http_conn_id='http_default', endpoint='', request_params={}, response_check=lambda response: True if "Google" in str(response.content) else False, poke_interval=5, dag=dag) qubole_task.set_upstream(python_task) bash_task.set_upstream(python_task) http_sensor_task.set_upstream(python_task)
WHERE not pvs.page_title RLIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\:(.*)' and \ pvs.page_title RLIKE '^([A-Z])(.*)' and \ not pvs.page_title RLIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and \ pvs.page_title <> '404_error/' and \ pvs.page_title <> 'Main_Page' and \ pvs.page_title <> 'Hypertext_Transfer_Protocol' and \ pvs.page_title <> 'Favicon.ico' and \ pvs.page_title <> 'Search' and \ pvs.`date` = '{{ ds }}' \ GROUP BY \ regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\s*([a-zA-Z0-9]+).*','$1');", dag=dag) t6 = QuboleOperator( task_id='populate_normalized_pagecounts', command_type="hivecmd", query="INSERT overwrite table normalized_pagecounts partition(`date`='{{ ds }}') \ SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, views, bytes_sent \ FROM page_lookup pl JOIN filtered_pagecounts fp \ ON fp.page_title = pl.redirect_title where fp.`date`='{{ ds }}';", dag=dag) t1.set_downstream(t2) t1.set_downstream(t3) t1.set_downstream(t4) t5.set_upstream(t2) t5.set_upstream(t3) t6.set_upstream(t4) t6.set_upstream(t5)