t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)
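# hadoop_jar_cmd above submits a Hadoop Streaming job from the referenced jar:
# the Unix `wc` command acts as the mapper and -numReduceTasks 0 makes it a
# map-only job, so each map task writes its wc output for a split of 3.tsv to
# the 3_wc output location.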

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig",
    parameters="key1=value1 key2=value2",
    trigger_rule="all_done",
    dag=dag)
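# With trigger_rule="all_done", pig_cmd runs once all of its upstream tasks have
# finished, regardless of whether they succeeded or failed (the default rule is
# "all_success").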

t4.set_upstream(branching)
t5.set_upstream(t4)
t5.set_downstream(join)

t6 = QuboleOperator(
    task_id='presto_cmd',
    command_type='prestocmd',
    query='show tables',
    dag=dag)

t7 = QuboleOperator(
    task_id='shell_cmd',
    command_type="shellcmd",
    script_location="s3://public-qubole/qbol-library/scripts/shellx.sh",
    parameters="param1 param2",
    trigger_rule="all_done",
    fetch_logs=True,
    dag=dag,
    params={
        'cluster_label': 'default',
    }
)

join1 = DummyOperator(task_id='join1', trigger_rule="all_success", dag=dag)

# Task = t2 (dbimport categories)
# t1 ---> t2 (dbimport categories) ---> join1
t2 = QuboleOperator(task_id='db_import_categories',
                    command_type='dbimportcmd',
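                    # mode=1 is Qubole's simple DB import mode: a full-table copy
                    # from the DbTap data store identified by dbtap_id into the
                    # Hive table below.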
                    mode=1,
                    hive_table='ecommerce_db.categories',
                    db_table='categories',
                    db_parallelism=2,
                    dbtap_id="508",
                    customer_cluster_label='hadoop2',
                    use_customer_cluster='true',
                    qubole_conn_id='qubole_default',
                    dag=dag)
t2.set_upstream(t1)
t2.set_downstream(join1)

# Task = t3 (dbimport customers)
# start ---> t3 (dbimport customers) ---> join1
t3 = QuboleOperator(task_id='db_import_customers',
                    command_type='dbimportcmd',
                    mode=1,
                    hive_table='ecommerce_db.customers',
                    db_table='customers',
                    db_parallelism=2,
                    dbtap_id="508",
                    customer_cluster_label='hadoop2',
                    use_customer_cluster='true',
                    qubole_conn_id='qubole_default',
                    dag=dag)
Example #4
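
# NOTE: print_context is not defined in this excerpt; the helper below is an
# assumption, modelled on the callable of the same name in Airflow's
# example_python_operator DAG, added only so the PythonOperator call is runnable.
from pprint import pprint


def print_context(*args, **kwargs):
    """Print whatever positional arguments and Airflow context the task receives."""
    pprint(args)
    pprint(kwargs)
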
python_task = PythonOperator(task_id='python_task',  # task_id not shown in the excerpt; assumed to mirror the variable name
                             python_callable=print_context,
                             dag=dag)

qubole_task = QuboleOperator(
    task_id='qubole_task',
    command_type='shellcmd',
    script='ls /usr/lib/airflow',
    cluster_label='airflow-demo',
    # If True, fetch the Qubole command logs and append them to the corresponding
    # Airflow task log.
    fetch_logs=True,
    # Three tags are attached to the Qubole command automatically: dag_id, task_id
    # and run_id.
    # Connection id used to submit commands to QDS; "qubole_default" is used when
    # this is not set.
    qubole_conn_id='qubole_default',
    dag=dag)

bash_task = BashOperator(
    task_id='bash_task',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='http_default',
                              endpoint='',
                              request_params={},
                              response_check=lambda response: "Google" in str(response.content),
                              poke_interval=5,
                              dag=dag)
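# Assumes the http_default connection points at a page whose body contains the
# string "Google" (for example https://www.google.com); the sensor re-pokes
# every 5 seconds until response_check returns True.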

qubole_task.set_upstream(python_task)
bash_task.set_upstream(python_task)
http_sensor_task.set_upstream(python_task)
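# python_task fans out to all three tasks above; qubole_task, bash_task and
# http_sensor_task run independently once python_task succeeds.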
           WHERE not pvs.page_title RLIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\:(.*)' and \
                pvs.page_title RLIKE '^([A-Z])(.*)' and \
                not pvs.page_title RLIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and \
                pvs.page_title <> '404_error/' and \
                pvs.page_title <> 'Main_Page' and \
                pvs.page_title <> 'Hypertext_Transfer_Protocol' and \
                pvs.page_title <> 'Favicon.ico' and \
                pvs.page_title <> 'Search' and \
                pvs.`date` = '{{ ds }}' \
          GROUP BY \
                regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\s*([a-zA-Z0-9]+).*','$1');",
    dag=dag)

t6 = QuboleOperator(
    task_id='populate_normalized_pagecounts',
    command_type="hivecmd",
    query="INSERT overwrite table normalized_pagecounts partition(`date`='{{ ds }}') \
           SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, views, bytes_sent \
           FROM page_lookup pl JOIN filtered_pagecounts fp \
           ON fp.page_title = pl.redirect_title where fp.`date`='{{ ds }}';",
    dag=dag)

t1.set_downstream(t2)
t1.set_downstream(t3)
t1.set_downstream(t4)
t5.set_upstream(t2)
t5.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
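
# Resulting dependencies: t1 -> {t2, t3, t4}; {t2, t3} -> t5; {t4, t5} -> t6.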