t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig",
    parameters="key1=value1 key2=value2",
    trigger_rule="all_done",
    dag=dag)

t4.set_upstream(branching)
t5.set_upstream(t4)
t5.set_downstream(join)
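
The same dependencies can also be written with Airflow's bitshift operators, which read in the direction the tasks run. A sketch equivalent to the three calls above, assuming branching and join are the DummyOperator tasks defined earlier in this DAG:

branching >> t4 >> t5 >> join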

t6 = QuboleOperator(
    task_id='presto_cmd',
    command_type='prestocmd',
    query='show tables',
    dag=dag)

t7 = QuboleOperator(
    task_id='shell_cmd',
    command_type="shellcmd",
    script_location="s3://public-qubole/qbol-library/scripts/shellx.sh",
    parameters="param1 param2",
    trigger_rule="all_done",
    dag=dag)
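QuboleOperator templates several of its fields, so values such as the cluster label can come from params via Jinja instead of being hard-coded. A minimal sketch in the style of the newer shipped example DAG; the task id and the variable name t8 are made up for illustration:

t8 = QuboleOperator(
    task_id='hive_show_table',
    command_type='hivecmd',
    query='show tables',
    cluster_label='{{ params.cluster_label }}',  # rendered from params below
    params={
        'cluster_label': 'default',  # hypothetical default label
    },
    dag=dag)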

start.set_downstream(cleanup)

# Task = t1 (create schema)
# cleanup ---> t1 (create schema)
t1 = QuboleOperator(
    task_id='hive_create_schema',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql",
    cluster_label='hadoop2',
    tags='airflow_example_run',  # attach tags to the Qubole command; dag_id, task_id and run_id are auto-attached
    qubole_conn_id='qubole_default',  # connection used to submit commands to QDS; "qubole_default" is used if unset
    dag=dag)
cleanup.set_downstream(t1)
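
Since qubole_conn_id is resolved per task, one DAG can submit commands to more than one QDS account. A minimal sketch, assuming a second Airflow connection named qubole_prod exists; the task id and connection name are hypothetical:

t1_prod = QuboleOperator(
    task_id='hive_create_schema_prod',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql",
    cluster_label='hadoop2',
    qubole_conn_id='qubole_prod',  # hypothetical non-default connection
    dag=dag)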

# Task = join1
join1 = DummyOperator(task_id='join1', trigger_rule="all_success", dag=dag)

# Task = t2 (dbimport categories)
# t1 ---> t2 (dbimport categories) ---> join1
t2 = QuboleOperator(task_id='db_import_categories',
                    command_type='dbimportcmd',
                    mode=1,
                    hive_table='ecommerce_db.categories',
                    db_table='categories',
                    db_parallelism=2,
                    dbtap_id="508",
                    customer_cluster_label='hadoop2',
                    use_customer_cluster='true',
                    dag=dag)
t1.set_downstream(t2)
t2.set_downstream(join1)
Example #4
t4 = QuboleOperator(
    task_id='create_page_lookup_nonredirect',
    command_type='hivecmd',
    query="DROP TABLE if exists page_lookup_nonredirect; \
            CREATE TABLE page_lookup_nonredirect (redirect_id bigint, redirect_title STRING, true_title STRING, page_id BIGINT, page_version BIGINT); \
            INSERT OVERWRITE TABLE page_lookup_nonredirect \
            select original_page.page_id redirect_id, original_page.page_title redirect_title, \
                    final_page.page_title as true_title, final_page.page_id, final_page.page_latest \
            from page final_page join redirect on (redirect.page_title = final_page.page_title) \
                join page original_page on (redirect.rd_from = original_page.page_id);",
    dag=dag)

t5 = QuboleOperator(
    task_id='create_page_lookup',
    command_type='hivecmd',
    query= "DROP TABLE if exists page_lookup; \
            CREATE TABLE page_lookup (redirect_id bigint, redirect_title STRING, true_title STRING, page_id BIGINT, page_version BIGINT); \
            INSERT OVERWRITE TABLE page_lookup \
            SELECT redirect_id, redirect_title, true_title, page_id, page_version \
            FROM ( \
                SELECT redirect_id, redirect_title, true_title, page_id, page_version \
                FROM page_lookup_nonredirect \
                UNION ALL \
                SELECT redirect_id, redirect_title, true_title, page_id, page_version \
                FROM page_lookup_redirect \
            ) u;",
    dag=dag)

t1.set_downstream(join)
t2.set_downstream(join)
join.set_downstream(t3)
join.set_downstream(t4)
t3.set_downstream(t5)
t4.set_downstream(t5)
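
A downstream Python task can fetch the local result file of a finished Qubole command through the operator's get_results() helper, as Airflow's example_qubole_operator DAG does. A minimal sketch against t4 and t5 above; the comparison step is illustrative only:

import filecmp

from airflow.operators.python_operator import PythonOperator

def compare_result(**kwargs):
    ti = kwargs['ti']
    # download the results of the two upstream Hive commands and compare them
    result_t4 = t4.get_results(ti)
    result_t5 = t5.get_results(ti)
    return filecmp.cmp(result_t4, result_t5)

compare = PythonOperator(
    task_id='compare_result',
    python_callable=compare_result,
    provide_context=True,
    dag=dag)
t5.set_downstream(compare)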


           WHERE not pvs.page_title RLIKE '(MEDIA|SPECIAL||Talk|User|User_talk|Project|Project_talk|File|File_talk|MediaWiki|MediaWiki_talk|Template|Template_talk|Help|Help_talk|Category|Category_talk|Portal|Wikipedia|Wikipedia_talk|upload|Special)\:(.*)' and \
                pvs.page_title RLIKE '^([A-Z])(.*)' and \
                not pvs.page_title RLIKE '(.*).(jpg|gif|png|JPG|GIF|PNG|txt|ico)$' and \
                pvs.page_title <> '404_error/' and \
                pvs.page_title <> 'Main_Page' and \
                pvs.page_title <> 'Hypertext_Transfer_Protocol' and \
                pvs.page_title <> 'Favicon.ico' and \
                pvs.page_title <> 'Search' and \
                pvs.`date` = '{{ ds }}' \
          GROUP BY \
                regexp_replace (reflect ('java.net.URLDecoder','decode', reflect ('java.net.URLDecoder','decode',pvs.page_title)),'^\s*([a-zA-Z0-9]+).*','$1');",
    dag=dag)

t6 = QuboleOperator(
    task_id='populate_normalized_pagecounts',
    command_type="hivecmd",
    query="INSERT overwrite table normalized_pagecounts partition(`date`='{{ ds }}') \
           SELECT pl.page_id page_id, REGEXP_REPLACE(pl.true_title, '_', ' ') page_title, pl.true_title page_url, views, bytes_sent \
           FROM page_lookup pl JOIN filtered_pagecounts fp \
           ON fp.page_title = pl.redirect_title where fp.`date`='{{ ds }}';",
    dag=dag)

t1.set_downstream(t2)
t1.set_downstream(t3)
t1.set_downstream(t4)
t5.set_upstream(t2)
t5.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
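
The same fan-out reads more compactly with Airflow's list-valued bitshift wiring, which is equivalent to the set_upstream/set_downstream calls above:

t1 >> [t2, t3, t4]
[t2, t3] >> t5
[t4, t5] >> t6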