def test_hyphen_args_note_id(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='sparkcmd', note_id="123", dag=dag)

        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[0], "--note-id=123")

    def test_get_hook(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='hivecmd', dag=dag)

        hook = task.get_hook()
        self.assertEqual(hook.__class__, QuboleHook)

    def test_position_args_parameters(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='pigcmd',
                                  parameters="key1=value1 key2=value2", dag=dag)

        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                         "key1=value1")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                         "key2=value2")

        task = QuboleOperator(task_id=TASK_ID, command_type='hadoopcmd',
                              sub_command="s3distcp --src s3n://airflow/source_hadoopcmd " +
                                          "--dest s3n://airflow/destination_hadoopcmd", dag=dag)

        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                         "s3distcp")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                         "--src")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[3],
                         "s3n://airflow/source_hadoopcmd")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[4],
                         "--dest")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[5],
                         "s3n://airflow/destination_hadoopcmd")
    def test_init_with_template_connection(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, dag=dag,
                                  qubole_conn_id="{{ dag_run.conf['qubole_conn_id'] }}")

        result = task.render_template('qubole_conn_id', "{{ qubole_conn_id }}",
                                      {'qubole_conn_id': TEMPLATE_CONN})
        self.assertEqual(task.task_id, TASK_ID)
        self.assertEqual(result, TEMPLATE_CONN)
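        # qubole_conn_id is a templated field on QuboleOperator, so it can be
        # resolved at run time (here from dag_run.conf) before the hook is built.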
Example #5
    def test_position_args_parameters(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='pigcmd',
                                  parameters="key1=value1 key2=value2", dag=dag)

        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
                         "key1=value1")
        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
                         "key2=value2")
def compare_result(ds, **kwargs):
    ti = kwargs['ti']
    r1 = t1.get_results(ti)
    r2 = t2.get_results(ti)
    return filecmp.cmp(r1, r2)
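# In the complete example this callable is typically attached to a
# PythonOperator (provide_context=True, python_callable=compare_result)
# declared after t1 and t2 below, so both result files exist when it runs.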


t1 = QuboleOperator(
    task_id='hive_show_table',
    command_type='hivecmd',
    query='show tables',
    cluster_label='{{ params.cluster_label }}',
    fetch_logs=True,
    # If fetch_logs=True, the Qubole command logs are fetched and appended to
    # the corresponding Airflow task log
    tags='airflow_example_run',
    # Tags to attach to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',
    # Connection ID used to submit commands to QDS; "qubole_default" is used if not set
    dag=dag,
    params={
        'cluster_label': 'default',
    }
)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    dag=dag)

default_args = {
    'owner': 'REPLACE',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email': ['REPLACE'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Workshop-ML-Model-REPLACE', default_args=default_args)
start = DummyOperator(task_id='start', dag=dag)

ingestData = QuboleOperator(task_id='ingestData',
                            command_type="sparkcmd",
                            note_id="1271",
                            qubole_conn_id='qubole_default',
                            dag=dag)

analyze_data = QuboleOperator(task_id='analyze_data',
                              command_type="sparkcmd",
                              note_id="1274",
                              qubole_conn_id='qubole_default',
                              dag=dag)

# Spark Command - Run a Notebook
build_dashboards = QuboleOperator(task_id='build_dashboards',
                                  command_type="sparkcmd",
                                  note_id="1273",
                                  qubole_conn_id='qubole_default',
                                  dag=dag)
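
# One plausible ordering for these notebook tasks (assumed; the original
# snippet ends before any dependencies are declared):
start >> ingestData >> analyze_data >> build_dashboards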
Example #8
DAG_DEFAULTS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('import_and_analyze_movies', default_args=DAG_DEFAULTS)

start = DummyOperator(task_id='start', dag=dag)

import_from_web = QuboleOperator(task_id='import_web_data',
                                 command_type="shellcmd",
                                 script=web_import_script,
                                 dag=dag)

make_hive_table = QuboleOperator(
    task_id='create_movies_table',
    command_type='hivecmd',
    query=create_hive_table,
    cluster_label='default',
    # Tags to attach to the Qubole command; dag_id, task_id and run_id are attached automatically
    tags='create_movies_table',
    # Connection ID used to submit commands to QDS; "qubole_default" is used if not set
    qubole_conn_id='qubole_default',
    dag=dag)

query_hive_table = QuboleOperator(
    task_id='count_movies_table',
    def test_position_args_parameters(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID,
                                  command_type='pigcmd',
                                  parameters="key1=value1 key2=value2",
                                  dag=dag)

        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
            "key1=value1")
        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[2],
            "key2=value2")

        cmd = "s3distcp --src s3n://airflow/source_hadoopcmd --dest s3n://airflow/destination_hadoopcmd"
        task = QuboleOperator(task_id=TASK_ID + "_1",
                              command_type='hadoopcmd',
                              dag=dag,
                              sub_command=cmd)

        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[1],
            "s3distcp")
        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[2], "--src")
        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[3],
            "s3n://airflow/source_hadoopcmd")
        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[4], "--dest")
        self.assertEqual(
            task.get_hook().create_cmd_args({'run_id': 'dummy'})[5],
            "s3n://airflow/destination_hadoopcmd")
Example #10
prog = '''
import scala.math.random

import org.apache.spark._

object SparkPi {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Spark Pi")
    val spark = new SparkContext(conf)
    val slices = if (args.length > 0) args(0).toInt else 2
    val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
    val count = spark.parallelize(1 until n, slices).map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x*x + y*y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)
    spark.stop()
  }
}
'''

dag = DAG('SampleDag',
          default_args=default_args,
          description='A simple tutorial DAG jmsample',
          schedule_interval=timedelta(days=1))

# Spark Command - Scala Program
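# `program` holds the inline Scala source, `language='scala'` tells QDS how to
# compile it, and `arguments` (here '--class SparkPi') is presumably passed
# through to spark-submit to select the main class.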
QuboleOperator(task_id='spark_cmd',
               command_type="sparkcmd",
               program=prog,
               language='scala',
               arguments='--class SparkPi',
               cluster_label='spark-baseline',
               qubole_conn_id='qubole_default',
               dag=dag)
Example #11
def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


python_task = PythonOperator(task_id='python_task',
                             provide_context=True,
                             python_callable=print_context,
                             dag=dag)

qubole_task = QuboleOperator(
    task_id='qubole_shell_command',
    command_type='shellcmd',
    script="sleep 60",
    cluster_label='default',
    # If fetch_logs=True, the Qubole command logs are fetched and appended to
    # the corresponding Airflow task log
    fetch_logs=True,
    # Tags to attach to the Qubole command; dag_id, task_id and run_id are attached automatically
    tags='airflow_example_run',
    # Connection ID used to submit commands to QDS; "qubole_default" is used if not set
    qubole_conn_id='qubole_default',
    dag=dag)

bash_task = BashOperator(
    task_id='bash_task',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    pool='test',
    dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='http_default',
                              endpoint='',
Example #12
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_static_data', default_args=default_args, schedule_interval='@once')

t1 = QuboleOperator(
    task_id='create_page_table',
    command_type='hivecmd',
    query="DROP TABLE if exists page;\
          CREATE EXTERNAL TABLE page (page_id BIGINT, page_latest BIGINT, page_title STRING)\
          ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
          LOCATION 's3n://paid-qubole/default-datasets/demotrends/page/';",
    dag=dag)

t2 = QuboleOperator(
    task_id='create_redirect_table',
    command_type='hivecmd',
    query="DROP TABLE if exists redirect;\
          CREATE EXTERNAL TABLE redirect( rd_from BIGINT, page_title STRING) \
          ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
          LOCATION 's3n://paid-qubole/default-datasets/demotrends/redirect/';",
    dag=dag)

join = DummyOperator(
    task_id='join',
Example #13
        "parallelism": 1
    }, {
        "name": "products",
        "parallelism": 1
    }, {
        "name": "salesperson",
        "parallelism": 1
    }]
}

import_table_array = []

start = DummyOperator(task_id='start', dag=dag)

for table in conf['tables']:
    task_id = "db_import_%s" % (table['name'])
    import_table_array.append(
        QuboleOperator(task_id=task_id,
                       command_type='dbimportcmd',
                       mode=1,
                       db_name="",
                       hive_table=table['name'],
                       db_table=table['name'],
                       parallelism=table['parallelism'],
                       dbtap_id=Variable.get(conf['db_tap_variable']),
                       dag=dag))

end = DummyOperator(task_id='end', dag=dag)

start >> import_table_array >> end
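# `start` fans out to every task in import_table_array and the list then fans
# back in to `end`; Airflow's >>/<< operators accept a list of tasks on either side.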
Example #14
dag = DAG('ecommerce-airflow-demo',
          default_args=default_args,
          schedule_interval='@daily')

dag.doc_md = __doc__

# Task = start
start = DummyOperator(task_id='start', dag=dag)

# Task = cleanup (cleanup schema)
# start ---> cleanup (cleanup schema)
cleanup = QuboleOperator(
    task_id='hive_schema_cleanup',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_schema_cleanup.hql",
    cluster_label='hadoop2',
    # Tags to attach to the Qubole command; dag_id, task_id and run_id are attached automatically
    tags='airflow_example_run',
    # Connection ID used to submit commands to QDS; "qubole_default" is used if not set
    qubole_conn_id='qubole_default',
    dag=dag)
start.set_downstream(cleanup)

# Task = t1 (create schema)
# cleanup ---> t1 (create schemas)
t1 = QuboleOperator(
    task_id='hive_create_schema',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql",
    cluster_label='hadoop2',
    # Tags to attach to the Qubole command; dag_id, task_id and run_id are attached automatically
    tags='airflow_example_run',
    def test_init_with_default_connection(self):
        op = QuboleOperator(task_id=TASK_ID)
        self.assertEqual(op.task_id, TASK_ID)
        self.assertEqual(op.qubole_conn_id, DEFAULT_CONN)
Example #16
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar -mapper wc -numReduceTasks 0 -input s3://paid-qubole/HadoopAPITests/data/3.tsv -output s3://paid-qubole/HadoopAPITests/data/3_wc',
    cluster_label='default',
    fetch_logs=True,
    dag=dag)

t5 = QuboleOperator(
    task_id='pig_cmd',
    command_type="pigcmd",
    script_location="s3://public-qubole/qbol-library/scripts/script1-hadoop-s3-small.pig",
    parameters="key1=value1 key2=value2",
    trigger_rule="all_done",
    dag=dag)

t4.set_upstream(branching)
t5.set_upstream(t4)
t5.set_downstream(join)
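
# `join` uses trigger_rule='one_success', so it fires as soon as any of its
# upstream tasks succeeds; t5 uses 'all_done', so it runs once t4 finishes
# regardless of t4's final state.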
Example #17
default_args = {
    'start_date': datetime(2016, 8, 1),
    'end_date': datetime(2016, 8, 5),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_daily_data', default_args=default_args, schedule_interval='@daily')

def_loc = 's3://BUCKET/DEF_LOC'


t1 = QuboleOperator(
    task_id='fetch_pagecount_data',
    command_type="shellcmd",
    script="wget -r https://dumps.wikimedia.org/other/pagecounts-raw/{{ ds.split('-')[0] }}/{{ '{0}-{1}'.format(ds.split('-')[0], ds.split('-')[1]) }} \
            -P pagecounts/{{ ds }} -A pagecounts-{{ macros.ds_format(ds, '%%Y-%%m-%%d', '%%Y%%m%%d') }}-*.gz; \
            s3cmd -c /usr/lib/hustler/s3cfg sync pagecounts/{{ ds }} %s/wikitrends/pagecounts/"%(def_loc),
    dag=dag)
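# The wget/s3cmd script above is %-formatted with def_loc, so literal '%'
# characters in the strftime pattern are escaped as '%%'.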

t2 = QuboleOperator(
    task_id='create_pagecount_table',
    command_type="hivecmd",
    query= "CREATE EXTERNAL TABLE IF NOT EXISTS pagecounts (`group` STRING, page_title STRING, `views` BIGINT, bytes_sent BIGINT) \
            PARTITIONED BY (`date` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' LOCATION '{0}/demotrends/pagecounts/'; \
            ALTER TABLE pagecounts recover partitions;".
        format(def_loc),
    dag=dag)

t3 = QuboleOperator(
    task_id='create_filtered_pagecounts_table',