def test_position_args_parameters(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID,
                                  command_type='pigcmd',
                                  parameters="key1=value1 key2=value2",
                                  dag=dag)

        cmd_args = task.get_hook().create_cmd_args({'run_id': 'dummy'})
        self.assertEqual(cmd_args[1], "key1=value1")
        self.assertEqual(cmd_args[2], "key2=value2")

        cmd = "s3distcp --src s3n://airflow/source_hadoopcmd --dest s3n://airflow/destination_hadoopcmd"
        task = QuboleOperator(task_id=TASK_ID,
                              command_type='hadoopcmd',
                              dag=dag,
                              sub_command=cmd)

        cmd_args = task.get_hook().create_cmd_args({'run_id': 'dummy'})
        self.assertEqual(cmd_args[1], "s3distcp")
        self.assertEqual(cmd_args[2], "--src")
        self.assertEqual(cmd_args[3], "s3n://airflow/source_hadoopcmd")
        self.assertEqual(cmd_args[4], "--dest")
        self.assertEqual(cmd_args[5], "s3n://airflow/destination_hadoopcmd")
Example #2
    def test_extra_serialized_field(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)
        with dag:
            QuboleOperator(
                task_id=TASK_ID,
                command_type='shellcmd',
                qubole_conn_id=TEST_CONN,
            )

        serialized_dag = SerializedDAG.to_dict(dag)
        self.assertIn("qubole_conn_id", serialized_dag["dag"]["tasks"][0])

        dag = SerializedDAG.from_dict(serialized_dag)
        simple_task = dag.task_dict[TASK_ID]
        self.assertEqual(getattr(simple_task, "qubole_conn_id"), TEST_CONN)

        #########################################################
        # Verify Operator Links work with Serialized Operator
        #########################################################
        self.assertIsInstance(
            list(simple_task.operator_extra_links)[0], QDSLink)

        ti = TaskInstance(task=simple_task, execution_date=DEFAULT_DATE)
        ti.xcom_push('qbol_cmd_id', 12345)

        # check for positive case
        url = simple_task.get_extra_links(DEFAULT_DATE, 'Go to QDS')
        self.assertEqual(url, 'http://localhost/v2/analyze?command_id=12345')

        # check for negative case
        url2 = simple_task.get_extra_links(datetime(2017, 1, 2), 'Go to QDS')
        self.assertEqual(url2, '')
Example #3
    def test_hyphen_args_note_id(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='sparkcmd', note_id="123", dag=dag)

        self.assertEqual(task.get_hook().create_cmd_args({'run_id': 'dummy'})[0], "--note-id=123")
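        # As asserted above, create_cmd_args() converts underscores in hyphen-style keyword
        # arguments to hyphens, so note_id="123" becomes the flag --note-id=123.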
Example #4
    def test_get_hook(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, command_type='hivecmd', dag=dag)

        hook = task.get_hook()
        self.assertEqual(hook.__class__, QuboleHook)
    def test_init_with_template_connection(self):
        with DAG(DAG_ID, start_date=DEFAULT_DATE):
            task = QuboleOperator(task_id=TASK_ID,
                                  qubole_conn_id="{{ qubole_conn_id }}")

        task.render_template_fields({'qubole_conn_id': TEMPLATE_CONN})
        self.assertEqual(task.task_id, TASK_ID)
        self.assertEqual(task.qubole_conn_id, TEMPLATE_CONN)
    def test_init_with_template_cluster_label(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)
        task = QuboleOperator(task_id=TASK_ID,
                              dag=dag,
                              cluster_label='{{ params.cluster_label }}',
                              params={'cluster_label': 'default'})

        ti = TaskInstance(task, DEFAULT_DATE)
        ti.render_templates()

        self.assertEqual(task.cluster_label, 'default')
Example #7
    def test_init_with_template_connection(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID, dag=dag,
                                  qubole_conn_id="{{ dag_run.conf['qubole_conn_id'] }}")

        result = task.render_template('qubole_conn_id', "{{ qubole_conn_id }}",
                                      {'qubole_conn_id': TEMPLATE_CONN})
        self.assertEqual(task.task_id, TASK_ID)
        self.assertEqual(result, TEMPLATE_CONN)
    def test_position_args_parameters(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID,
                                  command_type='pigcmd',
                                  parameters="key1=value1 key2=value2",
                                  dag=dag)

        cmd_args = task.get_hook().create_cmd_args({'run_id': 'dummy'})
        self.assertEqual(cmd_args[1], "key1=value1")
        self.assertEqual(cmd_args[2], "key2=value2")
    def test_get_redirect_url(self):
        dag = DAG(DAG_ID, start_date=DEFAULT_DATE)

        with dag:
            task = QuboleOperator(task_id=TASK_ID,
                                  qubole_conn_id=TEST_CONN,
                                  command_type='shellcmd',
                                  parameters="param1 param2",
                                  dag=dag)

        ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
        ti.xcom_push('qbol_cmd_id', 12345)

        # check for positive case
        url = task.get_extra_links(DEFAULT_DATE, 'Go to QDS')
        self.assertEqual(url, 'http://localhost/v2/analyze?command_id=12345')

        # check for negative case
        url2 = task.get_extra_links(datetime(2017, 1, 2), 'Go to QDS')
        self.assertEqual(url2, '')
Example #10
# these args will get passed on to each operator
# you can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('QueryFile',
          default_args=default_args,
          description='A simple tutorial DAG',
          schedule_interval=timedelta(days=1))
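# A minimal sketch (hypothetical task_id and query) of the per-task override mentioned
# above: values passed directly to an operator take precedence over default_args, so this
# task would retry 3 times even though default_args sets 'retries' to 1.
t0 = QuboleOperator(
    task_id='hive_override_demo',  # hypothetical, for illustration only
    command_type='hivecmd',
    query='show tables',
    retries=3,  # overrides default_args['retries'] = 1 for this task only
    dag=dag)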

t1 = QuboleOperator(
    task_id='hive_inline',
    command_type='hivecmd',
    # query='show tables',
    script_location='s3://omulay-common/airflow-examples/query_script_path_example/sample_query.sql',
    cluster_label='default',
    tags='airflow_example_run',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to 'qubole_default'
    dag=dag)
Example #11
dag = DAG('ecommerce-airflow-demo',
          default_args=default_args,
          schedule_interval='@daily')

dag.doc_md = __doc__

# Task = start
start = DummyOperator(task_id='start', dag=dag)

# Task = cleanup (cleanup schema)
# start ---> cleanup (cleanup schema)
cleanup = QuboleOperator(
    task_id='hive_schema_cleanup',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_schema_cleanup.hql",
    cluster_label='hadoop2',
    tags='airflow_example_run',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to "qubole_default"
    dag=dag)
start.set_downstream(cleanup)

# Task = t1 (create schema)
# cleanup ---> t1 (create schemas)
t1 = QuboleOperator(
    task_id='hive_create_schema',
    command_type='hivecmd',
    script_location="s3n://uwddefbucket/scripts/ecommerce_create_schema.hql",
    cluster_label='hadoop2',
    tags='airflow_example_run',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
Example #12
        "parallelism": 1
    }, {
        "name": "products",
        "parallelism": 1
    }, {
        "name": "salesperson",
        "parallelism": 1
    }]
}

import_table_array = []

start = DummyOperator(task_id='start', dag=dag)

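# Create one DbImport task per table listed in the configuration; each task imports its
# table into Hive with the parallelism configured for that table.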
for table in conf['tables']:
    task_id = "db_import_%s" % (table['name'])
    import_table_array.append(
        QuboleOperator(task_id=task_id,
                       command_type='dbimportcmd',
                       mode=1,
                       db_name="",
                       hive_table=table['name'],
                       db_table=table['name'],
                       parallelism=table['parallelism'],
                       dbtap_id=Variable.get(conf['db_tap_variable']),
                       dag=dag))

end = DummyOperator(task_id='end', dag=dag)

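# Fan-out/fan-in: 'start' runs first, the db_import_* tasks run in parallel, and 'end'
# runs once every import task has finished successfully.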
start >> import_table_array >> end
    def test_init_with_default_connection(self):
        op = QuboleOperator(task_id=TASK_ID)
        self.assertEqual(op.task_id, TASK_ID)
        self.assertEqual(op.qubole_conn_id, DEFAULT_CONN)
        :return: True if the files are the same, False otherwise.
        :rtype: bool
        """
        ti = kwargs['ti']
        qubole_result_1 = t1.get_results(ti)
        qubole_result_2 = t2.get_results(ti)
        return filecmp.cmp(qubole_result_1, qubole_result_2)

    t1 = QuboleOperator(
        task_id='hive_show_table',
        command_type='hivecmd',
        query='show tables',
        cluster_label='{{ params.cluster_label }}',
        fetch_logs=True,
        # If `fetch_logs`=true, will fetch qubole command logs and concatenate
        # them into corresponding airflow task logs
        tags='airflow_example_run',
        # To attach tags to qubole command, auto attach 3 tags - dag_id, task_id, run_id
        qubole_conn_id='qubole_default',
        # Connection id to submit commands inside QDS, if not set "qubole_default" is used
        params={
            'cluster_label': 'default',
        })

    t2 = QuboleOperator(
        task_id='hive_s3_location',
        command_type="hivecmd",
        script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
        notify=True,
        tags=['tag1', 'tag2'],
Example #15
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': seven_days_ago,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_static_data', default_args=default_args, schedule_interval='@once')

t1 = QuboleOperator(
    task_id='create_page_table',
    command_type='hivecmd',
    query="DROP TABLE if exists page;\
          CREATE EXTERNAL TABLE page (page_id BIGINT, page_latest BIGINT, page_title STRING)\
          ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
          LOCATION 's3n://paid-qubole/default-datasets/demotrends/page/';",
    dag=dag)

t2 = QuboleOperator(
    task_id='create_redirect_table',
    command_type='hivecmd',
    query="DROP TABLE if exists redirect;\
          CREATE EXTERNAL TABLE redirect( rd_from BIGINT, page_title STRING) \
          ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' \
          LOCATION 's3n://paid-qubole/default-datasets/demotrends/redirect/';",
    dag=dag)

join = DummyOperator(
    task_id='join',
Example #16
DAG_DEFAULTS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('import_and_analyze_movies', default_args=DAG_DEFAULTS)

start = DummyOperator(task_id='start', dag=dag)

import_from_web = QuboleOperator(task_id='import_web_data',
                                 command_type="shellcmd",
                                 script=web_import_script,
                                 dag=dag)

make_hive_table = QuboleOperator(
    task_id='create_movies_table',
    command_type='hivecmd',
    query=create_hive_table,
    cluster_label='hive_cluster',
    tags='create_movies_table',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to 'qubole_default'
    dag=dag)

count_movies_table = QuboleOperator(
    task_id='count_movies_table',
Example #17
dag = DAG('import_from_mysql', default_args=DAG_DEFAULTS)

# Read the configuration file
with open(dag.folder + "/conf.json") as conf_file:
    conf = json.load(conf_file)

start = DummyOperator(task_id='start', dag=dag)

import_table_array = []

for table in conf['tables']:
    import_table_array.append(
        QuboleOperator(task_id='db_import_%s' % table['name'],  # unique task_id per table
                       command_type='dbimportcmd',
                       mode=1,
                       hive_table=table['name'],
                       db_table=table['name'],
                       where_clause='id < 10',
                       parallelism=2,
                       dbtap_id=Variable.get("MYSQL_DBTAP_ID"),
                       dag=dag))

check_variable_exists = BranchPythonOperator(
    task_id='check_variable_exists',
    python_callable=variable_exists,
    op_kwargs={"key": conf['db_tap_variable']},
    trigger_rule='all_success',  # trigger_rule expects a rule-name string (e.g. 'all_success'), not a bool
    dag=dag)

email_missing_variable = EmailOperator(
    task_id='email_missing_variable',
    to="*****@*****.**",
Example #18
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('example_qubole_operator', default_args=default_args)

def compare_result(ds, **kwargs):
    """Fetch the result files of t1 and t2 and return True when their contents match."""
    ti = kwargs['ti']
    r1 = t1.get_results(ti)
    r2 = t2.get_results(ti)
    return filecmp.cmp(r1, r2)

t1 = QuboleOperator(
    task_id='hive_show_table',
    command_type='hivecmd',
    query='show tables',
    cluster_label='default',
    fetch_logs=True,
    tags='airflow_example_run',
    dag=dag)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://dev.canopydata.com/airflow/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
Example #19
def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


python_task = PythonOperator(task_id='python_task',
                             provide_context=True,
                             python_callable=print_context,
                             dag=dag)

qubole_task = QuboleOperator(
    task_id='qubole_task',
    command_type='shellcmd',
    script='ls /usr/lib/airflow',
    cluster_label='airflow-demo',
    fetch_logs=True,  # If True, fetch Qubole command logs and append them to the corresponding Airflow task log
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to "qubole_default"
    dag=dag)

bash_task = BashOperator(
    task_id='bash_task',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='http_default',
                              endpoint='',
                              request_params={},
                              response_check=lambda response: True
Example #20
    'start_date': datetime(2016, 8, 1),
    'end_date': datetime(2016, 8, 5),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False
}

dag = DAG('process_daily_data', default_args=default_args, schedule_interval='@daily')

def_loc = 's3://BUCKET/DEF_LOC'


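# Note on the shell script in t1 below: Jinja expressions such as {{ ds }} and
# {{ macros.ds_format(...) }} are rendered by Airflow, while the trailing "% (def_loc)"
# applies old-style Python string formatting. The strftime patterns therefore use doubled
# percent signs (%%Y, %%m, %%d) so that literal percent characters survive the %-formatting step.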
t1 = QuboleOperator(
    task_id='fetch_pagecount_data',
    command_type="shellcmd",
    script="wget -r https://dumps.wikimedia.org/other/pagecounts-raw/{{ ds.split('-')[0] }}/{{ '{0}-{1}'.format(ds.split('-')[0], ds.split('-')[1]) }} \
            -P pagecounts/{{ ds }} -A pagecounts-{{ macros.ds_format(ds, '%%Y-%%m-%%d', '%%Y%%m%%d') }}-*.gz; \
            s3cmd -c /usr/lib/hustler/s3cfg sync pagecounts/{{ ds }} %s/wikitrends/pagecounts/"%(def_loc),
    dag=dag)

t2 = QuboleOperator(
    task_id='create_pagecount_table',
    command_type="hivecmd",
    query= "CREATE EXTERNAL TABLE IF NOT EXISTS pagecounts (`group` STRING, page_title STRING, `views` BIGINT, bytes_sent BIGINT) \
            PARTITIONED BY (`date` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' LOCATION '{0}/demotrends/pagecounts/'; \
            ALTER TABLE pagecounts recover partitions;".
        format(def_loc),
    dag=dag)

t3 = QuboleOperator(
    task_id='create_filtered_pagecounts_table',
Example #21
object SparkPi {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Spark Pi")
    val spark = new SparkContext(conf)
    val slices = if (args.length > 0) args(0).toInt else 2
    val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
    val count = spark.parallelize(1 until n, slices).map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    println("Pi is roughly " + 4.0 * count / n)
    spark.stop()
  }
}
'''

dag = DAG('SampleDag',
          default_args=default_args,
          description='A simple tutorial DAG jmsample',
          schedule_interval=timedelta(days=1))

# Spark Command - Scala Program
QuboleOperator(task_id='spark_cmd',
               command_type="sparkcmd",
               program=prog,
               language='scala',
               arguments='--class SparkPi',
               cluster_label='spark-baseline',
               qubole_conn_id='qubole_default',
               dag=dag)
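# Note: the '--class SparkPi' argument above must match the object name defined in the
# embedded Scala program ('object SparkPi').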

default_args = {
    'owner': 'REPLACE',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'email': ['REPLACE'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Workshop-ML-Model-REPLACE', default_args=default_args)
start = DummyOperator(task_id='start', dag=dag)

ingestData = QuboleOperator(task_id='ingestData',
                            command_type="sparkcmd",
                            note_id="1271",
                            qubole_conn_id='qubole_default',
                            dag=dag)

analyze_data = QuboleOperator(task_id='analyze_data',
                              command_type="sparkcmd",
                              note_id="1274",
                              qubole_conn_id='qubole_default',
                              dag=dag)

# Spark Command - Run a Notebook
build_dashboards = QuboleOperator(task_id='build_dashboards',
                                  command_type="sparkcmd",
                                  note_id="1273",
                                  qubole_conn_id='qubole_default',
                                  dag=dag)
Example #23
qubole_query = """SELECT
    p.product_name AS Product
    ,COUNT (oi.order_item_quantity) AS QuantityOrdered
  FROM
    ecommerce.order_items oi INNER JOIN ecommerce.products p
      ON oi.order_item_product_id = p.product_id
  GROUP BY
    p.product_name
  ORDER BY
    QuantityOrdered DESC LIMIT 10"""

qubole_task = QuboleOperator(
    task_id='qubole_task',
    command_type='prestocmd',
    query=qubole_query,
    cluster_label='presto-for-airflow',
    fetch_logs=True,  # If True, fetch Qubole command logs and append them to the corresponding Airflow task log
    tags='airflow_example_run',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to 'qubole_default'
    dag=dag)

bash_task = BashOperator(
    task_id='bash_task',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    pool='test',
    dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='http_default',
                              endpoint='',
Example #24
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2018, 5, 6),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('qubole_example',
          default_args=default_args,
          schedule_interval=timedelta(hours=1),
          catchup=False)

# Running Example Qubole Query
t1 = QuboleOperator(
    task_id='hive_example',
    command_type='hivecmd',
    script_location="s3://sovrn-datascience/qubole_scripts/test_query.hql",
    # 'macros' is one of QuboleOperator's templated fields, so the Jinja expression below is
    # rendered before the command is submitted to QDS.
    macros='[{"dt": "{{ execution_date.strftime("%Y%m%d%H") }}"}]',
    cluster_label='default',
    fetch_logs=True,  # If True, fetch Qubole command logs and append them to the corresponding Airflow task log
    tags='airflow_example_run',  # Attach tags to the Qubole command; dag_id, task_id and run_id are attached automatically
    qubole_conn_id='qubole_default',  # Connection ID used to submit commands inside QDS; defaults to 'qubole_default'
    dag=dag)