Example #1
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from datacanvas.new_runtime import DataCanvas
dc = DataCanvas(__name__)

def write_main_hql(params):
    """Write ``main.hql``: a Hive script that materializes the input table
    into a new table ordered by the configured columns.

    Args:
        params: module parameter object; reads ``params.limit``, an optional
            row-limit string (may be ``None`` or blank).

    The script is always written; the ``limit`` clause is appended only when
    ``params.limit`` holds a non-blank value.
    """
    limit_string = ""
    if params.limit is not None and params.limit.strip(' ') != '':
        limit_string = "limit " + params.limit
    # BUG FIX: previously this `with` block was nested inside the `if`, so
    # main.hql was only written when a limit was supplied; with an empty
    # limit the later execute_hive_filename("main.hql") ran against a
    # missing (or stale) file. The script must be written unconditionally.
    with open("main.hql", "w") as file:
        file.write(
"""
DROP TABLE IF EXISTS ${OUTPUT_ordered_table};

CREATE TABLE ${OUTPUT_ordered_table} AS
SELECT *
FROM ${INPUT_from_table}
ORDER BY ${PARAM_order_by_columns} %s 
;
 
""" % limit_string
)

@dc.hadoop_runtime(spec_json="spec.json")
def mymodule(rt, params, inputs, outputs):
    """Module entry point: generate the ordering Hive script from the
    module parameters, then execute it on the cluster runtime."""
    script_name = "main.hql"
    # Render the parameterized HQL to disk before handing it to Hive.
    write_main_hql(params)
    rt.execute_hive_filename(script_name)
Example #2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datacanvas.new_runtime import GenericHadoopRuntime, DataCanvas
from datacanvas.io_types import DS_HDFS
import shlex


dc = DataCanvas(__name__)


@dc.hadoop_runtime(spec_json="spec.json")
def mymodule(rt, params, inputs, outputs):

    if rt.hadoop_type not in ["CDH4", "CDH5"]:
        raise Exception("Do not support cluster type '%s'" % rt.cluster_type)
    
    # Check params
    if not params.connect_string.val.strip():
        raise ValueError("Param 'connect_string' should not be empty")

    if not params.table.val.strip():
        raise ValueError("Param 'table' should not be empty")

    # Build params for sqoop command
    additional_params = shlex.split(params.additional_params)
    sqoop_cmd = ["sqoop", "import",
                 "--connect", params.connect_string.val,
                 "--username", params.username.val,
                 "--password", params.password.val,
                 "--table", params.table.val]