Code Example #1
    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config

        input_dataset_name = config.get('input_dataset')
        input_cols_map = config.get('input_cols_map')
        if not input_cols_map:
            raise ValueError('List of columns to modify is empty')

        self.dssh = dataiku.api_client()
        self.project = self.dssh.get_project(project_key)

        dataset_list = [d['name'] for d in self.project.list_datasets()]
        print('dataset list: %s' % dataset_list)
        if input_dataset_name not in dataset_list:
            raise ValueError(
                'Input dataset {} not found in project datasets {}'.format(
                    input_dataset_name, dataset_list))

        self.input_dataset = self.project.get_dataset(input_dataset_name)
        self.input_dataset_schema = self.input_dataset.get_schema()
        print('Retrieved DSS schema for input dataset {}'.format(
            input_dataset_name))
        pprint.pprint(self.input_dataset_schema)

        eligible_columns = [
            c['name'] for c in self.input_dataset_schema['columns']
            if c['type'] == 'string'
        ]
        print('Eligible columns (with string type only): %s' %
              eligible_columns)
        for col_key in input_cols_map.keys():
            if col_key not in eligible_columns:
                raise ValueError(
                    'Column to modify {} is either not present or not of string type'
                    .format(col_key))

        # Basic sanity checks done; store the validated values on the instance
        self.input_cols_map = input_cols_map
        self.query_ds = dataiku.Dataset(input_dataset_name,
                                        project_key=self.project_key)
        self.hive_executor = HiveExecutor(dataset=self.query_ds)
        self.input_dataset_name = input_dataset_name
Code Example #2
class dialectHandler():
    def __init__(self, dataset):
        self.dataset = dataset
        # HDFS-backed datasets expose a 'hiveTableName' entry in their params;
        # this is used to pick between the Hive and generic SQL executors
        self.is_hdfs = 'hiveTableName' in dataset.get_config().get(
            'params').keys()
        self.executor = HiveExecutor(
            dataset=dataset) if self.is_hdfs else SQLExecutor2(dataset=dataset)

    def convertToSQL(self, sql_object):
        if self.is_hdfs:
            return toSQL(sql_object, dialect='Hive')
        else:
            return toSQL(sql_object, dataset=self.dataset)

    def get_executor(self):
        return self.executor

    def execute_in_database(self, query, output_dataset=None):
        if self.is_hdfs:
            self.executor.exec_recipe_fragment(query)
        else:
            self.executor.exec_recipe_fragment(output_dataset=output_dataset,
                                               query=query)
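
A minimal usage sketch for this wrapper (the dataset name "transactions" and the query are illustrative assumptions, not part of the original example):

import dataiku

# Hypothetical dataset; any SQL- or HDFS-backed dataset of the project would do
dataset = dataiku.Dataset("transactions")
handler = dialectHandler(dataset)
print("Backed by HDFS/Hive: %s" % handler.is_hdfs)

# query_to_df() is available on both HiveExecutor and SQLExecutor2;
# the table reference here is illustrative and would normally be resolved
# from the dataset's connection settings
df = handler.get_executor().query_to_df("SELECT COUNT(*) AS n FROM transactions")
print(df)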
Code Example #3
    def __init__(self, dataset):
        self.dataset = dataset
        self.is_hdfs = 'hiveTableName' in dataset.get_config().get(
            'params').keys()
        self.executor = HiveExecutor(
            dataset=dataset) if self.is_hdfs else SQLExecutor2(dataset=dataset)
Code Example #4
is_hivelike = False
if dataset_config["type"] in SUPPORTED_DB:
    q = '"'
    sqlexec = SQLExecutor2(dataset=dataset)
    logging.info("Dataset config: %s" % dataset_config)
    #table = dataset_config["params"].get("table", dataset.short_name)
    table = '_'.join(dataset.name.split('.'))
elif dataset_config["type"] == "HDFS":
    q = '`'
    if use_impala and compute_distinct:
        raise ValueError("Cannot compute distinct values on Impala")
    if "IMPALA" in dss_settings["features"] and use_impala:
        sqlexec = ImpalaExecutor(dataset=dataset)
    else:
        sqlexec = HiveExecutor(dataset=dataset)
    is_hivelike = True
    table = dataset.short_name
else:
    raise Exception("Unsupported input dataset type: %s" %
                    dataset_config["type"])

logging.info("Using executor of kind : %s" % sqlexec)

# Generate a single query for all numerical columns
# And also in same query: string + num columns: one pass for cardinality and nmissing
chunks = []
for col in num_columns:
    chunks.append("MIN(%s%s%s) as %s%s_min%s" % (q, col, q, q, col, q))
    chunks.append("MAX(%s%s%s) as %s%s_max%s" % (q, col, q, q, col, q))
    chunks.append("AVG(%s%s%s) as %s%s_avg%s" % (q, col, q, q, col, q))
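
The chunks above only hold the aggregate expressions; a possible continuation (the assembled FROM clause and the query_to_df() call are assumptions, not part of the original recipe) looks like this:

# Sketch: join the aggregate expressions into a single SELECT and run it
# with the executor picked above; table-name quoting reuses the q character
query = "SELECT %s FROM %s%s%s" % (", ".join(chunks), q, table, q)
logging.info("Running aggregate query: %s" % query)
agg_df = sqlexec.query_to_df(query)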
Code Example #5
import dataiku
from dataiku.core.sql import HiveExecutor

he = HiveExecutor(database="default")
client = dataiku.api_client()

databases_list = he.query_to_df("show databases")["database_name"].values
print("Databases: %s" % databases_list)

for database in databases_list:
    print("Starting to index %s" % database)
    client.catalog_index_connections(["@virtual(hive-jdbc):%s" % database])
    print("Done indexing %s" % database)
Code Example #6
import json
import pprint

import dataiku
from dataiku.core.sql import HiveExecutor
from dataiku.runnables import Runnable


class MyRunnable(Runnable):
    """The base interface for a Python runnable"""

    input_dataset_name = None
    input_cols_map = None
    query_ds = None  #dataiku.Dataset() object
    hive_executor = None

    def __init__(self, project_key, config, plugin_config):
        """
        :param project_key: the project in which the runnable executes
        :param config: the dict of the configuration of the object
        :param plugin_config: contains the plugin settings
        """
        self.project_key = project_key
        self.config = config
        self.plugin_config = plugin_config

        input_dataset_name = config.get('input_dataset')
        input_cols_map = config.get('input_cols_map')
        if not input_cols_map:
            raise ValueError('List of columns to modify is empty')

        self.dssh = dataiku.api_client()
        self.project = self.dssh.get_project(project_key)

        dataset_list = [d['name'] for d in self.project.list_datasets()]
        print('dataset list: %s' % dataset_list)
        if input_dataset_name not in dataset_list:
            raise ValueError(
                'Input dataset {} not found in project datasets {}'.format(
                    input_dataset_name, dataset_list))

        self.input_dataset = self.project.get_dataset(input_dataset_name)
        self.input_dataset_schema = self.input_dataset.get_schema()
        print('Retrieved DSS schema for input dataset {}'.format(
            input_dataset_name))
        pprint.pprint(self.input_dataset_schema)

        eligible_columns = [
            c['name'] for c in self.input_dataset_schema['columns']
            if c['type'] == 'string'
        ]
        print('Eligible columns (with string type only): %s' %
              eligible_columns)
        for col_key in input_cols_map.keys():
            if col_key not in eligible_columns:
                raise ValueError(
                    'Column to modify {} is either not present or not of string type'
                    .format(col_key))

        # Basic sanity checks done; store the validated values on the instance
        self.input_cols_map = input_cols_map
        self.query_ds = dataiku.Dataset(input_dataset_name,
                                        project_key=self.project_key)
        self.hive_executor = HiveExecutor(dataset=self.query_ds)
        self.input_dataset_name = input_dataset_name

    def get_progress_target(self):
        """
        If the runnable will return some progress info, have this function return a tuple of 
        (target, unit) where unit is one of: SIZE, FILES, RECORDS, NONE
        """
        return None

    def get_hive_schema(self, pre_queries=None):
        describe_query = 'DESCRIBE {}'.format(self.input_dataset_name)
        print('Executing query {}'.format(describe_query))
        res_iter = self.hive_executor.query_to_iter(describe_query,
                                                    pre_queries=pre_queries)

        return [l for l in res_iter.iter_tuples()]

    def run(self, progress_callback):
        """
        Do stuff here. Can return a string or raise an exception.
        The progress_callback is a function expecting 1 value: current progress
        """
        prev_sch = self.get_hive_schema()

        alter_query_fmt = "ALTER TABLE {table} CHANGE COLUMN {colname} {colname} VARCHAR({colsize})"

        pre_queries = []
        for col_key in self.input_cols_map.keys():
            alter_query = alter_query_fmt.format(
                table=self.input_dataset_name,
                colname=col_key,
                colsize=self.input_cols_map[col_key])
            print('Appending to pre-queries query: {}'.format(alter_query))
            pre_queries.append(alter_query)

        #push ALTER statement to pre-queries of the DESCRIBE to avoid connecting multiple times
        new_sch = self.get_hive_schema(pre_queries=pre_queries)

        result = {'previous_schema': prev_sch, 'new_schema': new_sch}

        return '<pre>' + json.dumps(result) + '</pre>'
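
To make the ALTER pattern concrete, this is what the format string produces for an illustrative configuration (the table and column names are made up, not taken from the plugin):

# Illustrative values only
alter_query_fmt = "ALTER TABLE {table} CHANGE COLUMN {colname} {colname} VARCHAR({colsize})"
print(alter_query_fmt.format(table="reviews", colname="comment", colsize=500))
# -> ALTER TABLE reviews CHANGE COLUMN comment comment VARCHAR(500)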