Example 1
 def execute(self, context):
     # Reinitializing the hook, as some template fields might have changed
     self.hook = QuboleHook(*self.args, **self.kwargs)
     return self.hook.execute(context)
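The hook is rebuilt here because Airflow renders template_fields between DAG parsing and task execution, so the QuboleHook created in __init__ still holds the raw, unrendered values. A minimal sketch of the effect, assuming the QuboleOperator class shown in full in Example 2 and an existing airflow.DAG instance named dag (the task id and query are hypothetical):

# 'query' is a template field: Airflow renders it in place before
# execute(context) runs, which is why execute() rebuilds the hook.
run_query = QuboleOperator(
    task_id='hive_daily_logs',                         # hypothetical task id
    command_type='hivecmd',
    query="SELECT * FROM logs WHERE dt = '{{ ds }}'",  # rendered at runtime
    cluster_label='default',
    dag=dag,
)
# At parse time self.kwargs['query'] still contains '{{ ds }}'; by the time
# execute(context) runs it holds the concrete execution date.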
Example 2
# Imports as in the Airflow 1.x contrib tree (assumed for this snippet)
from airflow.contrib.hooks.qubole_hook import QuboleHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class QuboleOperator(BaseOperator):
    """
    Executes commands on Qubole (https://qubole.com).

    mandatory:
        :param command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
    other:
        hivecmd:
            :param query: inline query statement
            :param script_location: s3 location containing query statement
            :param sample_size: size of sample in bytes on which to run the query
            :param macros: macro values used in the query
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        prestocmd:
            :param query: inline query statement
            :param script_location: s3 location containing query statement
            :param macros: macro values used in the query
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        hadoopcmd:
            :param sub_command: must be one of ["jar", "s3distcp", "streaming"], followed by 1 or more args
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        shellcmd:
            :param script: inline command with args
            :param script_location: s3 location containing query statement
            :param files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working
                          directory where the qubole command is being executed.
            :param archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into
                             the working directory where the qubole command is being executed.
            :param parameters: any extra args which need to be passed to script (only when script_location is supplied)
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        pigcmd:
            :param script: inline query statement (latin_statements)
            :param script_location: s3 location containing pig query
            :param parameters: any extra args which need to be passed to script (only when script_location is supplied)
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        dbtapquerycmd:
            :param db_tap_id: data store ID of the target database in Qubole
            :param query: inline query statement
            :param macros: macro values used in the query
            :param tags: array of tags to be assigned to the command
            :param name: name to be given to command
        sparkcmd:
            :param program: the complete Spark program in Scala, SQL, Command, R, or Python
            :param cmdline: spark-submit command line; all required information must be specified in cmdline itself
            :param sql: inline sql query
            :param script_location: s3 location containing query statement
            :param language: language of the program: Scala, SQL, Command, R, or Python
            :param app_id: ID of a Spark job server app
            :param arguments: spark-submit command line arguments
            :param user_program_arguments: arguments that the user program takes in
            :param macros: macro values used in the query
            :param tags: array of tags to be assigned to the command
            :param cluster_label: cluster label on which to execute command
            :param name: name to be given to command
        dbexportcmd:
            :param mode: 1 (simple), 2 (advanced)
            :param hive_table: name of the hive table
            :param partition_spec: partition specification for the Hive table
            :param dbtap_id: data store ID of the target database in Qubole
            :param db_table: name of the db table
            :param db_update_mode: allowinsert or updateonly
            :param db_update_keys: columns used to determine the uniqueness of rows
            :param export_dir: HDFS/S3 location from which data will be exported
            :param fields_terminated_by: hex of the char used as column separator in the dataset
            :param tags: array of tags to be assigned to the command
            :param name: name to be given to command
        dbimportcmd:
            :param mode: 1 (simple), 2 (advanced)
            :param hive_table: name of the hive table
            :param dbtap_id: data store ID of the target database in Qubole
            :param db_table: name of the db table
            :param where_clause: where clause, if any
            :param parallelism: number of parallel db connections to use for extracting data
            :param extract_query: SQL query to extract data from the db; $CONDITIONS must be part of the where clause
            :param boundary_query: query used to get the range of row IDs to be extracted
            :param split_column: column used as row ID to split data into ranges (mode 2)
            :param tags: array of tags to be assigned to the command
            :param name: name to be given to command

    """

    template_fields = ('query', 'script_location', 'sub_command', 'script',
                       'files', 'archives', 'program', 'cmdline', 'sql',
                       'where_clause', 'extract_query', 'boundary_query',
                       'macros', 'tags', 'name')
    template_ext = ('.hql', '.sql', '.sh', '.bash', '.pig')
    ui_color = '#3064A1'
    ui_fgcolor = '#fff'

    @apply_defaults
    def __init__(self, qubole_conn_id="qubole_default", *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.kwargs['qubole_conn_id'] = qubole_conn_id
        self.hook = QuboleHook(*self.args, **self.kwargs)
        super(QuboleOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        # Reinitializing the hook, as some template fields might have changed
        self.hook = QuboleHook(*self.args, **self.kwargs)
        return self.hook.execute(context)

    def on_kill(self, ti):
        self.hook.kill(ti)

    def get_results(self,
                    ti=None,
                    fp=None,
                    inline=True,
                    delim=None,
                    fetch=True):
        return self.hook.get_results(ti, fp, inline, delim, fetch)

    def get_log(self, ti):
        return self.hook.get_log(ti)

    def get_jobs_id(self, ti):
        return self.hook.get_jobs_id(ti)

    def __getattribute__(self, name):
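        # Template fields live in self.kwargs (set in __init__), not in
        # instance attributes, so reads are redirected there.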
        if name in QuboleOperator.template_fields:
            if name in self.kwargs:
                return self.kwargs[name]
            else:
                return ''
        else:
            return object.__getattribute__(self, name)

    def __setattr__(self, name, value):
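        # Writes to template fields go into self.kwargs, so rendered template
        # values are visible when execute() rebuilds the hook.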
        if name in QuboleOperator.template_fields:
            self.kwargs[name] = value
        else:
            object.__setattr__(self, name, value)
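A minimal usage sketch for this operator, assuming a standard Airflow 1.x DAG and a 'qubole_default' connection holding a valid Qubole auth token; the task ids and s3 path are hypothetical:

from datetime import datetime

from airflow import DAG

dag = DAG('qubole_demo', start_date=datetime(2017, 1, 1), schedule_interval=None)

hive_task = QuboleOperator(
    task_id='hive_show_tables',                        # hypothetical task id
    command_type='hivecmd',
    query='show tables',
    cluster_label='default',
    dag=dag,
)

shell_task = QuboleOperator(
    task_id='shell_from_s3',                           # hypothetical task id
    command_type='shellcmd',
    script_location='s3://my-bucket/scripts/run.sh',   # hypothetical path
    dag=dag,
)

hive_task >> shell_task  # run the shell command after the hive query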
Example 3
 def __init__(self, qubole_conn_id="qubole_default", *args, **kwargs):
     self.args = args
     self.kwargs = kwargs
     self.kwargs['qubole_conn_id'] = qubole_conn_id
     self.hook = QuboleHook(*self.args, **self.kwargs)
     super(QuboleOperator, self).__init__(*args, **kwargs)
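The constructor stashes every keyword argument in self.kwargs and injects the connection id before building the hook, so pointing a task at a different Qubole account only requires passing another connection id. A sketch under that assumption (the task and connection ids are hypothetical, and dag is an existing airflow.DAG instance):

# Uses the Airflow connection 'qubole_prod' instead of 'qubole_default';
# __init__ copies it into self.kwargs, where QuboleHook picks it up.
task = QuboleOperator(
    task_id='prod_event_count',                        # hypothetical task id
    command_type='prestocmd',
    query='select count(*) from events',
    qubole_conn_id='qubole_prod',                      # hypothetical connection id
    dag=dag,
)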
Example 4
# Imports as in the Airflow 1.x contrib tree (assumed for this snippet)
from airflow.contrib.hooks.qubole_hook import QuboleHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class QuboleOperator(BaseOperator):
    """
    Execute tasks (commands) on QDS (https://qubole.com).

    :param qubole_conn_id: Connection id that holds the qds auth_token
    :type qubole_conn_id: str

    kwargs:
        :command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
        :tags: array of tags to be assigned to the command
        :cluster_label: cluster label on which the command will be executed
        :name: name to be given to command

        **Arguments specific to command types**

        hivecmd:
            :query: inline query statement
            :script_location: s3 location containing query statement
            :sample_size: size of sample in bytes on which to run the query
            :macros: macro values used in the query
        prestocmd:
            :query: inline query statement
            :script_location: s3 location containing query statement
            :macros: macro values used in the query
        hadoopcmd:
            :sub_command: must be one of ["jar", "s3distcp", "streaming"], followed by 1 or more args
        shellcmd:
            :script: inline command with args
            :script_location: s3 location containing query statement
            :files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working directory where the qubole command is being executed.
            :archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into the working directory where the qubole command is being executed.
            :parameters: any extra args which need to be passed to script (only when script_location is supplied)
        pigcmd:
            :script: inline query statement (latin_statements)
            :script_location: s3 location containing pig query
            :parameters: any extra args which need to be passed to script (only when script_location is supplied)
        sparkcmd:
            :program: the complete Spark program in Scala, SQL, Command, R, or Python
            :cmdline: spark-submit command line; all required information must be specified in cmdline itself
            :sql: inline sql query
            :script_location: s3 location containing query statement
            :language: language of the program: Scala, SQL, Command, R, or Python
            :app_id: ID of a Spark job server app
            :arguments: spark-submit command line arguments
            :user_program_arguments: arguments that the user program takes in
            :macros: macro values used in the query
        dbtapquerycmd:
            :db_tap_id: data store ID of the target database in Qubole
            :query: inline query statement
            :macros: macro values used in the query
        dbexportcmd:
            :mode: 1 (simple), 2 (advanced)
            :hive_table: name of the hive table
            :partition_spec: partition specification for the Hive table
            :dbtap_id: data store ID of the target database in Qubole
            :db_table: name of the db table
            :db_update_mode: allowinsert or updateonly
            :db_update_keys: columns used to determine the uniqueness of rows
            :export_dir: HDFS/S3 location from which data will be exported
            :fields_terminated_by: hex of the char used as column separator in the dataset
        dbimportcmd:
            :mode: 1 (simple), 2 (advanced)
            :hive_table: name of the hive table
            :dbtap_id: data store ID of the target database in Qubole
            :db_table: name of the db table
            :where_clause: where clause, if any
            :parallelism: number of parallel db connections to use for extracting data
            :extract_query: SQL query to extract data from the db; $CONDITIONS must be part of the where clause
            :boundary_query: query used to get the range of row IDs to be extracted
            :split_column: column used as row ID to split data into ranges (mode 2)

    .. note:: The following fields are template-supported: ``query``, ``script_location``, ``sub_command``, ``script``, ``files``,
        ``archives``, ``program``, ``cmdline``, ``sql``, ``where_clause``, ``extract_query``, ``boundary_query``, ``macros``, ``tags``,
        ``name``, ``parameters``. You can also use ``.txt`` files for template-driven use cases.
    """

    template_fields = (
        "query",
        "script_location",
        "sub_command",
        "script",
        "files",
        "archives",
        "program",
        "cmdline",
        "sql",
        "where_clause",
        "extract_query",
        "boundary_query",
        "macros",
        "tags",
        "name",
        "parameters",
    )
    template_ext = (".txt",)  # a tuple, so BaseOperator can iterate over extensions
    ui_color = "#3064A1"
    ui_fgcolor = "#fff"

    @apply_defaults
    def __init__(self, qubole_conn_id="qubole_default", *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.kwargs["qubole_conn_id"] = qubole_conn_id
        self.hook = QuboleHook(*self.args, **self.kwargs)
        super(QuboleOperator, self).__init__(*args, **kwargs)

    def execute(self, context):
        # Reinitializing the hook, as some template fields might have changed
        self.hook = QuboleHook(*self.args, **self.kwargs)
        return self.hook.execute(context)

    def on_kill(self, ti):
        self.hook.kill(ti)

    def get_results(self, ti=None, fp=None, inline=True, delim=None, fetch=True):
        return self.hook.get_results(ti, fp, inline, delim, fetch)

    def get_log(self, ti):
        return self.hook.get_log(ti)

    def get_jobs_id(self, ti):
        return self.hook.get_jobs_id(ti)

    def __getattribute__(self, name):
        if name in QuboleOperator.template_fields:
            if name in self.kwargs:
                return self.kwargs[name]
            else:
                return ""
        else:
            return object.__getattribute__(self, name)

    def __setattr__(self, name, value):
        if name in QuboleOperator.template_fields:
            self.kwargs[name] = value
        else:
            object.__setattr__(self, name, value)
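Because __getattribute__ and __setattr__ route every template field through self.kwargs instead of instance attributes, Airflow's templating engine reads and writes rendered values exactly where execute() will look for them. A small sketch of that behavior (the operator instance is hypothetical, and dag is an existing airflow.DAG instance):

op = QuboleOperator(task_id='attr_demo', command_type='hivecmd',
                    query='select 1', dag=dag)  # hypothetical task
print(op.query)        # 'select 1' -- read from op.kwargs via __getattribute__
print(op.sql)          # ''         -- template field that was never set
op.query = 'select 2'  # stored into op.kwargs['query'], so the hook
                       # rebuilt in execute() sees the new value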
Example 5
 def execute(self, context):
     self.hook = QuboleHook(*self.args, **self.kwargs)
     return self.hook.execute(context)