Example #1
def get_completed_job(job_id):
    cmd = HiveCommand.find(job_id)
    print('Retrieving job:', job_id)
    if cmd.attributes.get('status', None) != 'done':
        raise ValueError('Job not yet completed')
    print('Job data successfully retrieved from Qubole')
    return cmd
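
For context, a minimal usage sketch (an addition, not part of the original example): qds-sdk requires Qubole.configure to run once per process before any find call, and the command id below is hypothetical.

import os

from qds_sdk.qubole import Qubole

# Authenticate the qds-sdk client before looking up commands.
Qubole.configure(api_token=os.environ['QDS_API_TOKEN'])

# 12345 is a hypothetical command id; get_completed_job raises ValueError
# until the job status is 'done'.
cmd = get_completed_job(12345)
print(cmd.attributes['status'])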
Example #2
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)

    template = Template("""
        CREATE TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.tco_table (
                    node_id BIGINT,
                    cluster_inst_id INT,
                    cluster_id INT,
                    instance_type STRING,
                    availability_zone STRING,
                    region STRING,
                    hour BIGINT,
                    price DOUBLE,
                    node_run_time INT,
                    approx_per_hour_price DOUBLE,
                    ondemand_price DOUBLE,
                    up_time STRING,
                    down_time STRING,
                    node_type STRING
                )
                PARTITIONED BY (account_id INT, event_date STRING)
                STORED AS ORC
                LOCATION '${defloc}/qubole_bi/tco_table/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
Example #3
    def drop_temp_hive_table(self):
        command = None
        drop_table_stmt = None
        try:
            drop_table_stmt = "DROP TABLE IF EXISTS %s_airflow_temp;" % self.table_name

            command = HiveCommand.run(query=drop_table_stmt, label='default')
            self.monitor_command(command, drop_table_stmt)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                    % (drop_table_stmt, e))
            else:
                raise AirflowException(
                    'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (drop_table_stmt, command.id, e))

        command = None
        try:
            stmt = "s3cmd -c /usr/lib/hustler/s3cfg rm -rf s3://%s/result_cache/%s_airflow_temp/;" % \
                   (self.s3_bucket, self.table_name)

            command = Command.run(command_type='ShellCommand',
                                  inline=stmt,
                                  label='default')
            self.monitor_command(command, stmt)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                    % (stmt, e))
            else:
                raise AirflowException(
                    'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (stmt, command.id, e))
Example #4
def downgrade(context):
    logging.debug("Execute downgrade of `%d`" % version)
    template = Template("""
        DROP TABLE IF EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2;
    """)
    Migrations.downgrade(
        migration_number=version,
        command=HiveCommand.run(query=template.render_unicode(
            env=context["env"], account_id=context['account_id'])))
Example #5
    def read_data_from_hive(self, query, cluster, verbose=False, **kwargs):
        print('Running Hive query')
        cmd = HiveCommand.create(query=query,
                                 print_logs_live=True,
                                 label=cluster,
                                 **kwargs)
        while cmd.attributes.get('status', None) != 'done':
            if verbose:
                cmd = self._get_logs(cmd)
            else:
                cmd = self._get_status(cmd)
        return cmd
Example #6
    def _get_logs(self, cmd):
        new_std_out = (cmd.get_log_partial()[0]).split('\n')
        for line in new_std_out[len(self.old_std_out):]:
            print(line)
        cmd = HiveCommand.find(cmd.id)
        self.old_std_out = new_std_out
        time.sleep(5)
        if cmd.attributes['status'] == 'error':
            raise RuntimeError('Job Failed')
        if cmd.attributes['status'] == 'cancelled':
            raise RuntimeError('Job Cancelled')
        return cmd
Example #7
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)

    template = Template("""
        CREATE DATABASE IF NOT EXISTS qubole_bi_${env}_${account_id};
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(query=template.render_unicode(
            env=context["env"], account_id=context['account_id'])))
    print(context)
Example #8
    def get_job_result(self, qubole_jid):
        """Finds and retrieves results for existing Qubole job.

        Args:
            qubole_jid: Qubole job id.

        Returns:
            Job stdout output.
        """
        self._configure_qubole()
        qubole_jid = str(qubole_jid)
        return self._get_qubole_command_output(HiveCommand.find(qubole_jid))
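
The _get_qubole_command_output helper is not shown in this example. A plausible sketch of such a helper (written here as a free function), assuming qds-sdk's get_results(fp=...) pattern also used in Example #21 below:

import io

def _get_qubole_command_output(cmd):
    # Download the command's result set into an in-memory buffer
    # and return it as text.
    buf = io.BytesIO()
    cmd.get_results(fp=buf)
    buf.seek(0)
    return buf.read().decode('utf-8')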
Example #10
    def run_query(self, query, user):
        qbol.configure(api_token=self.configuration.get('token'),
                       api_url='%s/api' % self.configuration.get('endpoint'))

        try:
            query_type = self.configuration.get('query_type', 'hive')

            if query_type == 'quantum':
                cmd = SqlCommand.create(query=query)
            elif query_type == 'hive':
                cmd = HiveCommand.create(query=query, label=self.configuration.get('cluster'))
            elif query_type == 'presto':
                cmd = PrestoCommand.create(query=query, label=self.configuration.get('cluster'))
            else:
                raise Exception("Invalid Query Type:%s.\
                        It must be : hive / presto / quantum." % self.configuration.get('query_type'))

            logging.info("Qubole command created with Id: %s and Status: %s", cmd.id, cmd.status)

            while not Command.is_done(cmd.status):
                time.sleep(qbol.poll_interval)
                cmd = Command.find(cmd.id)
                logging.info("Qubole command Id: %s and Status: %s", cmd.id, cmd.status)

            rows = []
            columns = []
            error = None

            if cmd.status == 'done':
                fp = StringIO()
                cmd.get_results(fp=fp, inline=True, delim='\t', fetch=False,
                                qlog=None, arguments=['true'])

                results = fp.getvalue()
                fp.close()

                data = results.split('\r\n')
                columns = self.fetch_columns([(i, TYPE_STRING) for i in data.pop(0).split('\t')])
                rows = [dict(zip((c['name'] for c in columns), row.split('\t'))) for row in data]

            json_data = json_dumps({'columns': columns, 'rows': rows})
        except KeyboardInterrupt:
            logging.info('Sending KILL signal to Qubole Command Id: %s', cmd.id)
            cmd.cancel()
            error = "Query cancelled by user."
            json_data = None

        return json_data, error
Example #11
def qubole(api_token, sql, replacements, filename):
    Qubole.configure(api_token=api_token)
    with open(sql, 'r') as f:
        query = f.read()

    label = 'Trading-spark'
    query = find_replace_multi(query, replacements)
    hc = HiveCommand.run(query=query, label=label)
    cmd = Command.find(hc.id)
    out_file = filename + '.csv'

    with open(out_file, 'wb') as writer:
        cmd.get_results(writer)

    df = pd.read_csv(out_file, delimiter='\t')

    return df
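
A hedged invocation sketch (the file name and replacement mapping are hypothetical, and find_replace_multi is assumed to substitute each key of the mapping in the query text):

# Renders query.sql with the substitutions applied, runs it as a HiveCommand,
# and loads the tab-delimited results into a pandas DataFrame.
df = qubole(api_token='...',
            sql='query.sql',
            replacements={'{{start_date}}': '2021-01-01'},
            filename='trading_results')
print(df.head())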
Example #12
    def copy_to_temp_hive(self, columns):
        command = None
        copy_table_stmt = None
        try:
            copy_table_stmt = '''SET hive.exec.compress.output=false; 
                INSERT INTO %s_airflow_temp select * from %s''' % (
                self.table_name, self.table_name)

            command = HiveCommand.run(query=copy_table_stmt, label='default')
            self.monitor_command(command, copy_table_stmt)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'RS_copy_to_temp_hive call for %s failed. No command Id available. \n%s'
                    % (copy_table_stmt, e))
            else:
                raise AirflowException(
                    'RS_copy_to_temp_hive call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (copy_table_stmt, command.id, e))
Example #13
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)

    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2 (
                    cluster_id STRING,
                    cluster_inst_id STRING,
                    tag STRING,
                    ec2_tag STRING
                )
                PARTITIONED BY (account_id INT, event_date STRING)
                STORED AS ORC
                LOCATION '${defloc}/qubole_bi/clusters_metadata2/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
Example #14
    def test_execute_assert_query_fail(self, mock_get_hook):

        mock_cmd = mock.Mock()
        mock_cmd.status = 'error'
        mock_cmd.id = 123
        mock_cmd.is_success = mock.Mock(
            return_value=HiveCommand.is_success(mock_cmd.status))

        mock_hook = mock.Mock()
        mock_hook.get_first.return_value = [11]
        mock_hook.cmd = mock_cmd
        mock_get_hook.return_value = mock_hook

        operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

        with self.assertRaises(AirflowException) as cm:
            operator.execute()

        self.assertNotIn('Qubole Command Id: ', str(cm.exception))
        mock_cmd.is_success.assert_called_with(mock_cmd.status)
Example #15
    def test_execute_assertion_fail(self, mock_get_hook):

        mock_cmd = mock.Mock()
        mock_cmd.status = 'done'
        mock_cmd.id = 123
        mock_cmd.is_success = mock.Mock(
            return_value=HiveCommand.is_success(mock_cmd.status))

        mock_hook = mock.Mock()
        mock_hook.get_first.return_value = [11]
        mock_hook.cmd = mock_cmd
        mock_get_hook.return_value = mock_hook

        operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

        with self.assertRaisesRegex(AirflowException,
                                    'Qubole Command Id: ' + str(mock_cmd.id)):
            operator.execute()

        mock_cmd.is_success.assert_called_with(mock_cmd.status)
Example #16
    def test_execute_assertion_fail(self, mock_get_hook):

        mock_cmd = mock.Mock()
        mock_cmd.status = 'done'
        mock_cmd.id = 123
        mock_cmd.is_success = mock.Mock(
            return_value=HiveCommand.is_success(mock_cmd.status))

        mock_hook = mock.Mock()
        mock_hook.get_first.return_value = [11]
        mock_hook.cmd = mock_cmd
        mock_get_hook.return_value = mock_hook

        operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

        with self.assertRaisesRegex(AirflowException,
                                    'Qubole Command Id: ' + str(mock_cmd.id)):
            operator.execute()

        mock_cmd.is_success.assert_called_once_with(mock_cmd.status)
Example #17
    def test_execute_assert_query_fail(self, mock_get_hook):

        mock_cmd = mock.Mock()
        mock_cmd.status = 'error'
        mock_cmd.id = 123
        mock_cmd.is_success = mock.Mock(
            return_value=HiveCommand.is_success(mock_cmd.status))

        mock_hook = mock.Mock()
        mock_hook.get_first.return_value = [11]
        mock_hook.cmd = mock_cmd
        mock_get_hook.return_value = mock_hook

        operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

        with self.assertRaises(AirflowException) as cm:
            operator.execute()

        self.assertNotIn('Qubole Command Id: ', str(cm.exception))
        mock_cmd.is_success.assert_called_once_with(mock_cmd.status)
Example #18
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)

    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.queries_info (
                    command_id STRING,
                    engine STRING,
                    similarity_identifier STRING,
                    similarity_payload MAP<STRING, STRING>,
                    counters MAP<STRING, STRING>,
                    cluster_id INT)
              PARTITIONED BY (account_id INT, event_date STRING)
              STORED AS ORC
              LOCATION '${defloc}/qubole_bi/queries_info/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
Example #19
    def create_temp_hive_table(self, columns):
        command = None
        create_table_stmt = None
        try:
            create_table_stmt = """
            SET hive.exec.compress.output=false;

            DROP TABLE IF EXISTS %s_airflow_temp;  

            CREATE EXTERNAL TABLE %s_airflow_temp(
            %s)
            COMMENT 'The table %s_airflow_temp was generated by Hadoop Result Caching'
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            LINES TERMINATED BY '\\n'
            STORED AS TEXTFILE
            LOCATION 's3n://%s/result_cache/%s_airflow_temp/'
            TBLPROPERTIES("orc.compress"="NONE");""" % \
                                (self.table_name,
                                 self.table_name,
                                 ',\n'.join(['%s     %s' % (c[0], c[1]) for c in columns]),
                                 self.table_name,
                                 self.s3_bucket,
                                 self.table_name
                                 )

            command = HiveCommand.run(query=create_table_stmt, label='default')
            self.monitor_command(command, create_table_stmt)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'RS_create_temp_hive_table call for %s failed. No command Id available. \n %s'
                    % (create_table_stmt, e))
            else:
                raise AirflowException(
                    'RS_create_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s \n %s'
                    % (create_table_stmt, command.id, e))
Example #20
    def run_sql(self):
        command = None
        sql_stmt = None
        try:
            template = Template(self.raw_sql)
            dwd = DataWarehouseDates()
            sql_stmt = template.render(
                DS=self.ds,
                DS_TODAY=dwd.ds_today(self.ds),
                DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
                DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
                DS_TODAY_DATE_ADD=lambda days: dwd.date_add(
                    dwd.ds_today(self.ds), days))
            sql_stmt_with_tracking = '-- %s %s %s\n%s' % (
                self.dag_id, self.task_id, self.ds, sql_stmt)
            qubole_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)
            if 'PRESTO' in self.db_type.upper():
                command = PrestoCommand.create(query=sql_stmt_with_tracking,
                                               label=self.label,
                                               name=qubole_name)
            elif 'HIVE' in self.db_type.upper():
                command = HiveCommand.run(query=sql_stmt_with_tracking,
                                          label=self.label,
                                          name=qubole_name)
            else:
                raise AirflowException('Invalid db_type specified.')

            self.monitor_command(command, sql_stmt_with_tracking)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'run_sql call for %s failed. No command Id available.\n%s'
                    % (sql_stmt, e))
            else:
                raise AirflowException(
                    'run_sql call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (sql_stmt, command.id, e))
Example #21
    def get_table_columns(self):
        command = None
        sql = None
        try:
            sql = "show create table %s" % self.table_name
            command = HiveCommand.run(query=sql, label='default')
            buffer = io.BytesIO()
            command.get_results(fp=buffer)
            buffer.seek(0)

            buffer_list = [t for t in buffer]

            buffer.close()
            buffer = None

            column_names_types = []
            for l in buffer_list:
                decoded = l.decode("utf-8")
                column_type = decoded.split()
                if len(column_type) > 1:
                    if self.is_type(column_type[1].strip().strip(',').strip(
                            '(').strip(')')):
                        column_names_types.append(
                            (column_type[0].strip(),
                             column_type[1].strip(',').strip('(').strip(')')))

            return column_names_types

        except Exception as e:
            if command is None:
                raise AirflowException(
                    'RS_get_table_columns call for %s failed. No command Id available.\n%s'
                    % (sql, e))
            else:
                raise AirflowException(
                    'RS_get_table_columns call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (sql, command.id, e))
Example #22
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)

    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.`usage`(
                `cluster_id` INT,
                `cluster_inst_id` INT,
                `node_type` STRING,
                `cluster_type` STRING,
                `compute_usage` DECIMAL(30,8))
            PARTITIONED BY (
               `account_id` INT,
               `event_date` STRING)
            STORED AS ORC
            LOCATION '${defloc}/qubole_bi/qcuh/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
Example #23
def setup():
    """
    Execute a test query and return the HiveCommand object
    """
    hc = HiveCommand.run(query=TEST_QUERY)
    return hc
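
A short sketch of waiting on the HiveCommand returned by setup(), assuming Qubole.configure has already been called; the polling idiom mirrors Example #10 above.

import time

from qds_sdk.qubole import Qubole
from qds_sdk.commands import Command

hc = setup()
# Poll until the command reaches a terminal state ('done', 'error', 'cancelled').
while not Command.is_done(hc.status):
    time.sleep(Qubole.poll_interval)
    hc = Command.find(hc.id)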
Example #25
    def run_query(self, query, user):
        qbol.configure(
            api_token=self.configuration.get("token"),
            api_url="%s/api" % self.configuration.get("endpoint"),
        )

        try:
            query_type = self.configuration.get("query_type", "hive")

            if query_type == "quantum":
                cmd = SqlCommand.create(query=query)
            elif query_type == "hive":
                cmd = HiveCommand.create(
                    query=query, label=self.configuration.get("cluster"))
            elif query_type == "presto":
                cmd = PrestoCommand.create(
                    query=query, label=self.configuration.get("cluster"))
            else:
                raise Exception("Invalid Query Type:%s.\
                        It must be : hive / presto / quantum." %
                                self.configuration.get("query_type"))

            logging.info("Qubole command created with Id: %s and Status: %s",
                         cmd.id, cmd.status)

            while not Command.is_done(cmd.status):
                time.sleep(qbol.poll_interval)
                cmd = Command.find(cmd.id)
                logging.info("Qubole command Id: %s and Status: %s", cmd.id,
                             cmd.status)

            rows = []
            columns = []
            error = None

            if cmd.status == "done":
                fp = StringIO()
                cmd.get_results(
                    fp=fp,
                    inline=True,
                    delim="\t",
                    fetch=False,
                    qlog=None,
                    arguments=["true"],
                )

                results = fp.getvalue()
                fp.close()

                data = results.split("\r\n")
                columns = self.fetch_columns([
                    (i, TYPE_STRING) for i in data.pop(0).split("\t")
                ])
                rows = [
                    dict(
                        zip((column["name"] for column in columns),
                            row.split("\t"))) for row in data
                ]

            json_data = json_dumps({"columns": columns, "rows": rows})
        except KeyboardInterrupt:
            logging.info("Sending KILL signal to Qubole Command Id: %s",
                         cmd.id)
            cmd.cancel()
            error = "Query cancelled by user."
            json_data = None

        return json_data, error
Example #26
def run_hive_query_asynchronous(cluster_label, query_filename, **query_kwargs):
    template_loader = TemplateLoader(QUERY_DIR)
    query = template_loader.load_from_file(query_filename, **query_kwargs)
    return HiveCommand.create(query=query, label=cluster_label)
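
Since HiveCommand.create returns immediately, callers eventually need to block on the result. A hedged companion sketch (the helper name is hypothetical; the polling idiom again mirrors Example #10, and is_success is confirmed by the tests in Examples #14 to #17):

import time

from qds_sdk.qubole import Qubole
from qds_sdk.commands import Command

def wait_for_hive_query(cmd):
    # Poll the asynchronous command until it reaches a terminal state.
    while not Command.is_done(cmd.status):
        time.sleep(Qubole.poll_interval)
        cmd = Command.find(cmd.id)
    # Surface failures instead of silently returning an errored command.
    if not Command.is_success(cmd.status):
        raise RuntimeError('Hive query %s finished with status %s'
                           % (cmd.id, cmd.status))
    return cmd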