Example #1
 def on_kill(self):
     if not self.qubole_cmd_id:
         return
     try:
         # cancel_id() takes a raw command id; cancel() expects a SparkCommand instance.
         SparkCommand.cancel_id(self.qubole_cmd_id)
     except Exception as e:
         logger.error("Failed to cancel Qubole command %s: %s", self.qubole_cmd_id, e)
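All of these snippets assume the qds-sdk client has already been configured with an API token. A minimal, self-contained sketch of the submit-then-cancel pattern from Example #1 (the token and cluster label below are placeholders, not values taken from the examples):

from qds_sdk.qubole import Qubole
from qds_sdk.commands import SparkCommand

# One-time client configuration; the token value is a placeholder.
Qubole.configure(api_token="YOUR_QUBOLE_API_TOKEN")

# Submit a trivial Spark SQL command to the cluster identified by this label.
cmd = SparkCommand.create(sql="SELECT 1", label="spark-cluster")

# Cancel via the instance method, which resolves the command id itself.
try:
    cmd.cancel()
except Exception as exc:
    print("Failed to cancel Qubole command %s: %s" % (cmd.id, exc))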
Example #2
    def run_spark_job(self):
        command = None
        program = None
        try:
            template = Template(self.program)
            dwd = DataWarehouseDates()
            program = template.render(
                DS=self.ds,
                DS_TODAY=dwd.ds_today(self.ds),
                DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
                DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
                DS_TODAY_DATE_ADD=lambda days: dwd.date_add(
                    dwd.ds_today(self.ds), days))

            qubole_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)

            if 'SQL' in self.language.upper():
                program_with_tracking = '-- %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(sql=program_with_tracking,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            elif 'PYTHON' in self.language.upper(
            ) or 'R' in self.language.upper():
                program_with_tracking = '# %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(program=program_with_tracking,
                                              language=self.language,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            elif 'SCALA' in self.language.upper():
                program_with_tracking = '// %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(program=program_with_tracking,
                                              language=self.language,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            else:
                raise AirflowException('Invalid Spark language specified.')

            self.monitor_command(command, program_with_tracking)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'run_spark_job call for %s failed. No command Id available.\n%s'
                    % (self.program, e))
            else:
                raise AirflowException(
                    'run_spark_job call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (program, command.id, e))
Example #3
    def _handle_qubole_operator_execution(self, cmd):
        """
        Handles the Airflow + Databricks lifecycle logic for a Databricks operator
        :param run_id: Databricks run_id
        :param hook: Airflow databricks hook
        :param task_id: Databand Task Id.

        """
        self.qubole_cmd_id = cmd_id = cmd.id
        self.qubole_job_url = self._get_url(cmd.id)
        self.task_run.set_external_resource_urls({"qubole url": self.qubole_job_url})
        self.task_run.tracker.log_metric("qubole_cmd_id", cmd_id)

        self._qubole_banner(cmd.status)

        log_ptr, err_ptr = 0, 0

        while True:
            cmd = SparkCommand.find(cmd_id)
            status = cmd.status

            log, err_ptr, log_ptr, received_log = self._print_partial_log(
                cmd, err_ptr, log_ptr
            )
            if self.qubole_config.show_spark_log:
                self._qubole_banner(status)
                if received_log > 0:
                    logger.info("Spark LOG:")
                    logger.info(log)

            if SparkCommand.is_done(status):
                if SparkCommand.is_success(status):
                    results = self._get_results(cmd)
                    logger.info("Spark results:")
                    logger.info(results)
                    return True
                else:
                    results = self._get_results(cmd)
                    if results == "":
                        results = self._get_results(cmd, fallback=True)
                    logger.info("Spark results:")
                    logger.info(results)
                    recent_log = "\n".join(log.split("\n")[-50:])
                    raise failed_to_run_qubole_job(
                        status, self.qubole_job_url, recent_log
                    )
            else:
                time.sleep(self.qubole_config.status_polling_interval_seconds)
Example #4
 def run_spark_job(self, query, cluster, verbose=False, **kwargs):
     print('Running Spark job')
     cmd = SparkCommand.create(query=query, label=cluster, **kwargs)
     while cmd.attributes.get('status', None) != 'done':
         if verbose:
             cmd = self._get_logs(cmd)
         else:
             cmd = self._get_status(cmd)
     return cmd
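The loop in Example #4 only exits once the status becomes 'done', so a command that ends in 'error' or 'cancelled' would keep polling. A minimal sketch of a polling loop that handles all terminal states, using the Command.find / is_done / is_success helpers also seen in Example #3 (the function name and poll interval are illustrative):

import time

from qds_sdk.commands import SparkCommand

def wait_for_command(cmd_id, poll_seconds=10):
    # Poll a Qubole command until it reaches a terminal state;
    # return the refreshed command on success, raise on failure.
    while True:
        cmd = SparkCommand.find(cmd_id)        # refresh status from the API
        if SparkCommand.is_done(cmd.status):   # done / error / cancelled
            if SparkCommand.is_success(cmd.status):
                return cmd
            raise RuntimeError(
                "Qubole command %s finished with status %s" % (cmd_id, cmd.status))
        time.sleep(poll_seconds)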
Example #5
    def run_spark_step(self, uri_script, language, **kwargs):
        program_body = read_file_content(uri_script)
        spark_cmd = SparkCommand.run(program=program_body,
                                     language=language,
                                     **kwargs)

        self.logger.info('command id: {0}; Status: {1}'.format(
            spark_cmd.id, spark_cmd.status))
        self.logger.info('command result: {0}'.format(spark_cmd.get_results()))
        self.logger.info('command log: {0}'.format(spark_cmd.get_log()))
Example #6
    def run_spark(self, main_class):
        spark_cmd_line = CmdLineBuilder()
        spark_cmd_line.add("/usr/lib/spark/bin/spark-submit", "--class", main_class)
        spark_cmd_line.extend(self.config_to_command_line())

        # application jar
        spark_cmd_line.add(self.deploy.sync(self.config.main_jar))
        # add user side args
        spark_cmd_line.extend(list_of_strings(self.task.application_args()))

        cmd = SparkCommand.create(
            cmdline=spark_cmd_line.get_cmd_line(safe_curly_brackets=True),
            language="command_line",
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)
Example #7
    def run_pyspark(self, pyspark_script):
        # should be reimplemented using SparkSubmitHook (maybe from airflow)
        # note that config jars are not supported.

        arguments = list2cmdline_safe(list_of_strings(
            self.task.application_args()),
                                      safe_curly_brackets=True)

        cmd = SparkCommand.create(
            script_location=self.deploy.sync(pyspark_script),
            language="python",
            user_program_arguments=arguments,
            arguments=list2cmdline_safe(self.config_to_command_line(),
                                        safe_curly_brackets=True),
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)

        return True
Example #8
 def get_logs(self):
     if self.qubole_cmd_id:
         return SparkCommand.get_log_id(self.qubole_cmd_id)
     return None
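Note that get_log_id is the class-level variant that takes a raw command id; when a SparkCommand instance is already at hand, the instance methods get_log() and get_results() shown in Example #5 fetch the same log and result data.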