def on_kill(self):
    if not self.qubole_cmd_id:
        return
    try:
        # cancel_id is the qds-sdk class method that cancels a command by its id
        SparkCommand.cancel_id(self.qubole_cmd_id)
    except Exception as e:
        logger.error("Failed to stop Qubole command %s: %s", self.qubole_cmd_id, e)
def run_spark_job(self):
    command = None
    program = None
    try:
        template = Template(self.program)
        dwd = DataWarehouseDates()
        program = template.render(
            DS=self.ds,
            DS_TODAY=dwd.ds_today(self.ds),
            DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
            DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
            DS_TODAY_DATE_ADD=lambda days: dwd.date_add(dwd.ds_today(self.ds), days),
        )
        qubole_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)
        language = self.language.upper()
        if 'SQL' in language:
            program_with_tracking = '-- %s %s %s\n%s' % (
                self.dag_id, self.task_id, self.ds, program)
            command = SparkCommand.create(
                sql=program_with_tracking,
                arguments=self.arguments,
                label=self.label,
                name=qubole_name)
        elif 'PYTHON' in language or 'R' in language:
            program_with_tracking = '# %s %s %s\n%s' % (
                self.dag_id, self.task_id, self.ds, program)
            command = SparkCommand.create(
                program=program_with_tracking,
                language=self.language,
                arguments=self.arguments,
                label=self.label,
                name=qubole_name)
        elif 'SCALA' in language:
            program_with_tracking = '// %s %s %s\n%s' % (
                self.dag_id, self.task_id, self.ds, program)
            command = SparkCommand.create(
                program=program_with_tracking,
                language=self.language,
                arguments=self.arguments,
                label=self.label,
                name=qubole_name)
        else:
            raise AirflowException('Invalid Spark language specified.')
        self.monitor_command(command, program_with_tracking)
    except Exception as e:
        if command is None:
            # No Qubole command was created, so there is no command id to link to.
            raise AirflowException(
                'run_spark_job call for %s failed. No command id available.\n%s'
                % (program, e))
        else:
            raise AirflowException(
                'run_spark_job call for %s failed. '
                'https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (program, command.id, e))
def _handle_qubole_operator_execution(self, cmd):
    """
    Handles the lifecycle of a submitted Qubole Spark command: registers the
    command with the task run, streams its log, and polls until the command
    reaches a terminal state.

    :param cmd: the qds-sdk SparkCommand that was just created
    """
    self.qubole_cmd_id = cmd_id = cmd.id
    self.qubole_job_url = self._get_url(cmd.id)
    self.task_run.set_external_resource_urls({"qubole url": self.qubole_job_url})
    self.task_run.tracker.log_metric("qubole_cmd_id", cmd_id)
    self._qubole_banner(cmd.status)

    log_ptr, err_ptr = 0, 0
    while True:
        cmd = SparkCommand.find(cmd_id)
        status = cmd.status
        log, err_ptr, log_ptr, received_log = self._print_partial_log(
            cmd, err_ptr, log_ptr
        )
        if self.qubole_config.show_spark_log:
            self._qubole_banner(status)
            if received_log > 0:
                logger.info("Spark LOG:")
                logger.info(log)
        if SparkCommand.is_done(status):
            if SparkCommand.is_success(status):
                results = self._get_results(cmd)
                logger.info("Spark results:")
                logger.info(results)
                return True
            else:
                results = self._get_results(cmd)
                if results == "":
                    results = self._get_results(cmd, fallback=True)
                logger.info("Spark results:")
                logger.info(results)
                recent_log = "\n".join(log.split("\n")[-50:])
                raise failed_to_run_qubole_job(
                    status, self.qubole_job_url, recent_log
                )
        else:
            time.sleep(self.qubole_config.status_polling_interval_seconds)
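# failed_to_run_qubole_job is not defined in this snippet; a minimal sketch of what it
# might look like, returning a plain Exception (the real project likely uses its own
# error type and message format):
def failed_to_run_qubole_job(status, job_url, recent_log):
    return Exception(
        "Qubole Spark command finished with status '%s'.\n"
        "Details: %s\nRecent log:\n%s" % (status, job_url, recent_log)
    )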
def run_spark_job(self, query, cluster, verbose=False, **kwargs):
    print('Running Spark job')
    cmd = SparkCommand.create(query=query, label=cluster, **kwargs)
    while cmd.attributes.get('status', None) != 'done':
        if verbose:
            cmd = self._get_logs(cmd)
        else:
            cmd = self._get_status(cmd)
    return cmd
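# A hedged usage sketch for the polling runner above, assuming the qds-sdk has been
# configured with an API token. SparkJobRunner, the query, and the cluster label are
# hypothetical placeholders, not names from the original code.
from qds_sdk.qubole import Qubole

Qubole.configure(api_token="YOUR_API_TOKEN")
runner = SparkJobRunner()  # hypothetical owner class of run_spark_job
cmd = runner.run_spark_job(
    query="SELECT count(*) FROM events",
    cluster="spark-cluster-label",
    verbose=True,
)
print(cmd.attributes.get("status"))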
def run_spark_step(self, uri_script, language, **kwargs):
    program_body = read_file_content(uri_script)
    # SparkCommand.run blocks until the command completes and returns the command object.
    spark_cmd = SparkCommand.run(program=program_body, language=language, **kwargs)
    self.logger.info('command id: {0}; status: {1}'.format(
        spark_cmd.id, spark_cmd.status))
    self.logger.info('command result: {0}'.format(spark_cmd.get_results()))
    self.logger.info('command log: {0}'.format(spark_cmd.get_log()))
def run_spark(self, main_class):
    spark_cmd_line = CmdLineBuilder()
    spark_cmd_line.add("/usr/lib/spark/bin/spark-submit", "--class", main_class)
    spark_cmd_line.extend(self.config_to_command_line())

    # application jar
    spark_cmd_line.add(self.deploy.sync(self.config.main_jar))
    # add user side args
    spark_cmd_line.extend(list_of_strings(self.task.application_args()))

    cmd = SparkCommand.create(
        cmdline=spark_cmd_line.get_cmd_line(safe_curly_brackets=True),
        language="command_line",
        label=self.qubole_config.cluster_label,
        name=self.task.task_id,
    )
    self._handle_qubole_operator_execution(cmd)
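# CmdLineBuilder is not shown in this snippet; a minimal sketch of the interface used
# above. The curly-bracket escaping is an assumption about what safe_curly_brackets
# is meant to do (keeping Qubole from interpreting braces in the command line).
import shlex


class CmdLineBuilder(object):
    def __init__(self):
        self._parts = []

    def add(self, *args):
        # append one or more positional tokens to the command line
        self._parts.extend(args)

    def extend(self, args):
        # append a list of tokens to the command line
        self._parts.extend(args)

    def get_cmd_line(self, safe_curly_brackets=False):
        cmd_line = " ".join(shlex.quote(p) for p in self._parts)
        if safe_curly_brackets:
            cmd_line = cmd_line.replace("{", "\\{").replace("}", "\\}")
        return cmd_line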
def run_pyspark(self, pyspark_script):
    # should be reimplemented using SparkSubmitHook (maybe from airflow)
    # note that config jars are not supported.
    arguments = list2cmdline_safe(
        list_of_strings(self.task.application_args()), safe_curly_brackets=True
    )

    cmd = SparkCommand.create(
        script_location=self.deploy.sync(pyspark_script),
        language="python",
        user_program_arguments=arguments,
        arguments=list2cmdline_safe(
            self.config_to_command_line(), safe_curly_brackets=True
        ),
        label=self.qubole_config.cluster_label,
        name=self.task.task_id,
    )
    self._handle_qubole_operator_execution(cmd)
    return True
def get_logs(self):
    if self.qubole_cmd_id:
        # despite the name, get_log_id returns the log text for the command with this id
        return SparkCommand.get_log_id(self.qubole_cmd_id)
    return None
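# A hedged usage sketch of get_logs, e.g. from an on-failure callback; `task` is a
# hypothetical instance of the class that defines get_logs/on_kill above.
import logging

logger = logging.getLogger(__name__)

logs = task.get_logs()
if logs:
    logger.error("Qubole command log:\n%s", logs)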