def _run_spark_submit(self, application, jars):
    assert_airflow_package_installed()
    from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
    from airflow.exceptions import AirflowException

    # task_env = get_cloud_config(Clouds.local)
    spark_local_config = SparkLocalEngineConfig()
    _config = self.config
    deploy = self.deploy
    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=spark_local_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(self.task.get_py_files()),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )

    log_buffer = StringIO()
    with log_buffer as lb:
        dbnd_log_handler = self._capture_submit_log(spark, lb)
        try:
            spark.submit(application=application)
        except AirflowException as ex:
            return_code = self._get_spark_return_code_from_exception(ex)
            if return_code != "0":
                error_snippets = parse_spark_log_safe(
                    log_buffer.getvalue().split(os.linesep)
                )
                raise failed_to_run_spark_script(
                    self,
                    spark._build_spark_submit_command(application=application),
                    application,
                    return_code,
                    error_snippets,
                )
            else:
                raise failed_spark_status(ex)
        finally:
            # detach only the handler we attached above, keep the rest
            spark.log.handlers = [
                h for h in spark.log.handlers if h is not dbnd_log_handler
            ]
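# _capture_submit_log is referenced above but not shown. A minimal sketch of
# what such a helper could look like: attach a StreamHandler that mirrors the
# hook's log records into the given buffer, and return it so the caller can
# detach it in `finally`. The name and contract come from the call site; the
# body below is an assumption, not the actual dbnd implementation.
import logging


def _capture_submit_log_sketch(spark_hook, buffer):
    handler = logging.StreamHandler(buffer)  # write records into the StringIO buffer
    handler.setLevel(logging.INFO)
    spark_hook.log.addHandler(handler)
    return handler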
def _run_spark_submit(self, file, jars):
    """
    Submit the application to Livy as a batch.

    Request body:

        file            File containing the application to run (required)    path
        proxyUser       User ID to impersonate when running the job          string
        className       Application Java or Spark main class                 string
        args            Command line arguments for the application           list of strings
        jars            Jar files to be used in this session                 list of strings
        pyFiles         Python files to be used in this session              list of strings
        files           Other files to be used in this session               list of strings
        driverMemory    Amount of memory to use for the driver process       string
        driverCores     Number of cores to use for the driver process        int
        executorMemory  Amount of memory to use for each executor process    string
        executorCores   Number of cores to use for each executor             int
        numExecutors    Number of executors to launch for this session       int
        archives        Archives to be used in this session                  list of strings
        queue           Name of the YARN queue to which the job is submitted string
        name            Name of this session                                 string
        conf            Spark configuration properties                       Map of key=val
    """
    task = self.task  # type: SparkTask
    _config = task.spark_config
    deploy = self.deploy
    data = dict(
        conf=_config.conf,
        file=deploy.sync(file),
        className=task.main_class,
        name=self.job.job_id,
        args=list_of_strings(task.application_args()),
        files=deploy.sync_files(_config.files),
        pyFiles=deploy.sync_files(task.get_py_files()),
        jars=deploy.sync_files(jars),
        executorCores=_config.executor_cores,
        executorMemory=_config.executor_memory,
        driverMemory=_config.driver_memory,
        # driver cores currently reuse the executor core setting
        driverCores=_config.executor_cores,
        proxyUser=_config.proxy_user,
        queue=_config.queue,
        archives=_config.archives,
        numExecutors=_config.num_executors,
    )
    # Livy rejects nulls, so drop every unset key from the payload
    data = {k: v for k, v in six.iteritems(data) if v is not None}

    livy_endpoint = self.get_livy_endpoint()
    logger.info("Connecting to: %s", livy_endpoint)
    livy_config = self.task_run.task.spark_engine
    livy = LivyBatchClient.from_endpoint(
        livy_endpoint,
        status_code_retries=livy_config.retry_on_status_error,
        ignore_ssl_errors=self.get_livy_ignore_ssl_errors(),
    )
    batch = livy.post_batch(data)
    self._run_hook(batch, self.livy_config.job_submitted_hook)
    livy.track_batch_progress(
        batch["id"], status_reporter=self._report_livy_batch_status
    )
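# For reference, the payload assembled above maps onto Livy's POST /batches
# REST endpoint. A minimal sketch with `requests`, assuming a plain
# unauthenticated Livy server (LivyBatchClient itself is internal and adds
# retries and status polling on top of this):
import requests


def post_livy_batch_sketch(livy_url, data):
    # Livy expects the batch definition as a JSON body
    resp = requests.post(livy_url + "/batches", json=data)
    resp.raise_for_status()
    return resp.json()  # contains the batch "id" used for progress tracking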
def run_pyspark(self, pyspark_script):
    # should be reimplemented using SparkSubmitHook (maybe from airflow)
    # note that config jars are not supported.
    if not self.databricks_config.cluster_id:
        # no existing cluster: run on a new cluster via spark-submit parameters
        spark_submit_parameters = [self.sync(pyspark_script)] + list_of_strings(
            self.task.application_args()
        )
        databricks_json = self._create_spark_submit_json(spark_submit_parameters)
    else:
        # existing cluster: run as a python task; dbfs:// paths must be
        # translated to the cluster's local view of DBFS
        pyspark_script = self.sync(pyspark_script)
        parameters = [
            self._dbfs_scheme_to_local(e)
            for e in list_of_strings(self.task.application_args())
        ]
        databricks_json = self._create_pyspark_submit_json(
            python_file=pyspark_script, parameters=parameters
        )
    return self._run_spark_submit(databricks_json)
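# _create_pyspark_submit_json is internal; for orientation, a Databricks Runs
# Submit payload for an existing cluster would look roughly like the sketch
# below (field names follow the public Jobs API; run_name is illustrative and
# the helper's actual output may differ):
def create_pyspark_submit_json_sketch(cluster_id, python_file, parameters):
    return {
        "run_name": "dbnd-pyspark-task",  # illustrative
        "existing_cluster_id": cluster_id,
        "spark_python_task": {
            "python_file": python_file,
            "parameters": parameters,
        },
    }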
def _task_banner(self, banner, verbosity):
    b = banner
    b.new_section()
    try:
        spark_command_line = subprocess.list2cmdline(
            list_of_strings(self.application_args())
        )
        b.column("SPARK CMD LINE", spark_command_line)
    except Exception:
        logger.exception("Failed to get spark command line from %s", self)
def _add_spark_info(self):
    b = self.banner
    b.new_section()
    try:
        spark_command_line = subprocess.list2cmdline(
            list_of_strings(self.task.application_args())
        )
        b.column("SPARK CMD LINE", spark_command_line)
    except Exception:
        logger.exception("Failed to get spark command line from %s", self.task)
def _get_job_builder(self, job_type):
    job_builder = self.cluster_hook.create_job_template(
        self.task.task_id,
        self.dataproc.cluster,
        job_type=job_type,
        properties=self.config.conf,
    )
    # set_job_name gives the job a unique name per run
    job_builder.set_job_name(self.job.job_name)
    job_builder.add_args(list_of_strings(self.task.application_args()))
    job_builder.add_file_uris(self.deploy.sync_files(self.config.files))
    return job_builder
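# For context, the job template assembled above ultimately serializes into a
# Dataproc job description. For a Spark job type it would look roughly like
# this sketch (REST field names per the Dataproc API; values illustrative,
# not the hook's exact output):
def dataproc_spark_job_sketch(cluster_name, main_class, args, file_uris, properties):
    return {
        "placement": {"clusterName": cluster_name},
        "sparkJob": {
            "mainClass": main_class,
            "args": args,
            "fileUris": file_uris,
            "properties": properties,
        },
    }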
def run_spark(self, main_class):
    # spark-submit takes --jars as a single comma-separated value, and treats
    # everything after the application jar as application arguments, so the
    # option has to be joined and placed before the jar itself
    jars_list = []
    jars = self.config.jars
    if jars:
        jars_list = ["--jars", ",".join(jars)]
    # should be reimplemented using SparkSubmitHook (maybe from airflow)
    spark_submit_parameters = (
        jars_list
        + ["--class", main_class, self.sync(self.config.main_jar)]
        + list_of_strings(self.task.application_args())
    )
    databricks_json = self._create_spark_submit_json(spark_submit_parameters)
    return self._run_spark_submit(databricks_json)
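# Counterpart sketch for _create_spark_submit_json: on Databricks, a
# spark_submit_task can only run on a new cluster (which also explains the
# cluster_id branch in run_pyspark above). Field names follow the public
# Jobs API; the new_cluster settings are illustrative assumptions:
def create_spark_submit_json_sketch(spark_submit_parameters):
    return {
        "run_name": "dbnd-spark-task",  # illustrative
        "new_cluster": {
            "spark_version": "7.3.x-scala2.12",  # illustrative
            "node_type_id": "i3.xlarge",  # illustrative
            "num_workers": 2,  # illustrative
        },
        "spark_submit_task": {"parameters": spark_submit_parameters},
    }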
def run_spark(self, main_class):
    spark_cmd_line = CmdLineBuilder()
    spark_cmd_line.add("/usr/lib/spark/bin/spark-submit", "--class", main_class)
    spark_cmd_line.extend(self.config_to_command_line())
    # application jar
    spark_cmd_line.add(self.deploy.sync(self.config.main_jar))
    # add user side args
    spark_cmd_line.extend(list_of_strings(self.task.application_args()))

    cmd = SparkCommand.create(
        cmdline=spark_cmd_line.get_cmd_line(safe_curly_brackets=True),
        language="command_line",
        label=self.qubole_config.cluster_label,
        name=self.task.task_id,
    )
    self._handle_qubole_operator_execution(cmd)
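# config_to_command_line is referenced above but not shown; a sketch of the
# kind of mapping it presumably performs, from SparkConfig fields to
# spark-submit flags. The field and flag coverage here is an assumption,
# not the full dbnd list:
def config_to_command_line_sketch(config):
    cmd = []
    if config.conf:
        for key, value in config.conf.items():
            cmd += ["--conf", "%s=%s" % (key, value)]
    if config.executor_memory:
        cmd += ["--executor-memory", config.executor_memory]
    if config.driver_memory:
        cmd += ["--driver-memory", config.driver_memory]
    if config.num_executors:
        cmd += ["--num-executors", str(config.num_executors)]
    return cmd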
def _run_spark_submit(self, file, jars):
    from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

    _config = self.config
    deploy = self.deploy
    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=self.emr_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(_config.py_files),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )
    # the hook only builds the command line here; the submit itself runs
    # as an EMR step
    step_id = self.emr_cluster.run_spark_submit_step(
        name=self.job.job_id,
        spark_submit_command=spark._build_spark_submit_command(
            application=deploy.sync(file)
        ),
    )
    self.task_run.set_external_resource_urls(
        self.emr_cluster.get_emr_logs_dict(self.spark_application_logs)
    )
    self.emr_cluster.wait_for_step_completion(
        step_id, status_reporter=self._report_step_status
    )
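# run_spark_submit_step is an internal wrapper; on EMR, running a spark-submit
# command as a step commonly goes through command-runner.jar. A hedged boto3
# sketch under that assumption (not the dbnd implementation):
import boto3


def run_spark_submit_step_sketch(cluster_id, name, spark_submit_command):
    emr = boto3.client("emr")
    response = emr.add_job_flow_steps(
        JobFlowId=cluster_id,
        Steps=[
            {
                "Name": name,
                "ActionOnFailure": "CONTINUE",
                "HadoopJarStep": {
                    "Jar": "command-runner.jar",
                    "Args": spark_submit_command,
                },
            }
        ],
    )
    return response["StepIds"][0]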
def run_pyspark(self, pyspark_script):
    # should be reimplemented using SparkSubmitHook (maybe from airflow)
    # note that config jars are not supported.
    arguments = list2cmdline_safe(
        list_of_strings(self.task.application_args()), safe_curly_brackets=True
    )

    cmd = SparkCommand.create(
        script_location=self.deploy.sync(pyspark_script),
        language="python",
        user_program_arguments=arguments,
        arguments=list2cmdline_safe(
            self.config_to_command_line(), safe_curly_brackets=True
        ),
        label=self.qubole_config.cluster_label,
        name=self.task.task_id,
    )
    self._handle_qubole_operator_execution(cmd)
    return True
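# list2cmdline_safe is an internal helper; its safe_curly_brackets flag most
# likely exists because Qubole expands ${...} macros in command strings. A
# sketch under that assumption (the real escaping rules may differ):
import subprocess


def list2cmdline_safe_sketch(args, safe_curly_brackets=False):
    cmdline = subprocess.list2cmdline(args)
    if safe_curly_brackets:
        # escape literal braces so Qubole does not treat them as macros
        cmdline = cmdline.replace("{", "\\{").replace("}", "\\}")
    return cmdline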
def _run_spark_submit(self, application, jars):
    # task_env = get_cloud_config(Clouds.local)
    spark_local_config = SparkLocalEngineConfig()
    _config = self.config
    deploy = self.deploy

    AIRFLOW_ON = is_airflow_enabled()
    if AIRFLOW_ON:
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
        from airflow.exceptions import AirflowException as SparkException
    else:
        from dbnd_spark._vendor.airflow.spark_hook import (
            SparkException,
            SparkSubmitHook,
        )

    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=spark_local_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(self.task.get_py_files()),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )
    if not AIRFLOW_ON:
        # If there's no Airflow, there's no Connection either, so we take
        # the connection information from the spark config
        spark.set_connection(spark_local_config.conn_uri)

    log_buffer = StringIO()
    with log_buffer as lb:
        dbnd_log_handler = self._capture_submit_log(spark, lb)
        try:
            # str() because application can be a Target object (JAR files)
            # or a str (path to a pyspark script)
            spark.submit(application=str(application))
        except SparkException as ex:
            return_code = self._get_spark_return_code_from_exception(ex)
            if return_code != "0":
                error_snippets = parse_spark_log_safe(
                    log_buffer.getvalue().split(os.linesep)
                )
                raise failed_to_run_spark_script(
                    self,
                    spark._build_spark_submit_command(application=application),
                    application,
                    return_code,
                    error_snippets,
                )
            else:
                raise failed_spark_status(ex)
        finally:
            # detach only the handler we attached above, keep the rest
            spark.log.handlers = [
                h for h in spark.log.handlers if h is not dbnd_log_handler
            ]
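# parse_spark_log_safe is used by both _run_spark_submit variants above; a
# minimal sketch of the kind of error-snippet extraction it might perform
# (the markers below are an assumption, not the real matching rules):
def parse_spark_log_safe_sketch(log_lines):
    try:
        markers = ("Error", "Exception", "Traceback")
        return [line for line in log_lines if any(m in line for m in markers)]
    except Exception:
        # "safe": never let log parsing mask the original failure
        return []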