def test_process_spark_submit_log_k8s(self):
    """Feed a Kubernetes watcher log to the hook and check pod name and exit code."""
    # Given
    k8s_hook = SparkSubmitHook(conn_id='spark_k8s_cluster')
    # All fragments are joined into ONE string, so the list has a single entry.
    watcher_output = (
        'INFO LoggingPodStatusWatcherImpl:54 - State changed, new state:'
        'pod name: spark-pi-edf2ace37be7353a958b38733a12f8e6-driver'
        'namespace: default'
        'labels: spark-app-selector -> spark-465b868ada474bda82ccb84ab2747fcd,'
        'spark-role -> driver'
        'pod uid: ba9c61f6-205f-11e8-b65f-d48564c88e42'
        'creation time: 2018-03-05T10:26:55Z'
        'service account name: spark'
        'volumes: spark-init-properties, download-jars-volume,'
        'download-files-volume, spark-token-2vmlm'
        'node name: N/A'
        'start time: N/A'
        'container images: N/A'
        'phase: Pending'
        'status: []'
        '2018-03-05 11:26:56 INFO LoggingPodStatusWatcherImpl:54 - State changed,'
        ' new state:'
        'pod name: spark-pi-edf2ace37be7353a958b38733a12f8e6-driver'
        'namespace: default'
        'Exit code: 999'
    )
    submit_log = [watcher_output]

    # When
    k8s_hook._process_spark_submit_log(submit_log)

    # Then
    expected_pod = 'spark-pi-edf2ace37be7353a958b38733a12f8e6-driver'
    self.assertEqual(k8s_hook._kubernetes_driver_pod, expected_pod)
    self.assertEqual(k8s_hook._spark_exit_code, 999)
def env_vars_exception_in_standalone_cluster_mode():
    """Build a submit command on a standalone-cluster hook that was given env vars."""
    # NOTE(review): ``self`` is not a parameter; presumably this function is a
    # closure nested in a test method that provides it -- confirm at the caller.
    # Given
    standalone_hook = SparkSubmitHook(
        conn_id='spark_standalone_cluster',
        env_vars={"bar": "foo"},
    )

    # When
    standalone_hook._build_spark_submit_command(self._spark_job_file)
def test_build_command(self):
    """Check that ``_build_command`` emits the full expected argument list.

    The hook is built from ``self._config`` and the command is built for
    ``self._spark_job_file``; the result must equal the list below exactly,
    including order.
    """
    # Given
    hook = SparkSubmitHook(**self._config)

    # When
    cmd = hook._build_command(self._spark_job_file)

    # Then
    # NOTE(review): the '--principal' value looks like an e-mail-obfuscation
    # artifact; confirm it matches the 'principal' entry of self._config.
    expected_build_cmd = [
        'spark-submit',
        '--master', 'yarn',
        '--conf', 'parquet.compression=SNAPPY',
        '--files', 'hive-site.xml',
        '--py-files', 'sample_library.py',
        '--jars', 'parquet.jar',
        '--num-executors', '10',
        '--total-executor-cores', '4',
        '--executor-cores', '4',
        '--executor-memory', '22g',
        '--driver-memory', '3g',
        '--keytab', 'privileged_user.keytab',
        '--principal', 'user/[email protected]',
        '--name', 'spark-job',
        '--class', 'com.foo.bar.AppMain',
        '--verbose',
        'test_application.py',
        '-f', 'foo',
        '--bar', 'bar',
        'baz'
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected_build_cmd, cmd)
def test_build_spark_submit_command(self):
    """Check that ``_build_spark_submit_command`` emits the expected arguments.

    The hook is built from ``self._config`` and the command is built for
    ``self._spark_job_file``; the result must equal the list below exactly,
    including order.
    """
    # Given
    hook = SparkSubmitHook(**self._config)

    # When
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    # NOTE(review): the '--principal' value looks like an e-mail-obfuscation
    # artifact; confirm it matches the 'principal' entry of self._config.
    expected_build_cmd = [
        'spark-submit',
        '--master', 'yarn',
        '--conf', 'parquet.compression=SNAPPY',
        '--files', 'hive-site.xml',
        '--py-files', 'sample_library.py',
        '--jars', 'parquet.jar',
        '--packages', 'com.databricks:spark-avro_2.11:3.2.0',
        '--exclude-packages', 'org.bad.dependency:1.0.0',
        '--repositories', 'http://myrepo.org',
        '--num-executors', '10',
        '--total-executor-cores', '4',
        '--executor-cores', '4',
        '--executor-memory', '22g',
        '--driver-memory', '3g',
        '--keytab', 'privileged_user.keytab',
        '--principal', 'user/[email protected]',
        '--name', 'spark-job',
        '--class', 'com.foo.bar.AppMain',
        '--verbose',
        'test_application.py',
        '-f', 'foo',
        '--bar', 'bar',
        '--with-spaces', 'args should keep embdedded spaces',
        'baz'
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected_build_cmd, cmd)
def test_resolve_spark_submit_env_vars_k8s(self):
    """Check that env vars appear as a driverEnv conf entry at index 4 of the command."""
    # Given
    k8s_hook = SparkSubmitHook(
        conn_id='spark_k8s_cluster',
        env_vars={"bar": "foo"},
    )

    # When
    submit_cmd = k8s_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    driver_env_arg = submit_cmd[4]
    self.assertEqual(driver_env_arg, "spark.kubernetes.driverEnv.bar=foo")
def test_submit(self, mock_process):
    """Run ``submit`` on a default hook against a mocked process; no assertions."""
    # We don't have spark-submit available, and this is hard to mock, so let's
    # just use this simple mock.
    fake_proc = mock_process.Popen.return_value
    fake_proc.stdout = StringIO(u'stdout')
    fake_proc.stderr = StringIO(u'stderr')
    fake_proc.returncode = None
    fake_proc.communicate.return_value = ['extra stdout', 'extra stderr']

    default_hook = SparkSubmitHook()
    default_hook.submit(self._spark_job_file)
def test_resolve_spark_submit_env_vars_standalone_client_mode(self):
    """Check that building the command stores the env vars on ``hook._env``."""
    # Given
    env = {"bar": "foo"}
    client_hook = SparkSubmitHook(
        conn_id='spark_standalone_cluster_client_mode',
        env_vars=env,
    )

    # When
    client_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    self.assertEqual(client_hook._env, {"bar": "foo"})
def test_spark_process_runcmd(self, mock_popen):
    """Check the first Popen call made by ``submit`` on a hook with an empty conn_id."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = StringIO(u'stdout')
    fake_proc.stderr = StringIO(u'stderr')
    fake_proc.wait.return_value = 0

    # When
    yarn_hook = SparkSubmitHook(conn_id='')
    yarn_hook.submit()

    # Then
    expected_call = call(
        ['spark-submit', '--master', 'yarn', '--name', 'default-name', ''],
        stdout=-1,
        stderr=-2,
    )
    self.assertEqual(mock_popen.mock_calls[0], expected_call)
def test_process_log(self):
    """Check that ``_process_log`` picks the YARN application id out of the log."""
    # Must select yarn connection
    yarn_hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    submit_output = [
        'SPARK_MAJOR_VERSION is set to 2, using Spark2',
        'WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
        'WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.',
        'INFO Client: Requesting a new application from cluster with 10 NodeManagers',
        'INFO Client: Submitting application application_1486558679801_1820 to ResourceManager',
    ]

    yarn_hook._process_log(submit_output)

    expected_app_id = 'application_1486558679801_1820'
    assert yarn_hook._yarn_application_id == expected_app_id
def execute(self, context):
    """
    Create a SparkSubmitHook from this operator's settings and submit the application.

    The hook is stored on ``self._hook``; ``context`` is not used here.
    """
    hook_kwargs = dict(
        conf=self._conf,
        conn_id=self._conn_id,
        files=self._files,
        py_files=self._py_files,
        driver_classpath=self._driver_classpath,
        jars=self._jars,
        java_class=self._java_class,
        packages=self._packages,
        exclude_packages=self._exclude_packages,
        repositories=self._repositories,
        total_executor_cores=self._total_executor_cores,
        executor_cores=self._executor_cores,
        executor_memory=self._executor_memory,
        driver_memory=self._driver_memory,
        keytab=self._keytab,
        principal=self._principal,
        name=self._name,
        num_executors=self._num_executors,
        application_args=self._application_args,
        verbose=self._verbose,
    )
    self._hook = SparkSubmitHook(**hook_kwargs)
    self._hook.submit(self._application)
def test_resolve_connection_spark_binary_set_connection(self):
    """Check the resolved connection and binary for the 'spark_binary_set' connection."""
    # Given
    binary_hook = SparkSubmitHook(conn_id='spark_binary_set')

    # When
    resolved = binary_hook._resolve_connection()
    submit_cmd = binary_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "yarn",
        "spark_binary": "custom-spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], 'custom-spark-submit')
def test_resolve_connection_spark_standalone_cluster_connection(self):
    """Check the resolved connection and binary path for 'spark_standalone_cluster'."""
    # Given
    standalone_hook = SparkSubmitHook(conn_id='spark_standalone_cluster')

    # When
    resolved = standalone_hook._resolve_connection()
    submit_cmd = standalone_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "spark://spark-standalone-master:6066",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": None,
        "spark_home": "/path/to/spark_home",
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], '/path/to/spark_home/bin/spark-submit')
def test_process_spark_submit_log_standalone_cluster(self):
    """Check that the driver id is extracted from a standalone-cluster submit log."""
    # Given
    standalone_hook = SparkSubmitHook(conn_id='spark_standalone_cluster')
    submit_output = [
        'Running Spark using the REST application submission protocol.',
        ('17/11/28 11:14:15 INFO RestSubmissionClient: Submitting a request '
         'to launch an application in spark://spark-standalone-master:6066'),
        ('17/11/28 11:14:15 INFO RestSubmissionClient: Submission successfully '
         'created as driver-20171128111415-0001. Polling submission state...'),
    ]

    # When
    standalone_hook._process_spark_submit_log(submit_output)

    # Then
    self.assertEqual(standalone_hook._driver_id, 'driver-20171128111415-0001')
def test_resolve_connection_spark_binary_and_home_set_connection(self):
    """Check resolved connection and binary path for 'spark_binary_and_home_set'."""
    # Given
    custom_hook = SparkSubmitHook(conn_id='spark_binary_and_home_set')

    # When
    resolved = custom_hook._resolve_connection()
    submit_cmd = custom_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "yarn",
        "spark_binary": "custom-spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": "/path/to/spark_home",
        "namespace": None,
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], '/path/to/spark_home/bin/custom-spark-submit')
def test_process_log(self):
    """Check that ``_process_log`` records the YARN application id from the log."""
    # Given
    yarn_hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    submit_output = [
        'SPARK_MAJOR_VERSION is set to 2, using Spark2',
        'WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
        'WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.',
        'INFO Client: Requesting a new application from cluster with 10 NodeManagers',
        'INFO Client: Submitting application application_1486558679801_1820 to ResourceManager',
    ]

    # When
    yarn_hook._process_log(submit_output)

    # Then
    expected_app_id = 'application_1486558679801_1820'
    self.assertEqual(yarn_hook._yarn_application_id, expected_app_id)
def test_resolve_connection_spark_binary_default_value(self):
    """Check the resolved connection and binary for the 'spark_default' connection."""
    # Given
    default_hook = SparkSubmitHook(conn_id='spark_default')

    # When
    resolved = default_hook._resolve_connection()
    submit_cmd = default_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": 'root.default',
        "spark_home": None,
        "namespace": 'default',
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], 'spark-submit')
def test_resolve_connection_spark_standalone_cluster_connection(self):
    """Check resolved connection (with namespace) for 'spark_standalone_cluster'."""
    # Given
    standalone_hook = SparkSubmitHook(conn_id='spark_standalone_cluster')

    # When
    resolved = standalone_hook._resolve_connection()
    submit_cmd = standalone_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "spark://spark-standalone-master:6066",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": None,
        "spark_home": "/path/to/spark_home",
        "namespace": 'default',
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], '/path/to/spark_home/bin/spark-submit')
def test_resolve_connection_spark_home_set_connection(self):
    """Check that a configured spark home prefixes the spark-submit binary."""
    # Given
    home_hook = SparkSubmitHook(conn_id='spark_home_set')

    # When
    resolved = home_hook._resolve_connection()
    submit_cmd = home_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": "/opt/myspark",
        "namespace": 'default',
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], '/opt/myspark/bin/spark-submit')
def test_resolve_connection_mesos_default_connection(self):
    """Check resolved connection and ``--master`` for 'spark_default_mesos'."""
    # Given
    mesos_hook = SparkSubmitHook(conn_id='spark_default_mesos')

    # When
    resolved = mesos_hook._resolve_connection()
    submit_cmd = mesos_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    args_by_flag = self.cmd_args_to_dict(submit_cmd)
    expected = {
        "master": "mesos://host:5050",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(args_by_flag["--master"], "mesos://host:5050")
def test_yarn_process_on_kill(self, mock_popen):
    """Check that ``on_kill`` issues a ``yarn application -kill`` for the parsed app id."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = six.StringIO('stdout')
    fake_proc.stderr = six.StringIO('stderr')
    fake_proc.poll.return_value = None
    fake_proc.wait.return_value = 0
    submit_output = [
        'SPARK_MAJOR_VERSION is set to 2, using Spark2',
        ('WARN NativeCodeLoader: Unable to load native-hadoop library for your '
         'platform... using builtin-java classes where applicable'),
        ('WARN DomainSocketFactory: The short-circuit local reads feature cannot '
         'be used because libhadoop cannot be loaded.'),
        ('INFO Client: Requesting a new application from cluster with 10 '
         'NodeManagerapplication_1486558679801_1820s'),
        ('INFO Client: Submitting application application_1486558679801_1820 '
         'to ResourceManager'),
    ]
    yarn_hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    yarn_hook._process_spark_submit_log(submit_output)
    yarn_hook.submit()

    # When
    yarn_hook.on_kill()

    # Then
    kill_call = call(
        ['yarn', 'application', '-kill', 'application_1486558679801_1820'],
        stderr=-1,
        stdout=-1,
    )
    self.assertIn(kill_call, mock_popen.mock_calls)
def test_resolve_connection_spark_home_not_set_connection(self):
    """Check that without a spark home the bare 'spark-submit' binary is used."""
    # Given
    no_home_hook = SparkSubmitHook(conn_id='spark_home_not_set')

    # When
    resolved = no_home_hook._resolve_connection()
    submit_cmd = no_home_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected = {"master": "yarn://yarn-master",
                "spark_binary": "spark-submit",
                "deploy_mode": None,
                "queue": None,
                "spark_home": None}
    self.assertEqual(resolved, expected)
    self.assertEqual(submit_cmd[0], 'spark-submit')
def test_resolve_connection_yarn_default(self):
    """Check that an empty conn_id resolves to the yarn defaults."""
    # Given
    fallback_hook = SparkSubmitHook(conn_id='')

    # When
    resolved = fallback_hook._resolve_connection()
    submit_cmd = fallback_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    args_by_flag = self.cmd_args_to_dict(submit_cmd)
    expected = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": 'default',
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(args_by_flag["--master"], "yarn")
def test_resolve_should_track_driver_status(self):
    """Check the driver-status tracking flag for each known connection id.

    Only 'spark_standalone_cluster' is expected to resolve to ``True``.
    """
    # Given: (connection id, expected flag) pairs, kept in a list so that hooks
    # are created, queried and asserted in this exact order.
    cases = [
        ('', False),
        ('spark_yarn_cluster', False),
        ('spark_k8s_cluster', False),
        ('spark_default_mesos', False),
        ('spark_home_set', False),
        ('spark_home_not_set', False),
        ('spark_binary_set', False),
        ('spark_binary_and_home_set', False),
        ('spark_standalone_cluster', True),
    ]
    hooks = [SparkSubmitHook(conn_id=conn_id) for conn_id, _ in cases]

    # When
    resolved_flags = [
        one_hook._resolve_should_track_driver_status() for one_hook in hooks
    ]

    # Then
    for (_, expected_flag), actual_flag in zip(cases, resolved_flags):
        self.assertEqual(actual_flag, expected_flag)
def test_build_command(self):
    """Check that every configured option shows up in the space-joined command."""
    config = self._config
    job_file = self._spark_job_file
    hook = SparkSubmitHook(**config)

    # The subprocess requires an array but we build the cmd by joining on a space
    joined = ' '.join(hook._build_command(job_file))

    # Check if the URL gets build properly and everything exists.
    assert job_file in joined

    # Check all the parameters: (command-line flag, key in the config)
    flag_and_key = [
        ('--files', 'files'),
        ('--py-files', 'py_files'),
        ('--jars', 'jars'),
        ('--total-executor-cores', 'total_executor_cores'),
        ('--executor-cores', 'executor_cores'),
        ('--executor-memory', 'executor_memory'),
        ('--keytab', 'keytab'),
        ('--principal', 'principal'),
        ('--name', 'name'),
        ('--num-executors', 'num_executors'),
        ('--class', 'java_class'),
        ('--driver-memory', 'driver_memory'),
    ]
    for flag, key in flag_and_key:
        assert "{} {}".format(flag, config[key]) in joined

    # Check if all config settings are there
    for conf_key in config['conf']:
        assert "--conf {0}={1}".format(conf_key, config['conf'][conf_key]) in joined

    # Check the application arguments are there
    for app_arg in config['application_args']:
        assert app_arg in joined

    # Check if application arguments are after the application
    job_file_pos = joined.find(job_file)
    for app_arg in config['application_args']:
        assert joined.find(app_arg) > job_file_pos

    if config['verbose']:
        assert "--verbose" in joined
def test_resolve_connection_spark_k8s_cluster_connection(self):
    """Check resolved connection, master and deploy mode for 'spark_k8s_cluster'."""
    # Given
    k8s_hook = SparkSubmitHook(conn_id='spark_k8s_cluster')

    # When
    resolved = k8s_hook._resolve_connection()
    submit_cmd = k8s_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    args_by_flag = self.cmd_args_to_dict(submit_cmd)
    expected = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "mynamespace",
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(args_by_flag["--master"], "k8s://https://k8s-master")
    self.assertEqual(args_by_flag["--deploy-mode"], "cluster")
def execute(self, context):
    """
    Create a SparkSubmitHook from this operator's settings and submit the application.

    The hook is stored on ``self._hook``; ``context`` is not used here.
    """
    hook_kwargs = dict(
        conf=self._conf,
        conn_id=self._conn_id,
        files=self._files,
        py_files=self._py_files,
        jars=self._jars,
        executor_cores=self._executor_cores,
        executor_memory=self._executor_memory,
        keytab=self._keytab,
        principal=self._principal,
        name=self._name,
        num_executors=self._num_executors,
        verbose=self._verbose,
    )
    self._hook = SparkSubmitHook(**hook_kwargs)
    self._hook.submit(self._application)
def test_resolve_connection_spark_yarn_cluster_connection(self):
    """Check resolved connection, master, queue and deploy mode for 'spark_yarn_cluster'."""
    # Given
    yarn_hook = SparkSubmitHook(conn_id='spark_yarn_cluster')

    # When
    resolved = yarn_hook._resolve_connection()
    submit_cmd = yarn_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    args_by_flag = self.cmd_args_to_dict(submit_cmd)
    expected = {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": "root.etl",
        "spark_home": None,
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(args_by_flag["--master"], "yarn://yarn-master")
    self.assertEqual(args_by_flag["--queue"], "root.etl")
    self.assertEqual(args_by_flag["--deploy-mode"], "cluster")
def _run_spark_submit(self, file, jars):
    """Submit ``file`` as an EMR spark-submit step and wait for it to complete.

    A SparkSubmitHook is used only to build the spark-submit command line; the
    command is handed to ``self.emr_cluster`` as a step, the EMR log URLs are
    registered on the task run, and the call then waits for the step.

    :param file: application file, synced through ``self.deploy`` before use
    :param jars: jars passed through ``self.deploy.arg_files``
    """
    from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

    spark_config = self.config
    deployer = self.deploy
    command_builder = SparkSubmitHook(
        conf=spark_config.conf,
        conn_id=self.emr_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deployer.arg_files(spark_config.files),
        py_files=deployer.arg_files(spark_config.py_files),
        driver_class_path=spark_config.driver_class_path,
        jars=deployer.arg_files(jars),
        packages=spark_config.packages,
        exclude_packages=spark_config.exclude_packages,
        repositories=spark_config.repositories,
        total_executor_cores=spark_config.total_executor_cores,
        executor_cores=spark_config.executor_cores,
        executor_memory=spark_config.executor_memory,
        driver_memory=spark_config.driver_memory,
        keytab=spark_config.keytab,
        principal=spark_config.principal,
        num_executors=spark_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=spark_config.verbose,
    )

    emr = self.emr_cluster
    step_id = emr.run_spark_submit_step(
        name=self.job.job_id,
        spark_submit_command=command_builder._build_spark_submit_command(
            application=deployer.sync(file)
        ),
    )

    log_urls = self.emr_cluster.get_emr_logs_dict(self.spark_application_logs)
    self.task_run.set_external_resource_urls(log_urls)

    self.emr_cluster.wait_for_step_completion(
        step_id, status_reporter=self._report_step_status
    )
def test_process_spark_driver_status_log(self):
    """Check that the driver state is read from a status-response log."""
    # Given
    standalone_hook = SparkSubmitHook(conn_id='spark_standalone_cluster')
    status_output = [
        ('Submitting a request for the status of submission '
         'driver-20171128111415-0001 in spark://spark-standalone-master:6066'),
        ('17/11/28 11:15:37 INFO RestSubmissionClient: Server responded with '
         'SubmissionStatusResponse:'),
        '{',
        '"action" : "SubmissionStatusResponse",',
        '"driverState" : "RUNNING",',
        '"serverSparkVersion" : "1.6.0",',
        '"submissionId" : "driver-20171128111415-0001",',
        '"success" : true,',
        '"workerHostPort" : "172.18.0.7:38561",',
        '"workerId" : "worker-20171128110741-172.18.0.7-38561"',
        '}',
    ]

    # When
    standalone_hook._process_spark_status_log(status_output)

    # Then
    self.assertEqual(standalone_hook._driver_status, 'RUNNING')
def test_spark_process_runcmd(self, mock_popen):
    """Check the first Popen call made by ``submit`` on a hook with an empty conn_id."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = six.StringIO('stdout')
    fake_proc.stderr = six.StringIO('stderr')
    fake_proc.wait.return_value = 0

    # When
    yarn_hook = SparkSubmitHook(conn_id='')
    yarn_hook.submit()

    # Then
    expected_argv = [
        'spark-submit', '--master', 'yarn', '--name', 'default-name', ''
    ]
    expected_call = call(
        expected_argv,
        stderr=-2,
        stdout=-1,
        universal_newlines=True,
        bufsize=-1,
    )
    self.assertEqual(mock_popen.mock_calls[0], expected_call)
def test_standalone_cluster_process_on_kill(self):
    """Check the first five tokens of the driver kill command for a standalone cluster."""
    # Given
    submit_output = [
        'Running Spark using the REST application submission protocol.',
        ('17/11/28 11:14:15 INFO RestSubmissionClient: Submitting a request '
         'to launch an application in spark://spark-standalone-master:6066'),
        ('17/11/28 11:14:15 INFO RestSubmissionClient: Submission successfully '
         'created as driver-20171128111415-0001. Polling submission state...'),
    ]
    standalone_hook = SparkSubmitHook(conn_id='spark_standalone_cluster')
    standalone_hook._process_spark_submit_log(submit_output)

    # When
    kill_cmd = standalone_hook._build_spark_driver_kill_command()

    # Then: compare position by position, in order
    expected_prefix = [
        '/path/to/spark_home/bin/spark-submit',
        '--master',
        'spark://spark-standalone-master:6066',
        '--kill',
        'driver-20171128111415-0001',
    ]
    for position, expected_token in enumerate(expected_prefix):
        self.assertEqual(kill_cmd[position], expected_token)
def test_resolve_connection_mesos_cluster_env_connection(self):
    """Check that a connection defined via an ``AIRFLOW_CONN_*`` variable is resolved.

    A generated connection name is exported as an environment variable that
    points at a mesos master; the hook must resolve that connection and put
    the master URL on the command line. The variable is removed again at the
    end so it does not leak into other tests running in the same process.
    """
    # Given
    conn_name = self.gen_conn_name(10)
    env_key = "AIRFLOW_CONN_SPARK_{}".format(conn_name.upper())
    os.environ[env_key] = "mesos://mesos-master:5050"
    try:
        hook = SparkSubmitHook(conn_id='spark_{}'.format(conn_name))

        # When
        connection = hook._resolve_connection()
        cmd = hook._build_command(self._spark_job_file)

        # Then
        dict_cmd = self.cmd_args_to_dict(cmd)
        expected_spark_connection = {
            "master": "mesos://mesos-master:5050",
            "spark_binary": "spark-submit",
            "deploy_mode": None,
            "queue": None,
            "spark_home": None
        }
        self.assertEqual(connection, expected_spark_connection)
        self.assertEqual(dict_cmd["--master"], "mesos://mesos-master:5050")
    finally:
        # Always undo the environment change, even when an assertion fails.
        os.environ.pop(env_key, None)
def test_build_command(self):
    """Check that every configured option shows up in the space-joined command."""
    settings = self._config
    application = self._spark_job_file
    hook = SparkSubmitHook(**settings)

    # The subprocess requires an array but we build the cmd by joining on a space
    command_line = ' '.join(hook._build_command(application))

    # Check if the URL gets build properly and everything exists.
    assert application in command_line

    # Check all the parameters: (command-line flag, key in the config)
    expected_options = (
        ('--files', 'files'),
        ('--py-files', 'py_files'),
        ('--jars', 'jars'),
        ('--total-executor-cores', 'total_executor_cores'),
        ('--executor-cores', 'executor_cores'),
        ('--executor-memory', 'executor_memory'),
        ('--keytab', 'keytab'),
        ('--principal', 'principal'),
        ('--name', 'name'),
        ('--num-executors', 'num_executors'),
        ('--class', 'java_class'),
        ('--driver-memory', 'driver_memory'),
    )
    for flag, key in expected_options:
        assert "{} {}".format(flag, settings[key]) in command_line

    # Check if all config settings are there
    for conf_name in settings['conf']:
        conf_arg = "--conf {0}={1}".format(conf_name, settings['conf'][conf_name])
        assert conf_arg in command_line

    # Check the application arguments are there
    for argument in settings['application_args']:
        assert argument in command_line

    # Check if application arguments are after the application
    application_pos = command_line.find(application)
    for argument in settings['application_args']:
        assert command_line.find(argument) > application_pos

    if settings['verbose']:
        assert "--verbose" in command_line
def test_SparkProcess_runcmd(self, mock_popen):
    """Check the first Popen call made by ``submit`` on a hook with an empty conn_id."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = StringIO(u'stdout')
    fake_proc.stderr = StringIO(u'stderr')
    fake_proc.returncode = 0
    fake_proc.communicate.return_value = [
        StringIO(u'stdout\nstdout'),
        StringIO(u'stderr\nstderr'),
    ]

    # When
    yarn_hook = SparkSubmitHook(conn_id='')
    yarn_hook.submit()

    # Then
    expected_argv = [
        'spark-submit', '--master', 'yarn', '--name', 'default-name', ''
    ]
    expected_call = call(expected_argv, stderr=-1, stdout=-1)
    self.assertEqual(mock_popen.mock_calls[0], expected_call)
def test_resolve_connection_spark_k8s_cluster_ns_conf(self):
    """Check that a namespace given through ``conf`` ends up in the resolved connection."""
    # Given we specify the config option directly
    spark_conf = {'spark.kubernetes.namespace': 'airflow'}
    k8s_hook = SparkSubmitHook(conn_id='spark_k8s_cluster', conf=spark_conf)

    # When
    resolved = k8s_hook._resolve_connection()
    submit_cmd = k8s_hook._build_spark_submit_command(self._spark_job_file)

    # Then
    args_by_flag = self.cmd_args_to_dict(submit_cmd)
    expected = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "airflow",
    }
    self.assertEqual(resolved, expected)
    self.assertEqual(args_by_flag["--master"], "k8s://https://k8s-master")
    self.assertEqual(args_by_flag["--deploy-mode"], "cluster")
    self.assertEqual(args_by_flag["--conf"], "spark.kubernetes.namespace=airflow")
def test_k8s_process_on_kill(self, mock_popen, mock_client_method):
    """Check that ``on_kill`` deletes the driver pod through the mocked k8s client."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = six.StringIO('stdout')
    fake_proc.stderr = six.StringIO('stderr')
    fake_proc.poll.return_value = None
    fake_proc.wait.return_value = 0
    k8s_client = mock_client_method.return_value
    k8s_hook = SparkSubmitHook(conn_id='spark_k8s_cluster')
    # All fragments are joined into ONE string, so the list has a single entry.
    watcher_output = (
        'INFO LoggingPodStatusWatcherImpl:54 - State changed, new state:'
        'pod name: spark-pi-edf2ace37be7353a958b38733a12f8e6-driver'
        'namespace: default'
        'labels: spark-app-selector -> spark-465b868ada474bda82ccb84ab2747fcd,'
        'spark-role -> driver'
        'pod uid: ba9c61f6-205f-11e8-b65f-d48564c88e42'
        'creation time: 2018-03-05T10:26:55Z'
        'service account name: spark'
        'volumes: spark-init-properties, download-jars-volume,'
        'download-files-volume, spark-token-2vmlm'
        'node name: N/A'
        'start time: N/A'
        'container images: N/A'
        'phase: Pending'
        'status: []'
        '2018-03-05 11:26:56 INFO LoggingPodStatusWatcherImpl:54 - State changed,'
        ' new state:'
        'pod name: spark-pi-edf2ace37be7353a958b38733a12f8e6-driver'
        'namespace: default'
        'Exit code: 0'
    )
    k8s_hook._process_spark_submit_log([watcher_output])
    k8s_hook.submit()

    # When
    k8s_hook.on_kill()

    # Then
    import kubernetes
    delete_kwargs = {'pretty': True,
                     'body': kubernetes.client.V1DeleteOptions()}
    k8s_client.delete_namespaced_pod.assert_called_once_with(
        'spark-pi-edf2ace37be7353a958b38733a12f8e6-driver',
        'mynamespace',
        **delete_kwargs)
def test_spark_process_on_kill(self, mock_popen):
    """Check that ``on_kill`` issues a ``yarn application -kill`` for the parsed app id."""
    # Given
    fake_proc = mock_popen.return_value
    fake_proc.stdout = six.StringIO('stdout')
    fake_proc.stderr = six.StringIO('stderr')
    fake_proc.poll.return_value = None
    fake_proc.wait.return_value = 0
    submit_output = [
        'SPARK_MAJOR_VERSION is set to 2, using Spark2',
        'WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
        'WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.',
        'INFO Client: Requesting a new application from cluster with 10 NodeManagerapplication_1486558679801_1820s',
        'INFO Client: Submitting application application_1486558679801_1820 to ResourceManager',
    ]
    yarn_hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    yarn_hook._process_log(submit_output)
    yarn_hook.submit()

    # When
    yarn_hook.on_kill()

    # Then
    kill_call = call(
        ['yarn', 'application', '-kill', 'application_1486558679801_1820'],
        stderr=-1,
        stdout=-1,
    )
    self.assertIn(kill_call, mock_popen.mock_calls)
class SparkSubmitOperator(BaseOperator):
    """
    This operator is a wrapper around SparkSubmitHook to kick off a spark-submit job.
    It requires that the "spark-submit" binary is in the PATH.

    :param application: The application that submitted as a job, either jar or py file.
    :type application: str
    :param conf: Arbitrary Spark configuration properties
    :type conf: dict
    :param conn_id: The connection id as configured in Airflow administration. When an
                    invalid connection_id is supplied, it will default to yarn.
    :type conn_id: str
    :param files: Upload additional files to the container running the job, separated by a
                  comma. For example hive-site.xml.
    :type files: str
    :param py_files: Additional python files used by the job, can be .zip, .egg or .py.
    :type py_files: str
    :param jars: Submit additional jars to upload and place them in executor classpath.
    :type jars: str
    :param executor_cores: Number of cores per executor (Default: 2)
    :type executor_cores: int
    :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G)
    :type executor_memory: str
    :param keytab: Full path to the file that contains the keytab
    :type keytab: str
    :param principal: The name of the kerberos principal used for keytab
    :type principal: str
    :param name: Name of the job (default airflow-spark)
    :type name: str
    :param num_executors: Number of executors to launch
    :type num_executors: int
    :param verbose: Whether to pass the verbose flag to spark-submit process for debugging
    :type verbose: bool
    """

    @apply_defaults
    def __init__(self,
                 application='',
                 conf=None,
                 conn_id='spark_default',
                 files=None,
                 py_files=None,
                 jars=None,
                 executor_cores=None,
                 executor_memory=None,
                 keytab=None,
                 principal=None,
                 name='airflow-spark',
                 num_executors=None,
                 verbose=False,
                 *args,
                 **kwargs):
        super(SparkSubmitOperator, self).__init__(*args, **kwargs)
        self._application = application
        self._conf = conf
        self._files = files
        self._py_files = py_files
        self._jars = jars
        self._executor_cores = executor_cores
        self._executor_memory = executor_memory
        self._keytab = keytab
        self._principal = principal
        self._name = name
        self._num_executors = num_executors
        self._verbose = verbose
        # The hook is only created in execute(); until then it stays None.
        self._hook = None
        self._conn_id = conn_id

    def execute(self, context):
        """
        Call the SparkSubmitHook to run the provided spark job
        """
        self._hook = SparkSubmitHook(
            conf=self._conf,
            conn_id=self._conn_id,
            files=self._files,
            py_files=self._py_files,
            jars=self._jars,
            executor_cores=self._executor_cores,
            executor_memory=self._executor_memory,
            keytab=self._keytab,
            principal=self._principal,
            name=self._name,
            num_executors=self._num_executors,
            verbose=self._verbose
        )
        self._hook.submit(self._application)

    def on_kill(self):
        """
        Forward the kill request to the hook, if one has been created.
        """
        # on_kill may run before execute() created the hook; without this
        # guard that case raises AttributeError on None.
        if self._hook is not None:
            self._hook.on_kill()
def test_resolve_connection(self):
    """Check the resolved connection tuple and built command for several conn_ids."""
    job_file = self._spark_job_file

    def joined_command(some_hook):
        # Space-join the built command so flags can be checked as substrings.
        return ' '.join(some_hook._build_command(job_file))

    # Default to the standard yarn connection because conn_id does not exists
    hook = SparkSubmitHook(conn_id='')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('yarn', None, None, None))
    assert "--master yarn" in joined_command(hook)

    # Default to the standard yarn connection
    hook = SparkSubmitHook(conn_id='spark_default')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('yarn', 'root.default', None, None))
    command_line = joined_command(hook)
    assert "--master yarn" in command_line
    assert "--queue root.default" in command_line

    # Connect to a mesos master
    hook = SparkSubmitHook(conn_id='spark_default_mesos')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('mesos://host:5050', None, None, None))
    command_line = joined_command(hook)
    assert "--master mesos://host:5050" in command_line

    # Set specific queue and deploy mode
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('yarn://yarn-master', 'root.etl', 'cluster', None))
    command_line = joined_command(hook)
    assert "--master yarn://yarn-master" in command_line
    assert "--queue root.etl" in command_line
    assert "--deploy-mode cluster" in command_line

    # Set the spark home
    hook = SparkSubmitHook(conn_id='spark_home_set')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('yarn://yarn-master', None, None, '/opt/myspark'))
    command_line = joined_command(hook)
    assert command_line.startswith('/opt/myspark/bin/spark-submit')

    # Spark home not set
    hook = SparkSubmitHook(conn_id='spark_home_not_set')
    resolved = hook._resolve_connection()
    self.assertEqual(resolved, ('yarn://yarn-master', None, None, None))
    command_line = joined_command(hook)
    assert command_line.startswith('spark-submit')
def test_resolve_should_track_driver_status(self):
    """Verify ``_resolve_should_track_driver_status()`` per connection id.

    Of the connections exercised here, only ``spark_standalone_cluster`` is
    expected to resolve to ``True``; every other one must resolve to ``False``.
    """
    # Given
    expectations = [
        ('', False),
        ('spark_yarn_cluster', False),
        ('spark_default_mesos', False),
        ('spark_home_set', False),
        ('spark_home_not_set', False),
        ('spark_binary_set', False),
        ('spark_binary_and_home_set', False),
        ('spark_standalone_cluster', True),
    ]
    # All hooks are created up front, before any of them is queried.
    hooks = [SparkSubmitHook(conn_id=conn_id) for conn_id, _ in expectations]

    # When
    resolved = [hook._resolve_should_track_driver_status() for hook in hooks]

    # Then
    for actual, (_, expected) in zip(resolved, expectations):
        self.assertEqual(actual, expected)
def test_resolve_connection(self):
    """Check connection resolution and the command built from it.

    Each case pairs a connection id with the tuple expected from
    ``_resolve_connection()``, the fragments that must appear in the
    space-joined ``_build_command()`` output, and (optionally) the prefix that
    command must start with. unittest assertion methods replace the bare
    ``assert`` statements so the checks are not stripped under ``python -O``
    and every failure names the connection id that caused it.
    """
    # (conn_id, expected connection tuple, required fragments, required prefix)
    cases = [
        # Default to the standard yarn connection because conn_id does not exist
        ('', ('yarn', None, None, None),
         ["--master yarn"], None),
        # Default to the standard yarn connection
        ('spark_default', ('yarn', 'root.default', None, None),
         ["--master yarn", "--queue root.default"], None),
        # Connect to a mesos master
        ('spark_default_mesos', ('mesos://host:5050', None, None, None),
         ["--master mesos://host:5050"], None),
        # Set specific queue and deploy mode
        ('spark_yarn_cluster',
         ('yarn://yarn-master', 'root.etl', 'cluster', None),
         ["--master yarn://yarn-master", "--queue root.etl",
          "--deploy-mode cluster"], None),
        # Set the spark home
        ('spark_home_set', ('yarn://yarn-master', None, None, '/opt/myspark'),
         [], '/opt/myspark/bin/spark-submit'),
        # Spark home not set
        ('spark_home_not_set', ('yarn://yarn-master', None, None, None),
         [], 'spark-submit'),
    ]

    for conn_id, expected_connection, fragments, prefix in cases:
        label = 'conn_id=%r' % conn_id
        hook = SparkSubmitHook(conn_id=conn_id)
        self.assertEqual(hook._resolve_connection(), expected_connection, label)

        cmd = ' '.join(hook._build_command(self._spark_job_file))
        for fragment in fragments:
            self.assertIn(fragment, cmd, label)
        if prefix is not None:
            self.assertTrue(cmd.startswith(prefix),
                            '%s: unexpected command %r' % (label, cmd))
class SparkSubmitOperator(BaseOperator):
    """
    This operator is a wrapper around the spark-submit binary to kick off a
    spark-submit job; the submission itself is delegated to SparkSubmitHook.
    It requires that the "spark-submit" binary is in the PATH or the
    spark-home is set in the extra on the connection.

    :param application: The application that is submitted as a job, either jar
        or py file.
    :type application: str
    :param conf: Arbitrary Spark configuration properties
    :type conf: dict
    :param conn_id: The connection id as configured in Airflow administration.
        When an invalid connection_id is supplied, it will default to yarn.
    :type conn_id: str
    :param files: Upload additional files to the executor running the job,
        separated by a comma. Files will be placed in the working directory of
        each executor. For example, serialized objects.
    :type files: str
    :param py_files: Additional python files used by the job, can be .zip,
        .egg or .py.
    :type py_files: str
    :param jars: Submit additional jars to upload and place them in executor
        classpath.
    :type jars: str
    :param driver_classpath: Additional, driver-specific, classpath settings.
    :type driver_classpath: str
    :param java_class: the main class of the Java application
    :type java_class: str
    :param packages: Comma-separated list of maven coordinates of jars to
        include on the driver and executor classpaths
    :type packages: str
    :param exclude_packages: Comma-separated list of maven coordinates of jars
        to exclude while resolving the dependencies provided in 'packages'
    :type exclude_packages: str
    :param repositories: Comma-separated list of additional remote
        repositories to search for the maven coordinates given with 'packages'
    :type repositories: str
    :param total_executor_cores: (Standalone & Mesos only) Total cores for all
        executors (Default: all the available cores on the worker)
    :type total_executor_cores: int
    :param executor_cores: (Standalone & YARN only) Number of cores per
        executor (Default: 2)
    :type executor_cores: int
    :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G)
    :type executor_memory: str
    :param driver_memory: Memory allocated to the driver (e.g. 1000M, 2G)
        (Default: 1G)
    :type driver_memory: str
    :param keytab: Full path to the file that contains the keytab
    :type keytab: str
    :param principal: The name of the kerberos principal used for keytab
    :type principal: str
    :param name: Name of the job (default airflow-spark)
    :type name: str
    :param num_executors: Number of executors to launch
    :type num_executors: int
    :param application_args: Arguments for the application being submitted
    :type application_args: list
    :param verbose: Whether to pass the verbose flag to spark-submit process
        for debugging
    :type verbose: bool
    """
    template_fields = ('_name', '_application_args', '_packages')
    ui_color = WEB_COLORS['LIGHTORANGE']

    @apply_defaults
    def __init__(self,
                 application='',
                 conf=None,
                 conn_id='spark_default',
                 files=None,
                 py_files=None,
                 driver_classpath=None,
                 jars=None,
                 java_class=None,
                 packages=None,
                 exclude_packages=None,
                 repositories=None,
                 total_executor_cores=None,
                 executor_cores=None,
                 executor_memory=None,
                 driver_memory=None,
                 keytab=None,
                 principal=None,
                 name='airflow-spark',
                 num_executors=None,
                 application_args=None,
                 verbose=False,
                 *args,
                 **kwargs):
        super(SparkSubmitOperator, self).__init__(*args, **kwargs)
        self._application = application
        self._conf = conf
        self._files = files
        self._py_files = py_files
        self._driver_classpath = driver_classpath
        self._jars = jars
        self._java_class = java_class
        self._packages = packages
        self._exclude_packages = exclude_packages
        self._repositories = repositories
        self._total_executor_cores = total_executor_cores
        self._executor_cores = executor_cores
        self._executor_memory = executor_memory
        self._driver_memory = driver_memory
        self._keytab = keytab
        self._principal = principal
        self._name = name
        self._num_executors = num_executors
        self._application_args = application_args
        self._verbose = verbose
        # The hook is only created in execute(); it stays None until then.
        self._hook = None
        self._conn_id = conn_id

    def execute(self, context):
        """
        Call the SparkSubmitHook to run the provided spark job
        """
        self._hook = SparkSubmitHook(
            conf=self._conf,
            conn_id=self._conn_id,
            files=self._files,
            py_files=self._py_files,
            driver_classpath=self._driver_classpath,
            jars=self._jars,
            java_class=self._java_class,
            packages=self._packages,
            exclude_packages=self._exclude_packages,
            repositories=self._repositories,
            total_executor_cores=self._total_executor_cores,
            executor_cores=self._executor_cores,
            executor_memory=self._executor_memory,
            driver_memory=self._driver_memory,
            keytab=self._keytab,
            principal=self._principal,
            name=self._name,
            num_executors=self._num_executors,
            application_args=self._application_args,
            verbose=self._verbose)
        self._hook.submit(self._application)

    def on_kill(self):
        """
        Forward the kill request to the hook, if one has been created.

        ``_hook`` is None until ``execute`` has run, so a kill that arrives
        earlier is a no-op instead of raising AttributeError.
        """
        if self._hook is not None:
            self._hook.on_kill()
class SparkSubmitOperator(BaseOperator):
    """
    This operator is a wrapper around the spark-submit binary to kick off a
    spark-submit job; the submission itself is delegated to SparkSubmitHook.
    It requires that the "spark-submit" binary is in the PATH or the
    spark-home is set in the extra on the connection.

    :param application: The application that is submitted as a job, either jar
        or py file.
    :type application: str
    :param conf: Arbitrary Spark configuration properties
    :type conf: dict
    :param conn_id: The connection id as configured in Airflow administration.
        When an invalid connection_id is supplied, it will default to yarn.
    :type conn_id: str
    :param files: Upload additional files to the container running the job,
        separated by a comma. For example hive-site.xml.
    :type files: str
    :param py_files: Additional python files used by the job, can be .zip,
        .egg or .py.
    :type py_files: str
    :param jars: Submit additional jars to upload and place them in executor
        classpath.
    :type jars: str
    :param driver_classpath: Additional, driver-specific, classpath settings.
    :type driver_classpath: str
    :param java_class: the main class of the Java application
    :type java_class: str
    :param packages: Comma-separated list of maven coordinates of jars to
        include on the driver and executor classpaths
    :type packages: str
    :param exclude_packages: Comma-separated list of maven coordinates of jars
        to exclude while resolving the dependencies provided in 'packages'
    :type exclude_packages: str
    :param repositories: Comma-separated list of additional remote
        repositories to search for the maven coordinates given with 'packages'
    :type repositories: str
    :param total_executor_cores: (Standalone & Mesos only) Total cores for all
        executors (Default: all the available cores on the worker)
    :type total_executor_cores: int
    :param executor_cores: (Standalone & YARN only) Number of cores per
        executor (Default: 2)
    :type executor_cores: int
    :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G)
    :type executor_memory: str
    :param driver_memory: Memory allocated to the driver (e.g. 1000M, 2G)
        (Default: 1G)
    :type driver_memory: str
    :param keytab: Full path to the file that contains the keytab
    :type keytab: str
    :param principal: The name of the kerberos principal used for keytab
    :type principal: str
    :param name: Name of the job (default airflow-spark)
    :type name: str
    :param num_executors: Number of executors to launch
    :type num_executors: int
    :param application_args: Arguments for the application being submitted
    :type application_args: list
    :param verbose: Whether to pass the verbose flag to spark-submit process
        for debugging
    :type verbose: bool
    """
    template_fields = ('_name', '_application_args', '_packages')
    ui_color = WEB_COLORS['LIGHTORANGE']

    @apply_defaults
    def __init__(self,
                 application='',
                 conf=None,
                 conn_id='spark_default',
                 files=None,
                 py_files=None,
                 driver_classpath=None,
                 jars=None,
                 java_class=None,
                 packages=None,
                 exclude_packages=None,
                 repositories=None,
                 total_executor_cores=None,
                 executor_cores=None,
                 executor_memory=None,
                 driver_memory=None,
                 keytab=None,
                 principal=None,
                 name='airflow-spark',
                 num_executors=None,
                 application_args=None,
                 verbose=False,
                 *args,
                 **kwargs):
        super(SparkSubmitOperator, self).__init__(*args, **kwargs)
        self._application = application
        self._conf = conf
        self._files = files
        self._py_files = py_files
        self._driver_classpath = driver_classpath
        self._jars = jars
        self._java_class = java_class
        self._packages = packages
        self._exclude_packages = exclude_packages
        self._repositories = repositories
        self._total_executor_cores = total_executor_cores
        self._executor_cores = executor_cores
        self._executor_memory = executor_memory
        self._driver_memory = driver_memory
        self._keytab = keytab
        self._principal = principal
        self._name = name
        self._num_executors = num_executors
        self._application_args = application_args
        self._verbose = verbose
        # Populated by execute(); None means no job has been submitted yet.
        self._hook = None
        self._conn_id = conn_id

    def execute(self, context):
        """
        Call the SparkSubmitHook to run the provided spark job
        """
        self._hook = SparkSubmitHook(
            conf=self._conf,
            conn_id=self._conn_id,
            files=self._files,
            py_files=self._py_files,
            driver_classpath=self._driver_classpath,
            jars=self._jars,
            java_class=self._java_class,
            packages=self._packages,
            exclude_packages=self._exclude_packages,
            repositories=self._repositories,
            total_executor_cores=self._total_executor_cores,
            executor_cores=self._executor_cores,
            executor_memory=self._executor_memory,
            driver_memory=self._driver_memory,
            keytab=self._keytab,
            principal=self._principal,
            name=self._name,
            num_executors=self._num_executors,
            application_args=self._application_args,
            verbose=self._verbose
        )
        self._hook.submit(self._application)

    def on_kill(self):
        """
        Propagate a kill request to the running hook.

        Nothing is done when ``execute`` has not created the hook yet, which
        avoids an AttributeError on the initial ``None`` value.
        """
        if self._hook is not None:
            self._hook.on_kill()
def _run_spark_submit(self, application, jars):
    """Submit ``application`` through a ``SparkSubmitHook`` and surface failures.

    The hook is built from ``self.config``, ``self.task`` and ``self.deploy``.
    When Airflow is enabled the Airflow hook is used; otherwise the vendored
    hook is used and its connection is set from the local Spark engine config.
    The submit log is captured into an in-memory buffer so that error snippets
    can be parsed out of it if the submission fails.

    :param application: application file to submit; passed through
        ``deploy.sync`` before submission.
    :param jars: jars for the job; passed through ``deploy.arg_files``.
    :raises: the error built by ``failed_to_run_spark_script`` when the return
        code extracted from the ``SparkException`` is not ``"0"``, otherwise
        the error built by ``failed_spark_status``.
    """
    spark_local_config = SparkLocalEngineConfig()
    _config = self.config
    deploy = self.deploy

    airflow_enabled = is_airflow_enabled()

    if airflow_enabled:
        from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook
        from airflow.exceptions import AirflowException as SparkException
    else:
        from dbnd_spark._vendor.airflow.spark_hook import (
            SparkException,
            SparkSubmitHook,
        )

    spark = SparkSubmitHook(
        conf=_config.conf,
        conn_id=spark_local_config.conn_id,
        name=self.job.job_id,
        application_args=list_of_strings(self.task.application_args()),
        java_class=self.task.main_class,
        files=deploy.arg_files(_config.files),
        py_files=deploy.arg_files(self.task.get_py_files()),
        driver_class_path=_config.driver_class_path,
        jars=deploy.arg_files(jars),
        packages=_config.packages,
        exclude_packages=_config.exclude_packages,
        repositories=_config.repositories,
        total_executor_cores=_config.total_executor_cores,
        executor_cores=_config.executor_cores,
        executor_memory=_config.executor_memory,
        driver_memory=_config.driver_memory,
        keytab=_config.keytab,
        principal=_config.principal,
        num_executors=_config.num_executors,
        env_vars=self._get_env_vars(),
        verbose=_config.verbose,
    )

    if not airflow_enabled:
        # If there's no Airflow then there's no Connection so we
        # take conn information from spark config
        spark.set_connection(spark_local_config.conn_uri)

    log_buffer = StringIO()
    # The try block stays inside the `with`: the buffer must still be open
    # when its contents are read in the except branch.
    with log_buffer as lb:
        dbnd_log_handler = self._capture_submit_log(spark, lb)
        try:
            # sync the application file to remote if needed
            spark.submit(application=deploy.sync(application))
        except SparkException as ex:
            return_code = self._get_spark_return_code_from_exception(ex)
            if return_code != "0":
                error_snippets = parse_spark_log_safe(
                    log_buffer.getvalue().split(os.linesep)
                )
                raise failed_to_run_spark_script(
                    self,
                    spark._build_spark_submit_command(application=application),
                    application,
                    return_code,
                    error_snippets,
                )
            else:
                raise failed_spark_status(ex)
        finally:
            # Detach only the capture handler. The previous filter
            # (`if not dbnd_log_handler`) never looked at `h`, so it dropped
            # every handler on the hook's logger whenever the capture handler
            # was truthy.
            spark.log.handlers = [
                h for h in spark.log.handlers if h is not dbnd_log_handler
            ]