def test_resolve_connection_spark_home_not_set_connection(self):
    """With no spark_home on the connection, the binary resolves to a bare 'spark-submit'."""
    # Given a connection that sets a master but no spark_home
    hook = SparkSubmitHook(conn_id='spark_home_not_set')

    # When the connection is resolved and the submit command is built
    resolved = hook._resolve_connection()
    submit_cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then the resolved fields match and the command starts with the plain binary
    assert resolved == {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": None,
    }
    assert submit_cmd[0] == 'spark-submit'
def test_resolve_connection_spark_k8s_cluster_connection(self):
    """A k8s connection resolves master, deploy mode and namespace into the command."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster')

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "mynamespace",
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    assert dict_cmd["--master"] == "k8s://https://k8s-master"
    assert dict_cmd["--deploy-mode"] == "cluster"
def test_resolve_connection_yarn_default_connection(self):
    """The default connection resolves to a yarn master with the root.default queue."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_default')

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": "root.default",
        "spark_home": None,
        "namespace": None,
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    assert dict_cmd["--master"] == "yarn"
    assert dict_cmd["--queue"] == "root.default"
def test_resolve_connection_spark_standalone_cluster_connection(self):
    """A standalone-cluster connection resolves spark_home into the binary path."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_standalone_cluster')

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected_spark_connection = {
        "master": "spark://spark-standalone-master:6066",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": None,
        "spark_home": "/path/to/spark_home",
        "namespace": None,
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    # spark_home is prepended to the binary when set on the connection
    assert cmd[0] == '/path/to/spark_home/bin/spark-submit'
def test_resolve_connection_spark_binary_default_value(self):
    """When no spark_binary is set anywhere, it defaults to 'spark-submit'."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_default')

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected_spark_connection = {
        "master": "yarn",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": 'root.default',
        "spark_home": None,
        "namespace": None,
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    assert cmd[0] == 'spark-submit'
def test_resolve_connection_spark_home_set_connection(self):
    """A connection with spark_home set resolves the binary under that home's bin/."""
    # Given
    hook = SparkSubmitHook(conn_id='spark_home_set')

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    expected_spark_connection = {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": "/opt/myspark",
        "namespace": None,
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    assert cmd[0] == '/opt/myspark/bin/spark-submit'
def test_resolve_connection_spark_binary_default_value_override(self):
    """An explicit spark_binary argument overrides the connection's default binary."""
    # Given a hook constructed with an explicit spark_binary override
    hook = SparkSubmitHook(conn_id='spark_binary_set', spark_binary='another-custom-spark-submit')

    # When resolving the connection and building the command
    resolved = hook._resolve_connection()
    submit_cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then the override wins both in the resolved dict and as the command binary
    assert resolved == {
        "master": "yarn",
        "spark_binary": "another-custom-spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": None,
    }
    assert submit_cmd[0] == 'another-custom-spark-submit'
def test_resolve_connection_mesos_default_connection(self):
    """A mesos connection resolves its master URL into the --master argument."""
    # Given a mesos-backed connection
    hook = SparkSubmitHook(conn_id='spark_default_mesos')

    # When resolving the connection and building the command
    resolved = hook._resolve_connection()
    submit_cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then the master is carried through to the command arguments
    cmd_args = self.cmd_args_to_dict(submit_cmd)
    assert resolved == {
        "master": "mesos://host:5050",
        "spark_binary": "spark-submit",
        "deploy_mode": None,
        "queue": None,
        "spark_home": None,
        "namespace": None,
    }
    assert cmd_args["--master"] == "mesos://host:5050"
def test_resolve_connection_spark_yarn_cluster_connection(self):
    """A yarn-cluster connection resolves master, queue and deploy mode into the command."""
    # Given a yarn cluster-mode connection
    hook = SparkSubmitHook(conn_id='spark_yarn_cluster')

    # When resolving the connection and building the command
    resolved = hook._resolve_connection()
    submit_cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then every resolved field shows up in the generated arguments
    cmd_args = self.cmd_args_to_dict(submit_cmd)
    assert resolved == {
        "master": "yarn://yarn-master",
        "spark_binary": "spark-submit",
        "deploy_mode": "cluster",
        "queue": "root.etl",
        "spark_home": None,
        "namespace": None,
    }
    assert cmd_args["--master"] == "yarn://yarn-master"
    assert cmd_args["--queue"] == "root.etl"
    assert cmd_args["--deploy-mode"] == "cluster"
def test_resolve_connection_spark_k8s_cluster_ns_conf(self):
    """An explicit spark.kubernetes.namespace conf overrides the connection's namespace."""
    # Given we specify the config option directly
    conf = {
        'spark.kubernetes.namespace': 'airflow',
    }
    hook = SparkSubmitHook(conn_id='spark_k8s_cluster', conf=conf)

    # When
    connection = hook._resolve_connection()
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then
    dict_cmd = self.cmd_args_to_dict(cmd)
    expected_spark_connection = {
        "spark_home": "/opt/spark",
        "queue": None,
        "spark_binary": "spark-submit",
        "master": "k8s://https://k8s-master",
        "deploy_mode": "cluster",
        "namespace": "airflow",
    }
    # Plain asserts keep this file consistent with the pytest-style sibling
    # tests and give pytest's richer assertion introspection on failure.
    assert connection == expected_spark_connection
    assert dict_cmd["--master"] == "k8s://https://k8s-master"
    assert dict_cmd["--deploy-mode"] == "cluster"
    assert dict_cmd["--conf"] == "spark.kubernetes.namespace=airflow"
def test_build_spark_submit_command(self):
    """The full fixture config is rendered into the expected spark-submit argv."""
    # Given
    hook = SparkSubmitHook(**self._config)

    # When
    cmd = hook._build_spark_submit_command(self._spark_job_file)

    # Then: every configured option appears, in order, followed by the
    # application file and its pass-through arguments.
    expected_build_cmd = [
        'spark-submit',
        '--master', 'yarn',
        '--conf', 'parquet.compression=SNAPPY',
        '--files', 'hive-site.xml',
        '--py-files', 'sample_library.py',
        '--archives', 'sample_archive.zip#SAMPLE',
        '--jars', 'parquet.jar',
        '--packages', 'com.databricks:spark-avro_2.11:3.2.0',
        '--exclude-packages', 'org.bad.dependency:1.0.0',
        '--repositories', 'http://myrepo.org',
        '--num-executors', '10',
        '--total-executor-cores', '4',
        '--executor-cores', '4',
        '--executor-memory', '22g',
        '--driver-memory', '3g',
        '--keytab', 'privileged_user.keytab',
        '--principal', 'user/[email protected]',
        '--proxy-user', 'sample_user',
        '--name', 'spark-job',
        '--class', 'com.foo.bar.AppMain',
        '--verbose',
        'test_application.py',
        '-f', 'foo',
        '--bar', 'bar',
        '--with-spaces', 'args should keep embdedded spaces',
        'baz',
    ]
    # Plain assert keeps this file consistent with the pytest-style sibling
    # tests and gives pytest's element-by-element list diff on failure.
    assert expected_build_cmd == cmd