def test_unique_task_ids():
    """Check that two independent Spark applications never share a task id.

    Submits two SparkPi jobs, waits for both via ``wait_for_jobs_completion``,
    lists the completed tasks through the CLI, keeps the tasks whose
    ``framework_id`` contains either driver id, and asserts that the
    collected task ids contain no duplicates.
    """
    log.info('Submitting two sample Spark Applications')
    submit_args = [
        "--conf spark.cores.max=1",
        "--class org.apache.spark.examples.SparkPi"
    ]

    driver_id_1 = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                   app_args="100",
                                   args=submit_args)
    driver_id_2 = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                   app_args="100",
                                   args=submit_args)

    log.info(
        'Two Spark Applications submitted. Driver 1 ID: %s, Driver 2 ID: %s'
        % (driver_id_1, driver_id_2))
    log.info('Waiting for completion. Polling state')
    completed = wait_for_jobs_completion(driver_id_1, driver_id_2)

    # Truthiness check instead of the non-idiomatic `== True` comparison.
    assert completed, \
        'Sample Spark Applications failed to successfully complete within given time'

    out = sdk_cmd.run_cli("task --completed --json")
    data = json.loads(out)

    log.info(
        'Collecting tasks that belong to the drivers created in this test')
    driver_ids = (driver_id_1, driver_id_2)
    task_ids = [
        task['id'] for task in data
        if any(driver_id in task['framework_id'] for driver_id in driver_ids)
    ]

    log.info('Tasks found: %s' % (' '.join(task_ids)))
    assert len(task_ids) == len(set(task_ids)), \
        'Task ids for two independent Spark Applications contain duplicates'
def submit_job(dispatcher):
    """Submit one Monte Carlo job to the given dispatcher.

    ``dispatcher`` is unpacked into three items: dispatcher name, dispatcher
    role and driver role. The dispatcher role is not used here. The job is
    submitted under the app name ``/<dispatcher name>`` with the driver role
    passed as ``spark.mesos.role``.
    """
    dispatcher_name, _dispatcher_role, driver_role = dispatcher

    spark_settings = [
        "spark.cores.max=1",
        "spark.mesos.containerizer=mesos",
        "spark.mesos.role={}".format(driver_role),
        # use Hector's image
        "spark.mesos.executor.docker.image=mesosphere/spark-dev:931ca56273af913d103718376e2fbc04be7cbde0",
        # setting to allow up to 32 drivers on same node
        "spark.port.maxRetries=32",
        # Currently disabled: "spark.mesos.driverEnv.SPARK_USER=root"
        # (run as root on centos)
    ]

    # Each setting is passed as its own "--conf <setting>" pair.
    args = []
    for setting in spark_settings:
        args += ["--conf", setting]

    utils.submit_job(
        app_name="/{}".format(dispatcher_name),
        app_url=MONTE_CARLO_APP_URL,
        app_args="100000 300",
        verbose=False,
        args=args)
def submit_job(app_url: str, app_args: str, dispatcher: typing.Dict, duration: int, config: typing.List[str]):
    """Submit a job to the dispatcher described by ``dispatcher``.

    Args:
        app_url: passed through to ``spark_utils.submit_job``.
        app_args: passed through to ``spark_utils.submit_job``.
        dispatcher: mapping read at ``["service"]["name"]``,
            ``["roles"]["executors"]`` and, in strict mode,
            ``["service"]["user"]`` and ``["service"]["service_account"]``.
        duration: only used in the log message (logged as minutes).
        config: passed as ``args`` to ``spark_utils.submit_job``.
    """
    service = dispatcher["service"]
    dispatcher_name = service["name"]
    log.info("Submitting job to dispatcher: %s, with duration: %s min.", dispatcher_name, duration)

    executors_role = dispatcher["roles"]["executors"]

    # User and principal are only supplied when the cluster is in strict mode.
    spark_user = service["user"] if sdk_utils.is_strict_mode() else None
    principal = service["service_account"] if sdk_utils.is_strict_mode() else None

    spark_utils.submit_job(
        service_name=dispatcher_name,
        app_url=app_url,
        app_args=app_args,
        verbose=False,
        args=config,
        driver_role=executors_role,
        spark_user=spark_user,
        principal=principal)
def _submit_consumer(broker_dns, common_conf, topic, spark_app_name, driver_role, num_words):
    """Submit the ``KafkaConsumer`` Spark job.

    The application arguments are the space-joined broker DNS, topic, word
    count and kerberos flag. ``kerberos_flag`` and ``jar_url`` are not
    parameters of this function; they are resolved from the enclosing scope.
    """
    consumer_args = " ".join((broker_dns, topic, num_words, kerberos_flag))

    class_and_cores = ["--conf", "spark.cores.max=4", "--class", "KafkaConsumer"]
    consumer_config = class_and_cores + common_conf

    spark_utils.submit_job(
        app_url=jar_url,
        app_args=consumer_args,
        app_name=spark_app_name,
        args=consumer_config,
        driver_role=driver_role,
        verbose=False)
def test_disconnect_from_master():
    """Check that a driver survives a temporary disconnect from the master.

    Uploads and submits the long-running Python job, waits for its
    executors, blocks the driver's connection to the Mesos master, waits for
    the connection to time out, restores the connection and finally checks
    that the job output reports success.
    """
    python_script_path = os.path.join(THIS_DIR, 'jobs', 'python', 'long_running.py')
    python_script_url = utils.upload_file(python_script_path)
    task_id = utils.submit_job(
        python_script_url,
        "{} {}".format(LONG_RUNNING_FW_NUM_TASKS, LONG_RUNNING_RUN_TIME_SEC),
        ["--conf", "spark.mesos.driver.failoverTimeout=1800",
         "--conf", "spark.cores.max=1"])

    # Wait until executor is running
    utils.wait_for_executors_running(LONG_RUNNING_FW_NAME, LONG_RUNNING_FW_NUM_TASKS)

    # Block the driver's connection to Mesos master
    framework_info = shakedown.get_service(LONG_RUNNING_FW_NAME)
    (driver_host, port) = _parse_fw_pid_host_port(framework_info["pid"])
    _block_master_connection(driver_host, port)

    # The connection will timeout after 15 minutes of inactivity.
    # Add 5 minutes to make sure the master has detected the disconnection.
    # The framework will be considered disconnected => failover_timeout kicks in.
    wait_seconds = MASTER_CONNECTION_TIMEOUT_SEC + 5 * 60
    try:
        # Log the duration that is actually slept, including the extra margin.
        LOGGER.info(
            "Waiting {} seconds for connection with master to timeout...".format(
                wait_seconds))
        time.sleep(wait_seconds)
    finally:
        # Restore the connection even if the wait is interrupted, so the host
        # is not left blocked. The driver should reconnect.
        _unblock_master_connection(driver_host)

    # The executor and driver should finish.
    utils.check_job_output(task_id, "Job completed successfully")
def test_cni_labels():
    """Check that the CNI network name and labels reach driver and executor.

    Submits SparkPi with ``spark.mesos.network.name`` and
    ``spark.mesos.network.labels`` set, waits for the executors, runs
    ``_check_task_network_info`` on the driver task and on the first executor
    task, then checks the job output.
    """
    submit_args = [
        "--conf spark.mesos.network.name=dcos",
        "--conf spark.mesos.network.labels=key1:val1,key2:val2",
        "--conf spark.cores.max={}".format(CNI_TEST_NUM_EXECUTORS),
        "--class org.apache.spark.examples.SparkPi",
    ]

    # "3000" is long enough to examine the Driver's & Executor's task infos
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="3000",
        args=submit_args)

    # Wait until executors are running
    sdk_tasks.check_running(SPARK_PI_FW_NAME, CNI_TEST_NUM_EXECUTORS, timeout_seconds=600)

    # Check for network name / labels in Driver task info
    _check_task_network_info(shakedown.get_task(driver_task_id, completed=False))

    # Check for network name / labels in Executor task info
    first_executor = shakedown.get_service_tasks(SPARK_PI_FW_NAME)[0]
    _check_task_network_info(first_executor)

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
def _submit_gpu_app(num_executors, executor_gpus, gpus_max, app_name=None):
    """
    Helper function to submit a gpu app.

    Builds the spark-submit arguments for ``GpuPiApp``, appending
    ``spark.mesos.executor.gpus`` only when ``executor_gpus`` is not None,
    and appending ``app_name`` to the application arguments only when it is
    not None. Returns the value returned by ``spark_utils.submit_job``.
    """
    submit_args = [
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=240s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", "spark.executor.memory=2g",
        "--conf", "spark.mesos.gpus.max={}".format(gpus_max),
        "--conf", "spark.executor.cores=1",
        "--conf", "spark.mesos.containerizer=mesos",
        # Run as root on centos
        "--conf", "spark.mesos.driverEnv.SPARK_USER=root",
        "--class", "GpuPiApp",
    ]
    if executor_gpus is not None:
        submit_args.append("--conf")
        submit_args.append("spark.mesos.executor.gpus={}".format(executor_gpus))

    # Long enough to examine the Executor's task info
    app_arg_parts = ["{} 1000000".format(num_executors)]
    if app_name is not None:
        app_arg_parts.append("{}".format(app_name))
    app_args = " ".join(app_arg_parts)

    return spark_utils.submit_job(
        app_url=spark_utils.scala_test_jar_url(),
        app_args=app_args,
        args=submit_args)
def test_driver_metrics(use_overlay):
    """Check that ``jvm.heap.used`` shows up for the driver and an executor.

    Submits ``MockTaskRunner`` (adding the overlay-network settings when
    ``use_overlay`` is truthy), waits for the metric on the driver task, then
    waits for a running executor and for the same metric on the first
    executor task id.
    """

    # Retried every 5000 ms until the result is truthy, for at most 600 s
    # (per the retrying decorator arguments).
    @retrying.retry(wait_fixed=5000, stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_for_metric(task_id, expected_metric_name):
        stdout = sdk_cmd.run_cli("task metrics details {}".format(task_id))
        found = expected_metric_name in stdout
        log.info('Checking for {} in STDOUT:\n{}\nResult: {}'.format(
            expected_metric_name, stdout, found))
        return found

    app_name = "MockTaskRunner"
    expected_metric = "jvm.heap.used"

    submit_args = [
        "--conf spark.cores.max=1",
        "--conf spark.mesos.containerizer=mesos",
        "--class {}".format(app_name),
    ]
    if use_overlay:
        submit_args.extend([
            "--conf spark.mesos.network.name=dcos",
            "--conf spark.mesos.driverEnv.VIRTUAL_NETWORK_ENABLED=true",
            "--conf spark.executorEnv.VIRTUAL_NETWORK_ENABLED=true",
        ])

    driver_id = utils.submit_job(
        app_url=utils.dcos_test_jar_url(),
        app_args="1 300",
        args=submit_args)
    wait_for_metric(driver_id, expected_metric)

    sdk_tasks.check_running(app_name, 1, timeout_seconds=600)
    executor_ids = shakedown.get_service_task_ids(app_name)
    wait_for_metric(executor_ids[0], expected_metric)
def _submit_producer(broker_dns, common_conf, topic, spark_app_name, driver_role):
    """Submit the ``KafkaFeeder`` Spark job.

    The application arguments are the space-joined broker DNS, the sandbox
    path of ``big.txt``, the topic and the kerberos flag. ``kerberos_flag``
    and ``jar_url`` are not parameters of this function; they are resolved
    from the enclosing scope.
    """
    sandbox_file = "file:///mnt/mesos/sandbox/big.txt"
    producer_args = " ".join((broker_dns, sandbox_file, topic, kerberos_flag))

    feeder_settings = [
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder",
    ]
    producer_config = feeder_settings + common_conf

    spark_utils.submit_job(
        app_url=jar_url,
        app_args=producer_args,
        app_name=spark_app_name,
        args=producer_config,
        driver_role=driver_role,
        verbose=False)
def _submit_consumer(name, spark_executor_docker_image, jar, kafka_broker_dns, cassandra_native_client_dns, dispatcher, kafka_topics, kafka_group_id, write_to_cassandra, batch_size_seconds, cassandra_keyspace, cassandra_table, spark_cores_max, spark_executor_cores, must_fail: bool):
    """Submit the supervised Kafka-to-Cassandra consumer job.

    Application arguments are built from the Kafka and Cassandra settings,
    with optional flags for ``must_fail`` and for disabling Cassandra writes.
    Cassandra hosts are the part before ``:`` of every entry of
    ``cassandra_native_client_dns``; the port is the part after ``:`` of the
    first entry. The job is submitted to the dispatcher named in
    ``dispatcher['service']['name']``; user and principal are only passed in
    strict mode.

    Returns the value returned by ``spark_utils.submit_job``.
    """
    app_args = [
        "--appName", name,
        "--brokers", ",".join(kafka_broker_dns),
        "--topics", kafka_topics,
        "--groupId", kafka_group_id,
        "--batchSizeSeconds", str(batch_size_seconds),
        "--cassandraKeyspace", cassandra_keyspace,
        "--cassandraTable", cassandra_table,
    ]
    if must_fail:
        app_args.append("--mustFailDueToInvalidArgument")
    if not write_to_cassandra:
        app_args.append("--shouldNotWriteToCassandra")

    # Entries are split on ':' into host and port.
    cassandra_hosts = [endpoint.split(':')[0] for endpoint in cassandra_native_client_dns]
    cassandra_port = cassandra_native_client_dns[0].split(':')[1]

    app_config = [
        "--supervise",
        "--conf", "spark.cores.max={}".format(spark_cores_max),
        "--conf", "spark.executor.cores={}".format(spark_executor_cores),
        "--conf", "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)),
        "--conf", "spark.cassandra.connection.port={}".format(cassandra_port),
        "--name", name,
        "--class", CONSUMER_CLASS_NAME,
    ]
    if spark_executor_docker_image:
        app_config += [
            "--conf",
            "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image),
        ]

    submit_args = app_config + COMMON_CONF
    joined_app_args = " ".join(str(arg) for arg in app_args)

    service = dispatcher['service']
    return spark_utils.submit_job(
        app_url=jar,
        app_args=joined_app_args,
        args=submit_args,
        verbose=False,
        service_name=service['name'],
        driver_role=dispatcher['roles']['executors'],
        spark_user=service['user'] if sdk_utils.is_strict_mode() else None,
        principal=service['service_account'] if sdk_utils.is_strict_mode() else None)
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_service_name, jaas_uri=None):
    """Run the Kafka producer/consumer pipeline and check the consumer output.

    Args:
        kerberos_flag: the string "true" enables the Kerberos settings; it is
            also passed verbatim to both applications.
        stop_count: converted to ``str`` and passed to the consumer; the
            expected output is "Read <stop_count> words".
        jar_uri: application jar used for both producer and consumer.
        keytab_secret: passed to ``get_kerberized_kafka_spark_conf``.
        spark_service_name: Spark service the jobs are submitted to.
        jaas_uri: optional JAAS config URI added to ``spark.mesos.uris``.
            When it is None and the run is kerberized, ``upload_jaas()``
            supplies it.

    The producer driver is always killed at the end, even if the consumer
    run fails.
    """
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME,
                                 'endpoints broker', json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)
    if kerberized and jaas_uri is None:
        jaas_uri = upload_jaas()
    # Only append a JAAS URI when there is one; previously a non-kerberized
    # run without jaas_uri appended the literal string "None" to the URI list.
    if jaas_uri is not None:
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = get_kerberized_kafka_spark_conf(spark_service_name, keytab_secret)

    producer_config = ["--conf", "spark.cores.max=2",
                       "--conf", "spark.executor.cores=1",
                       "--class", "KafkaFeeder"] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = ["--conf", "spark.cores.max=2",
                       "--conf", "spark.executor.cores=1",
                       "--class", "KafkaConsumer"] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        service_name=spark_service_name,
                        args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_service_name)
def test_mesos_label_support():
    """Check that ``spark.mesos.driver.labels`` ends up in the driver's task info.

    Submits SparkPi with the label ``foo:bar`` and asserts that
    ``{'key': 'foo', 'value': 'bar'}`` is among the driver task's labels.
    """
    submit_args = [
        "--conf spark.cores.max=1",
        # pass a test label
        "--conf spark.mesos.driver.labels=foo:bar",
        "--class org.apache.spark.examples.SparkPi",
    ]
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="150",
        args=submit_args)

    task_labels = sdk_cmd._get_task_info(driver_task_id)['labels']
    assert {'key': 'foo', 'value': 'bar'} in task_labels
def feed_sample_data(jar_uri, kafka_brokers, topic, common_args, messages):
    """Send ``messages`` to Kafka via ``KerberizedKafkaProducer`` and verify it.

    The application arguments are "kafka", the brokers, the topic and the
    space-joined messages. After submission the job output is checked for
    "<len(messages)> messages sent to Kafka".
    """
    submit_args = ["--class", "KerberizedKafkaProducer"] + common_args
    joined_messages = ' '.join(messages)
    app_args = "{} {} {} {}".format("kafka", kafka_brokers, topic, joined_messages)

    producer_id = utils.submit_job(
        app_url=jar_uri,
        app_args=app_args,
        service_name=utils.SPARK_SERVICE_NAME,
        args=submit_args)

    # validating producer output
    expected_output = "{} messages sent to Kafka".format(len(messages))
    utils.check_job_output(producer_id, expected_output)
def test_supervise_conflict_frameworkid():
    """Check that a supervised job restarts under a new framework id.

    Submits a supervised ``MockTaskRunner``, waits for it to register and run,
    kills the driver process on its agent, waits for the service to disappear
    and come back, and asserts that the framework id changed. The driver is
    always killed at the end and the kill must report success.
    """
    job_service_name = "MockTaskRunner"

    # Retried every 1000 ms until the result is truthy, for at most 600 s
    # (per the retrying decorator arguments).
    @retrying.retry(wait_fixed=1000, stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        return (svc is not None) if present else (svc is None)

    # NOTE(review): "spark.executors.cores" looks like a typo for
    # "spark.executor.cores" — confirm before changing, it alters the job.
    job_args = ["--supervise",
                "--class", "MockTaskRunner",
                "--conf", "spark.cores.max=1",
                "--conf", "spark.executors.cores=1"]

    # Submit outside the try block: if the submission itself fails there is
    # no driver to clean up, and the cleanup below would otherwise fail on an
    # unbound `driver_id` and hide the real error.
    driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                 app_args="1 1800",
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=job_args)
    try:
        log.info("Started supervised driver {}".format(driver_id))

        wait_job_present(True)
        log.info("Job has registered")

        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(
            service_info['id'])
        sdk_cmd.kill_task_with_pattern(driver_regex, service_info['hostname'])

        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info['id'], \
            "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)
def _launch_test_task(app_name):
    """Submit the class ``app_name`` with one executor and wait until it runs.

    Returns the value returned by ``utils.submit_job``.
    """
    log.info('Submitting a Spark Applications with 1 executor')

    submit_args = [
        "--conf spark.cores.max=1",
        "--conf spark.executor.cores=1",
        "--conf spark.mesos.containerizer=mesos",
        "--conf spark.mesos.rejectOfferDuration=1s",
        f"--conf spark.mesos.executor.docker.image={utils.SPARK_DOCKER_IMAGE}",
        f"--class {app_name}",
    ]
    task_id = utils.submit_job(
        app_url=utils.dcos_test_jar_url(),
        app_args="1 5",
        args=submit_args)

    sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
    return task_id
def test_task_not_lost():
    """Check that the first executor task of a SparkPi job is not lost.

    Submits SparkPi, waits for one running task, asserts that the first task
    id returned for the framework ends with ``-0``, then checks the job
    output.
    """
    submit_args = [
        "--conf spark.cores.max=1",
        "--class org.apache.spark.examples.SparkPi",
    ]
    # "1500" is long enough to examine the Executor's task info
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="1500",
        args=submit_args)

    # Wait until executor is running
    sdk_tasks.check_running(SPARK_PI_FW_NAME, 1, timeout_seconds=600)

    # Check Executor task ID - should end with 0, the first task.
    # If it's > 0, that means the first task was lost.
    first_task_id = sdk_tasks.get_task_ids(SPARK_PI_FW_NAME, '')[0]
    assert first_task_id.endswith('-0')

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
def _submit_producer(name, spark_executor_docker_image, jar, kafka_broker_dns, dispatcher, kafka_topics, number_of_words, words_per_second, spark_cores_max, spark_executor_cores, must_fail: bool):
    """Submit the Kafka producer job to the given dispatcher.

    Application arguments are built from the Kafka settings and word rates,
    with an optional flag when ``must_fail`` is truthy. ``--supervise`` is
    added only when ``number_of_words == 0``. User and principal are only
    passed in strict mode.

    Returns the value returned by ``spark_utils.submit_job``.
    """
    app_args = [
        "--appName", name,
        "--brokers", ",".join(kafka_broker_dns),
        "--topics", kafka_topics,
        "--numberOfWords", str(number_of_words),
        "--wordsPerSecond", str(words_per_second),
    ]
    if must_fail:
        app_args.append("--mustFailDueToInvalidArgument")

    app_config = [
        "--conf", "spark.cores.max={}".format(spark_cores_max),
        "--conf", "spark.executor.cores={}".format(spark_executor_cores),
        "--name", name,
        "--class", PRODUCER_CLASS_NAME,
    ]

    # `number_of_words == 0` means infinite stream, so we'd like to have it
    # restarted in the case of failures.
    if number_of_words == 0:
        app_config.append("--supervise")

    if spark_executor_docker_image:
        app_config += [
            "--conf",
            "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image),
        ]

    submit_args = app_config + COMMON_CONF
    joined_app_args = " ".join(str(arg) for arg in app_args)

    service = dispatcher['service']
    return spark_utils.submit_job(
        app_url=jar,
        app_args=joined_app_args,
        args=submit_args,
        verbose=False,
        service_name=service['name'],
        driver_role=dispatcher['roles']['executors'],
        spark_user=service['user'] if sdk_utils.is_strict_mode() else None,
        principal=service['service_account'] if sdk_utils.is_strict_mode() else None)
def test_supervise():
    """Check that a supervised streaming driver is restarted after being killed.

    Submits ``RecoverableNetworkWordCount`` with ``--supervise``, waits for it
    to register and run, kills the driver process on its host, waits for the
    job to be registered and running again, then kills the driver through
    ``utils.kill_driver`` and waits for the service to disappear.
    """
    job_service_name = "RecoverableNetworkWordCount"

    # Retried every 1000 ms until the result is truthy, for at most 600 s
    # (per the retrying decorator arguments).
    @retrying.retry(wait_fixed=1000, stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        registered = shakedown.get_service(job_service_name) is not None
        return registered if present else not registered

    # NOTE(review): "spark.executors.cores" looks like a typo for
    # "spark.executor.cores" — confirm before changing, it alters the job.
    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4",
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    app_args = "10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args=app_args,
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    # Kill the driver process, matched by its framework id, on its host.
    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    kill_output = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(kill_output))
    kill_result = json.loads(kill_output)
    assert kill_result["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
def _submit_job_and_get_tasks(extra_args=None):
    """Submit the test app and return its driver and first executor task.

    ``driver_cpus``, ``executor_cpus`` and ``app_name`` are not parameters of
    this function; they are resolved from the enclosing scope.

    Args:
        extra_args: optional extra spark-submit arguments appended after the
            built-in ones. Defaults to no extra arguments.

    Returns:
        A tuple ``(driver_task_id, driver_task, executor_task)``.
    """
    # None instead of a shared mutable `[]` default.
    extra_args = [] if extra_args is None else list(extra_args)

    submit_args = [
        "--conf spark.driver.cores={}".format(driver_cpus),
        "--conf spark.cores.max={}".format(executor_cpus),
        "--conf spark.executor.cores={}".format(executor_cpus),
        "--class {}".format(app_name)
    ] + extra_args

    driver_task_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                      app_args="1 600",
                                      args=submit_args)

    sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    executor_task = shakedown.get_service_tasks(app_name)[0]

    return (driver_task_id, driver_task, executor_task)
def _verify_submission_rejected(service_name, driver_role=None):
    """Assert that submitting ``MockTaskRunner`` to ``service_name`` fails.

    Any ``Exception`` raised by ``utils.submit_job`` counts as a rejection.
    If the submission unexpectedly succeeds and returns a truthy id, that
    driver is killed before the assertion fails.
    """
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    rejection = None
    try:
        submission_id = utils.submit_job(
            service_name=service_name,
            app_url=utils.dcos_test_jar_url(),
            driver_role=driver_role,
            app_args="1 300",
            args=submit_args)
    except Exception as err:
        rejection = err
    else:
        # The job was accepted: clean it up before failing below.
        if submission_id:
            utils.kill_driver(submission_id, service_name=service_name)

    assert rejection is not None
def _submit_shuffle_job(sleep=0, extra_args=None, use_cli=True):
    """Submit ``ShuffleApp`` and return the result of ``utils.submit_job``.

    Args:
        sleep: passed to the app as its fifth argument
            (``sleepBeforeShutdown`` in the usage line below).
        extra_args: optional extra spark-submit arguments appended after the
            built-in ones. Defaults to no extra arguments.
        use_cli: passed through to ``utils.submit_job``.
    """
    # None instead of a shared mutable `[]` default.
    extra_args = [] if extra_args is None else list(extra_args)

    num_unique_keys = SHUFFLE_JOB_EXPECTED_GROUPS_COUNT
    num_mappers = 4
    value_size_bytes = 100
    num_reducers = 4
    # Usage: ShuffleApp [numMappers] [numPairs] [valueSize] [numReducers] [sleepBeforeShutdown]
    return utils.submit_job(
        app_url=utils.dcos_test_jar_url(),
        use_cli=use_cli,
        app_args="{} {} {} {} {}".format(num_mappers, num_unique_keys,
                                         value_size_bytes, num_reducers, sleep),
        args=[
            "--conf spark.executor.cores=1",
            "--conf spark.cores.max={}".format(SHUFFLE_JOB_NUM_EXECUTORS),
            "--conf spark.scheduler.minRegisteredResourcesRatio=1",
            "--conf spark.scheduler.maxRegisteredResourcesWaitingTime=3m",
            "--class ShuffleApp"
        ] + extra_args)
def test_task_not_lost():
    """Check that the first executor task of a SparkPi job is not lost.

    Submits SparkPi, waits for one running executor, asserts that the first
    service task has id "0", then checks the job output.
    """
    submit_args = [
        "--conf", "spark.cores.max=1",
        "--class", "org.apache.spark.examples.SparkPi",
    ]
    # "1500" is long enough to examine the Executor's task info
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="1500",
        args=submit_args)

    # Wait until executor is running
    utils.wait_for_executors_running(SPARK_PI_FW_NAME, 1)

    # Check Executor task ID - should be 0, the first task.
    # If it's > 0, that means the first task was lost.
    service_tasks = shakedown.get_service_tasks(SPARK_PI_FW_NAME)
    assert service_tasks[0]['id'] == "0"

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
def _submit_job_and_verify_role(service_name, expected_role, driver_role=None):
    """Submit ``MockTaskRunner`` and assert the role of its driver framework.

    Args:
        service_name: Spark service the job is submitted to.
        expected_role: role the driver framework JSON must report.
        driver_role: passed through to ``utils.submit_job``.

    Raises:
        AssertionError: if the framework's role differs from
            ``expected_role``. Errors from the running check also propagate.

    The driver is always killed afterwards, whether the check passed or not.
    """
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    submission_id = utils.submit_job(service_name=service_name,
                                     app_url=utils.dcos_test_jar_url(),
                                     app_args="1 300",
                                     driver_role=driver_role,
                                     args=submit_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_framework = dcos_utils.get_framework_json(app_name, completed=False)
        log.info("Driver framework:\n{}".format(driver_framework))
        assert expected_role == driver_framework["role"], \
            "Expected role '{}' but got '{}'".format(expected_role, driver_framework["role"])
    finally:
        # `finally` rather than `except Exception`: the old handler swallowed
        # every failure (including the role assertion above), so the check
        # could never fail, and the driver was left running on success.
        log.info(f"Cleaning up. Attempting to kill driver: {submission_id}")
        utils.kill_driver(submission_id, service_name=service_name)
def _submit_job_and_verify_users(user, use_ucr_for_spark_submit, extra_args=None):
    """Submit ``MockTaskRunner`` and check the user of driver and executors.

    Args:
        user: passed to ``_check_task_user`` for every task.
        use_ucr_for_spark_submit: passed to ``_check_task_user`` for every
            task.
        extra_args: optional extra spark-submit arguments appended after the
            built-in ones. Defaults to no extra arguments.

    The driver is always killed afterwards.
    """
    # None instead of a shared mutable `[]` default.
    extra_args = [] if extra_args is None else list(extra_args)

    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)] + extra_args

    driver_task_id = utils.submit_job(service_name=SERVICE_NAME,
                                      app_url=utils.dcos_test_jar_url(),
                                      app_args="1 300",
                                      args=submit_args)
    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_task = shakedown.get_task(driver_task_id, completed=False)
        executor_tasks = shakedown.get_service_tasks(app_name)

        for task in [driver_task] + executor_tasks:
            log.info(f"Checking task '{task['id']}'")
            _check_task_user(task, user, use_ucr_for_spark_submit)
    finally:
        log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
        utils.kill_driver(driver_task_id, service_name=SERVICE_NAME)
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_app_name, jaas_uri=None):
    """Run the Kafka producer/consumer pipeline and check the consumer output.

    Args:
        kerberos_flag: the string "true" enables the Kerberos settings; it is
            also passed verbatim to both applications.
        stop_count: converted to ``str`` and passed to the consumer; the
            expected output is "Read <stop_count> words".
        jar_uri: application jar used for both producer and consumer.
        keytab_secret: secret name used for the driver and executor keytab.
        spark_app_name: Spark app the jobs are submitted to.
        jaas_uri: optional JAAS config URI added to ``spark.mesos.uris``.
            When it is None and the run is kerberized, the bundled
            ``spark-kafka-client-jaas.conf`` is uploaded to S3 and used.

    The producer driver is always killed once it has been submitted, even if
    a later wait or the consumer run fails.
    """
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = _kafka_broker_dns()
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)
    if kerberized and jaas_uri is None:
        jaas_path = os.path.join(THIS_DIR, "resources", "spark-kafka-client-jaas.conf")
        s3.upload_file(jaas_path)
        jaas_uri = s3.s3_http_url("spark-kafka-client-jaas.conf")
    # Only append a JAAS URI when there is one; previously a non-kerberized
    # run without jaas_uri appended the literal string "None" to the URI list.
    if jaas_uri is not None:
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = [
        "--conf", "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.task.labels=DCOS_SPACE:{}".format(utils.SPARK_APP_NAME),
        "--conf", "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
                  "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf", "spark.executor.extraJavaOptions="
                  "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = [
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder"
    ] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   app_name=spark_app_name,
                                   args=producer_config)

    try:
        shakedown.wait_for(lambda: _producer_launched(),
                           ignore_exceptions=False,
                           timeout_seconds=600)
        shakedown.wait_for(lambda: utils.is_service_ready(KAFKA_SERVICE_NAME, 1),
                           ignore_exceptions=False,
                           timeout_seconds=600)

        consumer_config = [
            "--conf", "spark.cores.max=4",
            "--class", "KafkaConsumer"
        ] + common_args

        if kerberized:
            consumer_config += kerberos_args

        consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        app_name=spark_app_name,
                        args=consumer_config)
    finally:
        # Do not leave the producer driver running when a wait or the
        # consumer run fails.
        utils.kill_driver(producer_id, spark_app_name)
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    """Check that a supervised Structured Streaming job resumes from its checkpoint.

    Feeds a first batch of messages to Kafka, starts the supervised
    ``StructuredStreamingWithCheckpointing`` job and waits for the first
    batch's counts in its output. It then kills the driver process, feeds a
    second batch, waits for the job to run again and expects both batches'
    counts in the job output.

    ``kerberized_spark`` and ``kerberized_kafka`` are not referenced in the
    body (presumably pytest fixtures providing the environment — confirm
    against the test module).
    """
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME,
                        'endpoints broker', json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(
        utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    # common_args already carries the Kafka Kerberos settings.
    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_a)

    spark_submit_args = [
        "--supervise",
        "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2",
        "--conf", "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(kafka_brokers, KAFKA_TEST_TOPIC,
                                            HDFS_CHECKPOINT_DIR,
                                            SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(
        app_url=jar_uri,
        app_args=application_args,
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}| {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'],
                                   pattern=driver_regex)

    # sending more data to Kafka
    # Pass common_args as-is: appending kafka_kerberos_args again (as before)
    # duplicated every Kerberos --conf on the producer's command line.
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}| {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))

    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
def test_supervise():
    """Check that a supervised streaming driver is restarted after being killed.

    Submits ``RecoverableNetworkWordCount`` with ``--supervise``, waits until
    it is registered and has a task in state ``TASK_RUNNING``, kills the
    driver process on its host, waits for both conditions again, then kills
    the driver through ``utils.kill_driver`` and waits until the service is
    no longer registered.
    """
    job_service_name = "RecoverableNetworkWordCount"

    def streaming_job_registered():
        return shakedown.get_service(job_service_name) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        service = shakedown.get_service(job_service_name)
        if service is None:
            return False
        running = [task for task in service.dict()["tasks"]
                   if task["state"] == "TASK_RUNNING"]
        return bool(running)

    def wait_until(predicate):
        # Every wait in this test uses the same timeout and error handling.
        shakedown.wait_for(lambda: predicate(),
                           ignore_exceptions=False,
                           timeout_seconds=600)

    # NOTE(review): "spark.executors.cores" looks like a typo for
    # "spark.executor.cores" — confirm before changing, it alters the job.
    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4",
    ]

    data_dir = "hdfs:///users/alice"
    app_args = "10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args=app_args,
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_until(streaming_job_registered)
    log.info("Job has registered")
    wait_until(has_running_executors)
    log.info("Job has running executors")

    # Kill the driver process, matched by its framework id, on its host.
    host = shakedown.get_service(job_service_name).dict()["hostname"]
    framework_id = shakedown.get_service(job_service_name).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(framework_id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    wait_until(streaming_job_registered)
    log.info("Job has re-registered")
    wait_until(has_running_executors)
    log.info("Job has re-started")

    kill_output = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(kill_output))
    kill_result = json.loads(kill_output)
    assert kill_result["success"], "Failed to kill spark streaming job"

    wait_until(streaming_job_is_not_running)
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    """Check that a supervised streaming driver is restarted after being killed.

    Submits ``RecoverableNetworkWordCount`` with ``--supervise``, waits for it
    to register and run, finds the driver process on its agent with ``ps`` and
    kills each matching pid with ``kill -9``, waits for the job to be
    registered and running again, then kills the driver through
    ``utils.kill_driver`` and waits for the service to disappear.

    ``kerberized_spark`` and ``hdfs_with_kerberos`` are not referenced in the
    body (presumably pytest fixtures providing the environment — confirm
    against the test module).
    """
    job_service_name = "RecoverableNetworkWordCount"

    # Retried every 1000 ms until the result is truthy, for at most 600 s
    # (per the retrying decorator arguments).
    @retrying.retry(wait_fixed=1000, stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        registered = shakedown.get_service(job_service_name) is not None
        return registered if present else not registered

    # NOTE(review): "spark.executors.cores" looks like a typo for
    # "spark.executor.cores" — confirm before changing, it alters the job.
    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4",
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    app_args = "10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args=app_args,
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    agent_host = service_info['hostname']
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

    # List the processes whose command line matches the driver's framework id;
    # the second whitespace-separated column of each line is used as the pid.
    status, stdout = shakedown.run_command_on_agent(
        agent_host,
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)
    pids = [line.strip().split()[1] for line in stdout.splitlines()]

    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            agent_host,
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)
        outcome = "Killed pid: {}" if status else "Unable to killed pid: {}"
        print(outcome.format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    kill_output = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(kill_output))
    kill_result = json.loads(kill_output)
    assert kill_result["success"], "Failed to kill spark streaming job"
    wait_job_present(False)