def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_service_name, jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = (kerberos_flag == "true")
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker', json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    # Only a Kerberized run needs the JAAS config; upload it when no URI was supplied.
    if kerberized:
        if jaas_uri is None:
            jaas_uri = upload_jaas()
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = get_kerberized_kafka_spark_conf(spark_service_name, keytab_secret)

    producer_config = ["--conf", "spark.cores.max=2",
                       "--conf", "spark.executor.cores=1",
                       "--class", "KafkaFeeder"] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = ["--conf", "spark.cores.max=2",
                       "--conf", "spark.executor.cores=1",
                       "--class", "KafkaConsumer"] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        service_name=spark_service_name,
                        args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_service_name)
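# A minimal sketch of the upload_jaas() helper referenced above, which is not
# defined in this section. It mirrors the inline JAAS-upload logic in the second
# test_pipeline further below; THIS_DIR and the s3 helper module are assumptions
# carried over from that test, not confirmed parts of this module.
def upload_jaas():
    jaas_path = os.path.join(THIS_DIR, "resources", "spark-kafka-client-jaas.conf")
    s3.upload_file(jaas_path)
    return s3.s3_http_url("spark-kafka-client-jaas.conf")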
def test_executor_gpus_exceeds_available_gpus():
    """
    Checks: if executor.gpus exceeds the available gpus, the job never runs.
    """
    num_executors = 2
    executor_gpus = 2
    driver_task_id = _submit_gpu_app(num_executors=num_executors,
                                     executor_gpus=executor_gpus,
                                     gpus_max=num_executors * executor_gpus)

    try:
        log.info("Waiting for job to complete.")
        shakedown.wait_for_task_completion(driver_task_id, timeout_sec=240)
    except TimeoutExpired:
        log.info("Job failed to complete, as expected.")
        spark_utils.kill_driver(driver_task_id, spark_utils.SPARK_APP_NAME)
        return

    pytest.fail("Did not expect this job to complete.")
def _verify_submission_rejected(service_name, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)]

    submission_id = None
    error = None
    try:
        submission_id = utils.submit_job(service_name=service_name,
                                         app_url=utils.dcos_test_jar_url(),
                                         driver_role=driver_role,
                                         app_args="1 300",
                                         args=submit_args)
    except Exception as err:
        error = err
    finally:
        # If the submission unexpectedly succeeded, clean up the driver.
        if submission_id:
            utils.kill_driver(submission_id, service_name=service_name)

    assert error is not None
def _submit_job_and_verify_role(service_name, expected_role, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)]

    submission_id = utils.submit_job(service_name=service_name,
                                     app_url=utils.dcos_test_jar_url(),
                                     app_args="1 300",
                                     driver_role=driver_role,
                                     args=submit_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_framework = dcos_utils.get_framework_json(app_name, completed=False)
        log.info("Driver framework:\n{}".format(driver_framework))
        assert expected_role == driver_framework["role"], \
            "Expected role '{}' but got '{}'".format(expected_role, driver_framework["role"])
    finally:
        # Always clean up the driver, even when the role assertion fails.
        log.info(f"Cleaning up. Attempting to kill driver: {submission_id}")
        utils.kill_driver(submission_id, service_name=service_name)
def test_supervise_conflict_frameworkid():
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        return (svc is not None) if present else (svc is None)

    job_args = ["--supervise",
                "--class", "MockTaskRunner",
                "--conf", "spark.cores.max=1",
                "--conf", "spark.executor.cores=1"]

    driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                 app_args="1 1800",
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=job_args)
    log.info("Started supervised driver {}".format(driver_id))

    try:
        wait_job_present(True)
        log.info("Job has registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

        # Kill the supervised driver process; --supervise should relaunch it
        # under a new framework id.
        sdk_cmd.kill_task_with_pattern(driver_regex, service_info['hostname'])

        wait_job_present(False)
        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info['id'], \
            "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)
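# A possible shared form of the wait_job_present helper, which several tests in
# this section duplicate verbatim. This is a sketch, not part of the original
# module; it simply parameterizes the service name instead of closing over it.
@retrying.retry(wait_fixed=1000,
                stop_max_delay=600 * 1000,
                retry_on_result=lambda res: not res)
def wait_service_presence(service_name, present):
    # Retries (up to 10 minutes) until the framework's presence in Mesos
    # matches the expected state.
    svc = shakedown.get_service(service_name)
    return (svc is not None) if present else (svc is None)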
def test_shuffle_job(submit_args=None,
                     use_ucr_for_spark_submit=True,
                     use_cli_for_spark_submit=True,
                     check_network_labels=False):
    # Avoid a mutable default argument.
    submit_args = list(submit_args or [])

    if not use_ucr_for_spark_submit:
        submit_args = submit_args + [
            "--conf spark.mesos.containerizer=docker",
            "--conf spark.mesos.executor.docker.parameters=user=99",
        ]

    driver_task_id = _submit_shuffle_job(use_cli=use_cli_for_spark_submit,
                                         sleep=300,
                                         extra_args=submit_args)

    sdk_tasks.check_running(SHUFFLE_JOB_FW_NAME, SHUFFLE_JOB_NUM_EXECUTORS, timeout_seconds=600)
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    _check_task_network(driver_task, is_ucr=use_ucr_for_spark_submit)

    if check_network_labels and use_ucr_for_spark_submit:
        _check_task_network_labels(driver_task)

    executor_tasks = shakedown.get_service_tasks(SHUFFLE_JOB_FW_NAME)
    for task in executor_tasks:
        _check_task_network(task, is_ucr=use_ucr_for_spark_submit)
        if check_network_labels and use_ucr_for_spark_submit:
            _check_task_network_labels(task)

    try:
        utils.wait_for_running_job_output(
            driver_task_id,
            "Groups count: {}".format(SHUFFLE_JOB_EXPECTED_GROUPS_COUNT))
    finally:
        log.info("Cleaning up. Attempting to kill driver: {}".format(driver_task_id))
        utils.kill_driver(driver_task_id, service_name=CNI_DISPATCHER_SERVICE_NAME)
def _submit_job_and_verify_users(user, use_ucr_for_spark_submit, extra_args=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)] + (extra_args or [])

    driver_task_id = utils.submit_job(service_name=SERVICE_NAME,
                                      app_url=utils.dcos_test_jar_url(),
                                      app_args="1 300",
                                      args=submit_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_task = shakedown.get_task(driver_task_id, completed=False)
        executor_tasks = shakedown.get_service_tasks(app_name)

        for task in [driver_task] + executor_tasks:
            log.info(f"Checking task '{task['id']}'")
            _check_task_user(task, user, use_ucr_for_spark_submit)
    finally:
        log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
        utils.kill_driver(driver_task_id, service_name=SERVICE_NAME)
def test_supervise():
    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        return (svc is not None) if present else (svc is None)

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executor.cores=4"]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

    # Kill the supervised driver process; --supervise should relaunch it.
    shakedown.kill_process_on_host(hostname=service_info['hostname'], pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
def _kill_driver_task(driver_task_id):
    log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
    utils.kill_driver(driver_task_id)
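# Hypothetical usage sketch for _kill_driver_task: the submit/verify/kill
# pattern used by the other tests in this section, with cleanup delegated to
# the helper. The job class and arguments here are illustrative assumptions.
def _example_submit_and_cleanup():
    driver_task_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                      app_args="1 300",
                                      args=["--conf", "spark.cores.max=1",
                                            "--class", "MockTaskRunner"])
    try:
        sdk_tasks.check_running("MockTaskRunner", 1, timeout_seconds=300)
    finally:
        _kill_driver_task(driver_task_id)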
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_app_name, jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = (kerberos_flag == "true")
    broker_dns = _kafka_broker_dns()
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    # Only a Kerberized run needs the JAAS config; upload it when no URI was supplied.
    if kerberized:
        if jaas_uri is None:
            jaas_path = os.path.join(THIS_DIR, "resources", "spark-kafka-client-jaas.conf")
            s3.upload_file(jaas_path)
            jaas_uri = s3.s3_http_url("spark-kafka-client-jaas.conf")
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = [
        "--conf", "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.task.labels=DCOS_SPACE:{}".format(utils.SPARK_APP_NAME),
        "--conf", "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
                  "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf", "spark.executor.extraJavaOptions="
                  "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = ["--conf", "spark.cores.max=2",
                       "--conf", "spark.executor.cores=2",
                       "--class", "KafkaFeeder"] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   app_name=spark_app_name,
                                   args=producer_config)

    shakedown.wait_for(lambda: _producer_launched(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    shakedown.wait_for(lambda: utils.is_service_ready(KAFKA_SERVICE_NAME, 1),
                       ignore_exceptions=False,
                       timeout_seconds=600)

    consumer_config = ["--conf", "spark.cores.max=4",
                       "--class", "KafkaConsumer"] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                        app_args=consumer_args,
                        expected_output="Read {} words".format(stop_count),
                        app_name=spark_app_name,
                        args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_app_name)
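# A plausible sketch of get_kerberized_kafka_spark_conf(), which the first
# test_pipeline calls but this section never defines. The conf keys are taken
# from the inline kerberos_args in the test above; parameterizing the
# DCOS_SPACE label on the service name is an assumption.
def get_kerberized_kafka_spark_conf(spark_service_name, keytab_secret):
    return [
        "--conf", "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.task.labels=DCOS_SPACE:{}".format(spark_service_name),
        "--conf", "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
                  "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf", "spark.executor.extraJavaOptions="
                  "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]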
def test_supervise():
    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executor.cores=4"]

    data_dir = "hdfs:///users/alice"
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info["id"])

    # Kill the supervised driver process; --supervise should relaunch it.
    shakedown.kill_process_on_host(hostname=service_info["hostname"], pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        return (svc is not None) if present else (svc is None)

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executor.cores=4"]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

    # Find the supervised driver's PID(s) on the agent and kill them to force
    # a failover; --supervise should relaunch the driver.
    _, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)
    pids = [p.strip().split()[1] for p in stdout.splitlines()]
    for pid in pids:
        status, _ = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)
        if status:
            log.info("Killed pid: {}".format(pid))
        else:
            log.info("Unable to kill pid: {}".format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)