def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executors.cores=4"]

    driver_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                 app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
                                 app_name=utils.SPARK_APP_NAME,
                                 args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    host = shakedown.get_service(JOB_SERVICE_NAME).dict()["hostname"]
    id = shakedown.get_service(JOB_SERVICE_NAME).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)

def test_supervise(): def streaming_job_registered(): return shakedown.get_service("HdfsWordCount") is not None def streaming_job_is_not_running(): return not streaming_job_registered() def has_running_executors(): f = shakedown.get_service("HdfsWordCount") if f is None: return False else: return len([ x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING" ]) > 0 driver_id = utils.submit_job( app_url=SPARK_EXAMPLES, app_args="file:///mnt/mesos/sandbox/", app_name="/spark", args=[ "--supervise", "--class", "org.apache.spark.examples.streaming.HdfsWordCount", "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4" ]) LOGGER.info("Started supervised driver {}".format(driver_id)) shakedown.wait_for(lambda: streaming_job_registered(), ignore_exceptions=False, timeout_seconds=600) LOGGER.info("Job has registered") shakedown.wait_for(lambda: has_running_executors(), ignore_exceptions=False, timeout_seconds=600) LOGGER.info("Job has running executors") host = shakedown.get_service("HdfsWordCount").dict()["hostname"] id = shakedown.get_service("HdfsWordCount").dict()["id"] driver_regex = "spark.mesos.driver.frameworkId={}".format(id) shakedown.kill_process_on_host(hostname=host, pattern=driver_regex) shakedown.wait_for(lambda: streaming_job_registered(), ignore_exceptions=False, timeout_seconds=600) LOGGER.info("Job has re-registered") shakedown.wait_for(lambda: has_running_executors(), ignore_exceptions=False, timeout_seconds=600) LOGGER.info("Job has re-started") out = utils.kill_driver(driver_id, "/spark") LOGGER.info("{}".format(out)) out = json.loads(out) assert out["success"], "Failed to kill spark streaming job" shakedown.wait_for(lambda: streaming_job_is_not_running(), ignore_exceptions=False, timeout_seconds=600)
def test_supervise_conflict_frameworkid():
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = ["--supervise",
                "--class", "MockTaskRunner",
                "--conf", "spark.cores.max=1",
                "--conf", "spark.executors.cores=1"]

    try:
        driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                     app_args="1 1800",
                                     service_name=utils.SPARK_SERVICE_NAME,
                                     args=job_args)
        log.info("Started supervised driver {}".format(driver_id))

        wait_job_present(True)
        log.info("Job has registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

        kill_status = sdk_cmd.kill_task_with_pattern(driver_regex, service_info['hostname'])
        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info['id'], "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)

def streaming_job_running(job_name):
    f = shakedown.get_service(job_name)
    if f is None:
        return False
    else:
        return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

def test_install_marathon(): """Install the Marathon package for DC/OS. """ # Install shakedown.install_package_and_wait(PACKAGE_NAME) assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install' end_time = time.time() + WAIT_TIME_IN_SECS found = False while time.time() < end_time: found = shakedown.get_service(PACKAGE_NAME) is not None if found and shakedown.service_healthy(SERVICE_NAME): break time.sleep(1) assert found, 'Service did not register with DCOS' shakedown.deployment_wait() # Uninstall uninstall('marathon-user') shakedown.deployment_wait() # Reinstall shakedown.install_package_and_wait(PACKAGE_NAME) assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall' # try: shakedown.install_package(PACKAGE_NAME) except Exception as e: pass else: # Exception is not raised -> exit code was 0 assert False, "Error: CLI returns 0 when asked to install Marathon"
def test_install_marathon(): """Install the Marathon package for DC/OS. """ # Install shakedown.install_package_and_wait(PACKAGE_NAME) assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install' end_time = time.time() + WAIT_TIME_IN_SECS found = False while time.time() < end_time: found = shakedown.get_service(PACKAGE_NAME) is not None if found and shakedown.service_healthy(SERVICE_NAME): break time.sleep(1) assert found, 'Service did not register with DCOS' shakedown.deployment_wait() # Uninstall uninstall('marathon-user') shakedown.deployment_wait() # Reinstall shakedown.install_package_and_wait(PACKAGE_NAME) assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall'
def test_disconnect_from_master():
    python_script_path = os.path.join(THIS_DIR, 'jobs', 'python', 'long_running.py')
    python_script_url = utils.upload_file(python_script_path)
    task_id = utils.submit_job(
        python_script_url,
        "{} {}".format(LONG_RUNNING_FW_NUM_TASKS, LONG_RUNNING_RUN_TIME_SEC),
        ["--conf", "spark.mesos.driver.failoverTimeout=1800",
         "--conf", "spark.cores.max=1"])

    # Wait until executor is running
    utils.wait_for_executors_running(LONG_RUNNING_FW_NAME, LONG_RUNNING_FW_NUM_TASKS)

    # Block the driver's connection to Mesos master
    framework_info = shakedown.get_service(LONG_RUNNING_FW_NAME)
    (driver_host, port) = _parse_fw_pid_host_port(framework_info["pid"])
    _block_master_connection(driver_host, port)

    # The connection will timeout after 15 minutes of inactivity.
    # Add 5 minutes to make sure the master has detected the disconnection.
    # The framework will be considered disconnected => failover_timeout kicks in.
    LOGGER.info("Waiting {} seconds for connection with master to timeout...".format(
        MASTER_CONNECTION_TIMEOUT_SEC))
    time.sleep(MASTER_CONNECTION_TIMEOUT_SEC + 5 * 60)

    # Restore the connection. The driver should reconnect.
    _unblock_master_connection(driver_host)

    # The executor and driver should finish.
    utils.check_job_output(task_id, "Job completed successfully")

def has_running_executors():
    f = shakedown.get_service(JOB_SERVICE_NAME)
    if f is None:
        return False
    else:
        return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

def has_running_executors(): f = shakedown.get_service("HdfsWordCount") if f is None: return False else: return len([ x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING" ]) > 0
def uninstall_package(
        package_name,
        app_id=None,
        all_instances=False,
        wait_for_completion=False,
        timeout_sec=600):
    """ Uninstall a package using the DC/OS library.

        :param package_name: name of the package
        :type package_name: str
        :param app_id: unique app_id for the package
        :type app_id: str
        :param all_instances: uninstall all instances of package
        :type all_instances: bool
        :param wait_for_completion: whether or not to wait for task completion before returning
        :type wait_for_completion: bool
        :param timeout_sec: number of seconds to wait for task completion
        :type timeout_sec: int

        :return: True if uninstall was successful, False otherwise
        :rtype: bool
    """

    print("\n{}uninstalling package '{}'\n".format(
        shakedown.cli.helpers.fchr('>>'), package_name))

    cosmos = _get_cosmos()
    pkg = cosmos.get_package_version(package_name, None)

    # Uninstall subcommands (if defined)
    if pkg.has_cli_definition():
        print("\n{}uninstalling CLI commands for package '{}'\n".format(
            shakedown.cli.helpers.fchr('>>'), package_name))
        subcommand.uninstall(package_name)

    cosmos.uninstall_app(package_name, all_instances, app_id)

    # Optionally wait for the service to unregister as a framework
    if wait_for_completion:
        now = time.time()
        future = now + timeout_sec

        while now < future:
            if not shakedown.get_service(package_name):
                return True
            time.sleep(1)
            now = time.time()

        return False

    return True

def test_job():
    shakedown.install_package_and_wait('chronos')

    # 0 tasks
    tasks = shakedown.get_service('chronos')['completed_tasks']
    assert len(tasks) == 0

    if is_before_version("3.0"):
        url = shakedown.dcos_service_url('chronos/scheduler/jobs')
    else:
        url = shakedown.dcos_service_url('chronos/v1/scheduler/jobs')
    jobs = http.get(url).json()
    assert len(jobs) == 0

    # add a job
    if is_before_version("3.0"):
        url = shakedown.dcos_service_url('chronos/scheduler/iso8601')
    else:
        url = shakedown.dcos_service_url('chronos/v1/scheduler/iso8601')

    data = default_job()
    headers = {'Content-Type': 'application/json'}
    http.post(url, data=data, headers=headers)

    # give it a couple of seconds
    time.sleep(5)

    tasks = shakedown.get_service('chronos')['completed_tasks']
    assert len(tasks) > 0

    id = tasks[0]['id']
    status, out = shakedown.run_command_on_master('date')
    sdate = out[:10]
    stdout, stderr, return_code = shakedown.run_dcos_command('task log --completed {}'.format(id))
    assert sdate in stdout

def test_supervise():
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executors.cores=4"]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                 app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'], pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    wait_job_present(False)

def test_gpus_max():
    """ Checks that gpus.max is respected. """
    gpus_max = 1
    app_name = "{}-{}".format(GPU_PI_APP_NAME, str(uuid.uuid4()))
    driver_task_id = _submit_gpu_app(num_executors=1,
                                     executor_gpus=None,
                                     gpus_max=gpus_max,
                                     app_name=app_name)

    log.info("Waiting for job to complete.")
    shakedown.wait_for_task_completion(driver_task_id)

    # Check total Executor gpus <= gpus.max
    service = shakedown.get_service(service_name=app_name, completed=True)
    executor_tasks = service['completed_tasks']
    gpus = [task['resources']['gpus'] for task in executor_tasks]
    log.info("Task gpus: {}".format(str(gpus)))
    total_gpus = sum(gpus)
    log.info("Total gpus allocated: {}".format(str(total_gpus)))

    # We expect total gpus == gpus.max because gpus are allocated greedily.
    assert total_gpus == gpus_max

def assert_service_registration(package, service):
    found = shakedown.get_service(package) is not None
    assert found and shakedown.service_healthy(service), \
        f"Service {package} did not register with DCOS"  # NOQA E999

def destroy_app(app_name):
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))
    # Make sure the scheduler has been destroyed
    sdk_spin.time_wait_noisy(lambda: (shakedown.get_service(app_name) is None))

def is_framework_completed(fw_name):
    # The framework is not Active or Inactive
    return shakedown.get_service(fw_name, True) is None

def streaming_job_registered(): return shakedown.get_service("HdfsWordCount") is not None
def streaming_job_launched(job_name):
    return shakedown.get_service(job_name) is not None

def fn():
    return shakedown.get_service(app_name) is None

def install_package(
        package_name,
        package_version=None,
        app_id=None,
        options_file=None,
        wait_for_completion=False,
        timeout_sec=600):
    """ Install a package via the DC/OS library

        :param package_name: name of the package
        :type package_name: str
        :param package_version: version of the package (defaults to latest)
        :type package_version: str
        :param app_id: unique app_id for the package
        :type app_id: str
        :param options_file: filename that has options to use and is JSON format
        :type options_file: str
        :param wait_for_completion: whether or not to wait for task completion before returning
        :type wait_for_completion: bool
        :param timeout_sec: number of seconds to wait for task completion
        :type timeout_sec: int

        :return: True if installation was successful, False otherwise
        :rtype: bool
    """

    options = _get_options(options_file)
    cosmos = _get_cosmos()
    pkg = cosmos.get_package_version(package_name, package_version)

    # Install subcommands (if defined)
    if pkg.has_cli_definition():
        print("\n{}installing CLI commands for package '{}'\n".format(
            shakedown.cli.helpers.fchr('>>'), package_name))
        subcommand.install(pkg)

    print("\n{}installing package '{}'\n".format(
        shakedown.cli.helpers.fchr('>>'), package_name))

    # Print pre-install notes to console log
    pre_install_notes = pkg.package_json().get('preInstallNotes')
    if pre_install_notes:
        print(pre_install_notes)

    cosmos.install_app(pkg, options, app_id)

    # Print post-install notes to console log
    post_install_notes = pkg.package_json().get('postInstallNotes')
    if post_install_notes:
        print(post_install_notes)

    # Optionally wait for the service to register as a framework
    if wait_for_completion:
        now = time.time()
        future = now + timeout_sec

        while now < future:
            if shakedown.get_service(package_name):
                return True
            time.sleep(1)
            now = time.time()

        return False

    return True

def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker', json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format('\n'.join(kafka_kerberos_args)))

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args, message_set_a)

    spark_submit_args = [
        "--supervise",
        "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2",
        "--conf", "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(
        kafka_brokers, KAFKA_TEST_TOPIC, HDFS_CHECKPOINT_DIR, SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}| {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info("Validating Structured Streaming topic consumption, waiting for output {}"
                .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'], pattern=driver_regex)

    # sending more data to Kafka
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC,
                     common_args + kafka_kerberos_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}| {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info("Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
                .format(expected_output_a, expected_output_b))

    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)

def wait_job_present(present):
    svc = shakedown.get_service(JOB_SERVICE_NAME)
    if present:
        return svc is not None
    else:
        return svc is None

def fn():
    shakedown.get_service(PACKAGE_NAME)

def test_supervise(kerberized_spark, hdfs_with_kerberos):
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = ["--supervise",
                "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
                "--conf", "spark.cores.max=8",
                "--conf", "spark.executors.cores=4"]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                 app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])

    status, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)

    pids = [p.strip().split()[1] for p in stdout.splitlines()]
    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)
        if status:
            print("Killed pid: {}".format(pid))
        else:
            print("Unable to kill pid: {}".format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    wait_job_present(False)

def wait_job_present(present):
    svc = shakedown.get_service(job_service_name)
    if present:
        return svc is not None
    else:
        return svc is None

def fn():
    shakedown.get_service(SERVICE_NAME)

def streaming_job_registered():
    return shakedown.get_service(JOB_SERVICE_NAME) is not None