Example 1
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    # Find the driver's host and framework id, then kill the driver process;
    # with --supervise the dispatcher should relaunch it.
    host = shakedown.get_service(JOB_SERVICE_NAME).dict()["hostname"]
    framework_id = shakedown.get_service(JOB_SERVICE_NAME).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(framework_id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Example 2
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service("HdfsWordCount") is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service("HdfsWordCount")
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    driver_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="file:///mnt/mesos/sandbox/",
        app_name="/spark",
        args=[
            "--supervise", "--class",
            "org.apache.spark.examples.streaming.HdfsWordCount", "--conf",
            "spark.cores.max=8", "--conf", "spark.executors.cores=4"
        ])
    LOGGER.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has running executors")

    host = shakedown.get_service("HdfsWordCount").dict()["hostname"]
    id = shakedown.get_service("HdfsWordCount").dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-started")
    out = utils.kill_driver(driver_id, "/spark")
    LOGGER.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Example 3
def test_supervise_conflict_frameworkid():
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class", "MockTaskRunner", "--conf",
        "spark.cores.max=1", "--conf", "spark.executors.cores=1"
    ]

    try:
        driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                     app_args="1 1800",
                                     service_name=utils.SPARK_SERVICE_NAME,
                                     args=job_args)
        log.info("Started supervised driver {}".format(driver_id))

        wait_job_present(True)
        log.info("Job has registered")

        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(
            service_info['id'])
        kill_status = sdk_cmd.kill_task_with_pattern(driver_regex,
                                                     service_info['hostname'])

        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info[
            'id'], "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)
Example 4
def streaming_job_running(job_name):
    f = shakedown.get_service(job_name)
    if f is None:
        return False
    else:
        return len(
            [x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0
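A predicate like this is normally polled rather than called once. A minimal usage sketch, assuming the `shakedown.wait_for` helper seen in Example 1 and an illustrative job name and timeout:

# Usage sketch: poll until the job reports at least one TASK_RUNNING executor
# (job name and timeout are illustrative, not taken from this example).
shakedown.wait_for(lambda: streaming_job_running("HdfsWordCount"),
                   ignore_exceptions=False,
                   timeout_seconds=600)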
Example 5
def test_install_marathon():
    """Install the Marathon package for DC/OS.
    """

    # Install
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install'

    end_time = time.time() + WAIT_TIME_IN_SECS
    found = False
    while time.time() < end_time:
        found = shakedown.get_service(PACKAGE_NAME) is not None
        if found and shakedown.service_healthy(SERVICE_NAME):
            break
        time.sleep(1)

    assert found, 'Service did not register with DCOS'
    shakedown.deployment_wait()

    # Uninstall
    uninstall('marathon-user')
    shakedown.deployment_wait()

    # Reinstall
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall'
    # Installing an already-installed package should fail: a zero exit from the CLI is an error.
    try:
        shakedown.install_package(PACKAGE_NAME)
    except Exception as e:
        pass
    else:
        # Exception is not raised -> exit code was 0
        assert False, "Error: CLI returns 0 when asked to install Marathon"
Example 6
def test_install_marathon():
    """Install the Marathon package for DC/OS.
    """

    # Install
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install'

    end_time = time.time() + WAIT_TIME_IN_SECS
    found = False
    while time.time() < end_time:
        found = shakedown.get_service(PACKAGE_NAME) is not None
        if found and shakedown.service_healthy(SERVICE_NAME):
            break
        time.sleep(1)

    assert found, 'Service did not register with DCOS'
    shakedown.deployment_wait()

    # Uninstall
    uninstall('marathon-user')
    shakedown.deployment_wait()

    # Reinstall
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall'
    # Installing an already-installed package should fail: a zero exit from the CLI is an error.
    try:
        shakedown.install_package(PACKAGE_NAME)
    except Exception as e:
        pass
    else:
        # Exception is not raised -> exit code was 0
        assert False, "Error: CLI returns 0 when asked to install Marathon"
Example 7
def test_install_marathon():
    """Install the Marathon package for DC/OS.
    """

    # Install
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install'

    end_time = time.time() + WAIT_TIME_IN_SECS
    found = False
    while time.time() < end_time:
        found = shakedown.get_service(PACKAGE_NAME) is not None
        if found and shakedown.service_healthy(SERVICE_NAME):
            break
        time.sleep(1)

    assert found, 'Service did not register with DCOS'
    shakedown.deployment_wait()

    # Uninstall
    uninstall('marathon-user')
    shakedown.deployment_wait()

    # Reinstall
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall'
Example 8
def test_disconnect_from_master():
    python_script_path = os.path.join(THIS_DIR, 'jobs', 'python',
                                      'long_running.py')
    python_script_url = utils.upload_file(python_script_path)
    task_id = utils.submit_job(
        python_script_url,
        "{} {}".format(LONG_RUNNING_FW_NUM_TASKS, LONG_RUNNING_RUN_TIME_SEC), [
            "--conf", "spark.mesos.driver.failoverTimeout=1800", "--conf",
            "spark.cores.max=1"
        ])

    # Wait until executor is running
    utils.wait_for_executors_running(LONG_RUNNING_FW_NAME,
                                     LONG_RUNNING_FW_NUM_TASKS)

    # Block the driver's connection to Mesos master
    framework_info = shakedown.get_service(LONG_RUNNING_FW_NAME)
    (driver_host, port) = _parse_fw_pid_host_port(framework_info["pid"])
    _block_master_connection(driver_host, port)

    # The connection will timeout after 15 minutes of inactivity.
    # Add 5 minutes to make sure the master has detected the disconnection.
    # The framework will be considered disconnected => failover_timeout kicks in.
    LOGGER.info(
        "Waiting {} seconds for connection with master to timeout...".format(
            MASTER_CONNECTION_TIMEOUT_SEC))
    time.sleep(MASTER_CONNECTION_TIMEOUT_SEC + 5 * 60)

    # Restore the connection. The driver should reconnect.
    _unblock_master_connection(driver_host)

    # The executor and driver should finish.
    utils.check_job_output(task_id, "Job completed successfully")
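`_parse_fw_pid_host_port` is not shown in this example. A plausible sketch, assuming the framework pid follows the usual Mesos `name@host:port` format (an assumption, not this test suite's actual helper):

def _parse_fw_pid_host_port(pid):
    # e.g. "scheduler-<uuid>@10.0.3.1:38751" -> ("10.0.3.1", 38751)
    _, address = pid.split("@", 1)
    host, port = address.split(":", 1)
    return host, int(port)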
Example 9
def has_running_executors():
    f = shakedown.get_service(JOB_SERVICE_NAME)
    if f is None:
        return False
    else:
        return len([
            x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
        ]) > 0
Example 10
def has_running_executors():
    f = shakedown.get_service("HdfsWordCount")
    if f is None:
        return False
    else:
        return len([
            x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
        ]) > 0
Example 11
def uninstall_package(package_name,
                      app_id=None,
                      all_instances=False,
                      wait_for_completion=False,
                      timeout_sec=600):
    """ Uninstall a package using the DC/OS library.

        :param package_name: name of the package
        :type package_name: str
        :param app_id: unique app_id for the package
        :type app_id: str
        :param all_instances: uninstall all instances of package
        :type all_instances: bool
        :param wait_for_completion: whether or not to wait for task completion before returning
        :type wait_for_completion: bool
        :param timeout_sec: number of seconds to wait for task completion
        :type timeout_sec: int

        :return: True if uninstall was successful, False otherwise
        :rtype: bool
    """

    print("\n{}uninstalling package '{}'\n".format(
        shakedown.cli.helpers.fchr('>>'), package_name))

    cosmos = _get_cosmos()
    pkg = cosmos.get_package_version(package_name, None)

    # Uninstall subcommands (if defined)
    if pkg.has_cli_definition():
        print("\n{}uninstalling CLI commands for package '{}'\n".format(
            shakedown.cli.helpers.fchr('>>'), package_name))
        subcommand.uninstall(package_name)

    cosmos.uninstall_app(package_name, all_instances, app_id)

    # Optionally wait for the service to unregister as a framework
    if wait_for_completion:
        now = time.time()
        future = now + timeout_sec

        while now < future:
            if not shakedown.get_service(package_name):
                return True

            time.sleep(1)
            now = time.time()

        return False

    return True
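A usage sketch for the helper above; the package name, flags, and timeout are illustrative:

# Remove every instance of the package and block until it unregisters as a framework.
removed = uninstall_package('chronos',
                            all_instances=True,
                            wait_for_completion=True,
                            timeout_sec=300)
assert removed, "Package was still registered after the timeout"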
Example 12
def test_job():

    shakedown.install_package_and_wait('chronos')

    # 0 tasks
    tasks = shakedown.get_service('chronos')['completed_tasks']
    assert len(tasks) == 0

    if is_before_version("3.0"):
        url = shakedown.dcos_service_url('chronos/scheduler/jobs')
    else:
        url = shakedown.dcos_service_url('chronos/v1/scheduler/jobs')

    jobs = http.get(url).json()
    assert len(jobs) == 0

    # add a job
    if is_before_version("3.0"):
        url = shakedown.dcos_service_url('chronos/scheduler/iso8601')
    else:
        url = shakedown.dcos_service_url('chronos/v1/scheduler/iso8601')

    data = default_job()
    headers = {'Content-Type': 'application/json'}
    http.post(url, data=data, headers=headers)

    # give it a couple of seconds
    time.sleep(5)

    tasks = shakedown.get_service('chronos')['completed_tasks']
    assert len(tasks) > 0

    id = tasks[0]['id']
    status, out = shakedown.run_command_on_master('date')
    sdate = out[:10]
    stdout, stderr, return_code = shakedown.run_dcos_command(
        'task log --completed {}'.format(id))
    assert sdate in stdout
Example 13
def test_supervise():
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
Example 14
def test_gpus_max():
    """
    Checks that gpus.max is respected.
    """
    gpus_max = 1
    app_name = "{}-{}".format(GPU_PI_APP_NAME, str(uuid.uuid4()))
    driver_task_id = _submit_gpu_app(num_executors=1,
                                     executor_gpus=None,
                                     gpus_max=gpus_max,
                                     app_name=app_name)

    log.info("Waiting for job to complete.")
    shakedown.wait_for_task_completion(driver_task_id)

    # Check total Executor gpus <= gpus.max
    service = shakedown.get_service(service_name=app_name, completed=True)
    executor_tasks = service['completed_tasks']
    gpus = [task['resources']['gpus'] for task in executor_tasks]
    log.info("Task gpus: {}".format(str(gpus)))
    total_gpus = sum(gpus)
    log.info("Total gpus allocated: {}".format(str(total_gpus)))
    # We expect total gpus == gpus.max because gpus are allocated greedily.
    assert total_gpus == gpus_max
Example 15
def assert_service_registration(package, service):
    found = shakedown.get_service(package) is not None
    assert found and shakedown.service_healthy(
        service
    ), f"Service {package} did not register with DCOS"  # NOQA E999
Example 16
def destroy_app(app_name):
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))

    # Make sure the scheduler has been destroyed
    sdk_spin.time_wait_noisy(lambda: (shakedown.get_service(app_name) is None))
Example 17
def is_framework_completed(fw_name):
    # The framework is not Active or Inactive
    return shakedown.get_service(fw_name, True) is None
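As with `wait_job_present` in the surrounding examples, this check is usually wrapped in a retry loop. A minimal sketch using the same `retrying.retry` pattern as Example 13 (the delays and wrapper name are illustrative):

import retrying

@retrying.retry(wait_fixed=1000,
                stop_max_delay=600 * 1000,
                retry_on_result=lambda res: not res)
def wait_framework_completed(fw_name):
    # Retries every second, for up to 10 minutes, until the framework has unregistered.
    return is_framework_completed(fw_name)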
Example 18
def streaming_job_registered():
    return shakedown.get_service("HdfsWordCount") is not None
Example 19
def streaming_job_launched(job_name):
    return shakedown.get_service(job_name) is not None
Example 20
def fn():
    return shakedown.get_service(app_name) is None
Example 21
def install_package(package_name,
                    package_version=None,
                    app_id=None,
                    options_file=None,
                    wait_for_completion=False,
                    timeout_sec=600):
    """ Install a package via the DC/OS library

        :param package_name: name of the package
        :type package_name: str
        :param package_version: version of the package (defaults to latest)
        :type package_version: str
        :param app_id: unique app_id for the package
        :type app_id: str
        :param options_file: filename that has options to use and is JSON format
        :type options_file: str
        :param wait_for_completion: whether or not to wait for task completion before returning
        :type wait_for_completion: bool
        :param timeout_sec: number of seconds to wait for task completion
        :type timeout_sec: int

        :return: True if installation was successful, False otherwise
        :rtype: bool
    """

    options = _get_options(options_file)
    cosmos = _get_cosmos()
    pkg = cosmos.get_package_version(package_name, package_version)

    # Install subcommands (if defined)
    if pkg.has_cli_definition():
        print("\n{}installing CLI commands for package '{}'\n".format(
            shakedown.cli.helpers.fchr('>>'), package_name))
        subcommand.install(pkg)

    print("\n{}installing package '{}'\n".format(
        shakedown.cli.helpers.fchr('>>'), package_name))

    # Print pre-install notes to console log
    pre_install_notes = pkg.package_json().get('preInstallNotes')
    if pre_install_notes:
        print(pre_install_notes)

    cosmos.install_app(pkg, options, app_id)

    # Print post-install notes to console log
    post_install_notes = pkg.package_json().get('postInstallNotes')
    if post_install_notes:
        print(post_install_notes)

    # Optionally wait for the service to register as a framework
    if wait_for_completion:
        now = time.time()
        future = now + timeout_sec

        while now < future:
            if shakedown.get_service(package_name):
                return True

            time.sleep(1)
            now = time.time()

        return False

    return True
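The same pattern applies to installation; an illustrative call (the package name and timeout are assumptions):

# Install the package and block until the service registers as a framework.
installed = install_package('chronos',
                            wait_for_completion=True,
                            timeout_sec=300)
assert installed, "Service did not register within the timeout"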
Example 22
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME,
                        KAFKA_SERVICE_NAME,
                        'endpoints broker',
                        json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(
        utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_a)

    spark_submit_args = [
        "--supervise", "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2", "--conf",
        "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(kafka_brokers, KAFKA_TEST_TOPIC,
                                            HDFS_CHECKPOINT_DIR,
                                            SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS +
                                            spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}|  {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'],
                                   pattern=driver_regex)

    # sending more data to Kafka
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC,
                     common_args + kafka_kerberos_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}|  {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))

    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
Example 23
def wait_job_present(present):
    svc = shakedown.get_service(JOB_SERVICE_NAME)
    if present:
        return svc is not None
    else:
        return svc is None
Example 24
def fn():
    shakedown.get_service(PACKAGE_NAME)
Example 25
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])

    status, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)

    pids = [p.strip().split()[1] for p in stdout.splitlines()]

    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)

        if status:
            print("Killed pid: {}".format(pid))
        else:
            print("Unable to killed pid: {}".format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
Example 26
def destroy_app(app_name):
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))

    # Make sure the scheduler has been destroyed
    sdk_spin.time_wait_noisy(lambda: (shakedown.get_service(app_name) is None))
Example 27
def fn():
    shakedown.get_service(PACKAGE_NAME)
Example 28
def wait_job_present(present):
    svc = shakedown.get_service(job_service_name)
    if present:
        return svc is not None
    else:
        return svc is None
Example 29
def fn():
    shakedown.get_service(SERVICE_NAME)
Example 30
def streaming_job_registered():
    return shakedown.get_service(JOB_SERVICE_NAME) is not None