Code Example #1
def test_unique_task_ids():
    log.info('Submitting two sample Spark Applications')
    submit_args = [
        "--conf spark.cores.max=1", "--class org.apache.spark.examples.SparkPi"
    ]

    driver_id_1 = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                   app_args="100",
                                   args=submit_args)

    driver_id_2 = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                   app_args="100",
                                   args=submit_args)

    log.info(
        'Two Spark Applications submitted. Driver 1 ID: %s, Driver 2 ID: %s' %
        (driver_id_1, driver_id_2))
    log.info('Waiting for completion. Polling state')
    completed = wait_for_jobs_completion(driver_id_1, driver_id_2)

    assert completed, 'Sample Spark Applications failed to successfully complete within given time'
    out = sdk_cmd.run_cli("task --completed --json")
    data = json.loads(out)

    log.info(
        'Collecting tasks that belong to the drivers created in this test')
    task_ids = []
    for d in data:
        if driver_id_1 in d['framework_id'] or driver_id_2 in d['framework_id']:
            task_ids.append(d['id'])

    log.info('Tasks found: %s' % (' '.join(task_ids)))
    assert len(task_ids) == len(
        set(task_ids)
    ), 'Task ids for two independent Spark Applications contain duplicates'
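
The wait_for_jobs_completion helper used above is not part of this excerpt. A minimal sketch of what it could look like, assuming the driver task id returned by utils.submit_job appears in the completed task list once the application finishes (an assumption, not confirmed by the source):

import json
import time

import sdk_cmd


def wait_for_jobs_completion(*driver_ids, timeout_seconds=900):
    """Poll the completed task list until every driver id appears, or time out."""
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        completed = json.loads(sdk_cmd.run_cli("task --completed --json"))
        completed_ids = {t["id"] for t in completed}  # hypothetical: driver id == task id
        if all(driver_id in completed_ids for driver_id in driver_ids):
            return True
        time.sleep(5)
    return False
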
Code Example #2
def submit_job(dispatcher):
    dispatcher_name, dispatcher_role, driver_role = dispatcher

    args = [
        "--conf",
        "spark.cores.max=1",
        "--conf",
        "spark.mesos.containerizer=mesos",
        "--conf",
        "spark.mesos.role={}".format(driver_role),
        "--conf",
        "spark.mesos.executor.docker.image=mesosphere/spark-dev:931ca56273af913d103718376e2fbc04be7cbde0",
        # use Hector's image
        "--conf",
        "spark.port.maxRetries=32"  # setting to allow up to 32 drivers on same node
        #"--conf", "spark.mesos.driverEnv.SPARK_USER=root", # Run as root on centos
    ]

    app_args = "100000 300"

    utils.submit_job(app_name="/{}".format(dispatcher_name),
                     app_url=MONTE_CARLO_APP_URL,
                     app_args=app_args,
                     verbose=False,
                     args=args)
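
Here dispatcher is a (name, dispatcher_role, driver_role) tuple and MONTE_CARLO_APP_URL is a module-level constant pointing at the application artifact; neither is defined in this excerpt. Illustrative, hypothetical values:

# Hypothetical values, for illustration only.
MONTE_CARLO_APP_URL = "https://example.com/monte-carlo-portfolio.py"

dispatcher = ("spark-dispatcher-01",   # dispatcher_name: service the job is submitted to
              "dispatcher-role",       # dispatcher_role: unused by this helper
              "driver-role-01")        # driver_role: passed via spark.mesos.role

submit_job(dispatcher)
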
Code Example #3
def submit_job(app_url: str, app_args: str, dispatcher: typing.Dict, duration: int, config: typing.List[str]):
    dispatcher_name = dispatcher["service"]["name"]
    log.info("Submitting job to dispatcher: %s, with duration: %s min.", dispatcher_name, duration)

    spark_utils.submit_job(
        service_name=dispatcher_name,
        app_url=app_url,
        app_args=app_args,
        verbose=False,
        args=config,
        driver_role=dispatcher["roles"]["executors"],
        spark_user=dispatcher["service"]["user"] if sdk_utils.is_strict_mode() else None,
        principal=dispatcher["service"]["service_account"] if sdk_utils.is_strict_mode() else None)
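
The dispatcher argument here (and in the consumer/producer helpers later in this list) is a dictionary. Based on the keys accessed in these examples, a hypothetical instance might look like this (values are illustrative only):

dispatcher = {
    "service": {
        "name": "spark-dispatcher-01",          # dispatcher service name
        "user": "nobody",                        # spark_user, only used in strict mode
        "service_account": "spark-principal",    # principal, only used in strict mode
    },
    "roles": {
        "executors": "spark-driver-role",        # passed as driver_role
    },
}
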
Code Example #4
    def _submit_consumer(broker_dns, common_conf, topic, spark_app_name,
                         driver_role, num_words):
        consumer_args = " ".join([broker_dns, topic, num_words, kerberos_flag])

        consumer_config = [
            "--conf", "spark.cores.max=4", "--class", "KafkaConsumer"
        ] + common_conf

        spark_utils.submit_job(app_url=jar_url,
                               app_args=consumer_args,
                               app_name=spark_app_name,
                               args=consumer_config,
                               driver_role=driver_role,
                               verbose=False)
Code Example #5
def test_disconnect_from_master():
    python_script_path = os.path.join(THIS_DIR, 'jobs', 'python',
                                      'long_running.py')
    python_script_url = utils.upload_file(python_script_path)
    task_id = utils.submit_job(
        python_script_url,
        "{} {}".format(LONG_RUNNING_FW_NUM_TASKS, LONG_RUNNING_RUN_TIME_SEC), [
            "--conf", "spark.mesos.driver.failoverTimeout=1800", "--conf",
            "spark.cores.max=1"
        ])

    # Wait until executor is running
    utils.wait_for_executors_running(LONG_RUNNING_FW_NAME,
                                     LONG_RUNNING_FW_NUM_TASKS)

    # Block the driver's connection to Mesos master
    framework_info = shakedown.get_service(LONG_RUNNING_FW_NAME)
    (driver_host, port) = _parse_fw_pid_host_port(framework_info["pid"])
    _block_master_connection(driver_host, port)

    # The connection will timeout after 15 minutes of inactivity.
    # Add 5 minutes to make sure the master has detected the disconnection.
    # The framework will be considered disconnected => failover_timeout kicks in.
    LOGGER.info(
        "Waiting {} seconds for connection with master to timeout...".format(
            MASTER_CONNECTION_TIMEOUT_SEC))
    time.sleep(MASTER_CONNECTION_TIMEOUT_SEC + 5 * 60)

    # Restore the connection. The driver should reconnect.
    _unblock_master_connection(driver_host)

    # The executor and driver should finish.
    utils.check_job_output(task_id, "Job completed successfully")
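
_parse_fw_pid_host_port is not shown in this excerpt. A Mesos framework pid has the form name@host:port, so a minimal sketch under that assumption could be:

def _parse_fw_pid_host_port(pid):
    """Split a framework pid such as 'scheduler-...@10.0.3.1:40697' into (host, port)."""
    host, port = pid.split("@", 1)[1].split(":", 1)
    return host, int(port)
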
Code Example #6
File: test_spark.py Project: kndarp/spark-build
def test_cni_labels():
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args=
        "3000",  # Long enough to examine the Driver's & Executor's task infos
        args=[
            "--conf spark.mesos.network.name=dcos",
            "--conf spark.mesos.network.labels=key1:val1,key2:val2",
            "--conf spark.cores.max={}".format(CNI_TEST_NUM_EXECUTORS),
            "--class org.apache.spark.examples.SparkPi"
        ])

    # Wait until executors are running
    sdk_tasks.check_running(SPARK_PI_FW_NAME,
                            CNI_TEST_NUM_EXECUTORS,
                            timeout_seconds=600)

    # Check for network name / labels in Driver task info
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    _check_task_network_info(driver_task)

    # Check for network name / labels in Executor task info
    executor_task = shakedown.get_service_tasks(SPARK_PI_FW_NAME)[0]
    _check_task_network_info(executor_task)

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
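
_check_task_network_info is also omitted here. One plausible sketch, assuming the task dictionary returned by shakedown exposes the Mesos container.network_infos structure with the CNI name and labels configured above (an assumption about the task layout, not confirmed by the source):

def _check_task_network_info(task):
    # Expect the network name and labels set via spark.mesos.network.name
    # and spark.mesos.network.labels in the submit arguments above.
    network_info = task["container"]["network_infos"][0]
    assert network_info["name"] == "dcos"
    labels = network_info["labels"]["labels"]
    assert {"key": "key1", "value": "val1"} in labels
    assert {"key": "key2", "value": "val2"} in labels
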
Code Example #7
File: test_gpu.py Project: stuartpa/spark-build
def _submit_gpu_app(num_executors, executor_gpus, gpus_max, app_name=None):
    """
    Helper function to submit a gpu app.
    """
    args = [
        "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=240s",
        "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf",
        "spark.executor.memory=2g",
        "--conf",
        "spark.mesos.gpus.max={}".format(gpus_max),
        "--conf",
        "spark.executor.cores=1",
        "--conf",
        "spark.mesos.containerizer=mesos",
        "--conf",
        "spark.mesos.driverEnv.SPARK_USER=root",  # Run as root on centos
        "--class",
        "GpuPiApp"
    ]
    if executor_gpus is not None:
        args += [
            "--conf", "spark.mesos.executor.gpus={}".format(executor_gpus)
        ]

    app_args = "{} 1000000".format(
        num_executors)  # Long enough to examine the Executor's task info
    if app_name is not None:
        app_args += " {}".format(app_name)

    driver_task_id = spark_utils.submit_job(
        app_url=spark_utils.scala_test_jar_url(), app_args=app_args, args=args)
    return driver_task_id
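
A typical call site for this helper, e.g. one GPU per executor with a cluster-wide cap of two GPUs (values are illustrative):

driver_task_id = _submit_gpu_app(num_executors=2,
                                 executor_gpus=1,
                                 gpus_max=2,
                                 app_name="GpuPiAppTest")
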
Code Example #8
def test_driver_metrics(use_overlay):
    @retrying.retry(wait_fixed=5000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_for_metric(task_id, expected_metric_name):
        stdout = sdk_cmd.run_cli("task metrics details {}".format(task_id))
        result = expected_metric_name in stdout
        log.info('Checking for {} in STDOUT:\n{}\nResult: {}'.format(
            expected_metric_name, stdout, result))
        return result

    app_name = "MockTaskRunner"

    submit_args = [
        "--conf spark.cores.max=1", "--conf spark.mesos.containerizer=mesos",
        "--class {}".format(app_name)
    ]

    if use_overlay:
        submit_args = submit_args + [
            "--conf spark.mesos.network.name=dcos",
            "--conf spark.mesos.driverEnv.VIRTUAL_NETWORK_ENABLED=true",
            "--conf spark.executorEnv.VIRTUAL_NETWORK_ENABLED=true"
        ]

    expected_metric = "jvm.heap.used"

    driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                 app_args="1 300",
                                 args=submit_args)
    wait_for_metric(driver_id, expected_metric)

    sdk_tasks.check_running(app_name, 1, timeout_seconds=600)
    executor_id = shakedown.get_service_task_ids(app_name)[0]
    wait_for_metric(executor_id, expected_metric)
Code Example #9
    def _submit_producer(broker_dns, common_conf, topic, spark_app_name,
                         driver_role):
        big_file = "file:///mnt/mesos/sandbox/big.txt"

        producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

        producer_config = [
            "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=2",
            "--class", "KafkaFeeder"
        ] + common_conf

        spark_utils.submit_job(app_url=jar_url,
                               app_args=producer_args,
                               app_name=spark_app_name,
                               args=producer_config,
                               driver_role=driver_role,
                               verbose=False)
Code Example #10
def _submit_consumer(name,
                     spark_executor_docker_image,
                     jar,
                     kafka_broker_dns,
                     cassandra_native_client_dns,
                     dispatcher,
                     kafka_topics,
                     kafka_group_id,
                     write_to_cassandra,
                     batch_size_seconds,
                     cassandra_keyspace,
                     cassandra_table,
                     spark_cores_max,
                     spark_executor_cores,
                     must_fail: bool):
    app_args = ["--appName",           name,
                "--brokers",           ",".join(kafka_broker_dns),
                "--topics",            kafka_topics,
                "--groupId",           kafka_group_id,
                "--batchSizeSeconds",  str(batch_size_seconds),
                "--cassandraKeyspace", cassandra_keyspace,
                "--cassandraTable",    cassandra_table]

    if must_fail:
        app_args.extend(["--mustFailDueToInvalidArgument"])

    if not write_to_cassandra:
        app_args.extend(["--shouldNotWriteToCassandra"])

    cassandra_hosts = map(lambda x: x.split(':')[0], cassandra_native_client_dns)
    cassandra_port = cassandra_native_client_dns[0].split(':')[1]

    app_config = ["--supervise",
                  "--conf",      "spark.cores.max={}".format(spark_cores_max),
                  "--conf",      "spark.executor.cores={}".format(spark_executor_cores),
                  "--conf",      "spark.cassandra.connection.host={}".format(",".join(cassandra_hosts)),
                  "--conf",      "spark.cassandra.connection.port={}".format(cassandra_port),
                  "--name",      name,
                  "--class",     CONSUMER_CLASS_NAME]

    if spark_executor_docker_image:
        app_config.extend([
            "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)
        ])

    args = app_config + COMMON_CONF

    submission_id = spark_utils.submit_job(
        app_url=jar,
        app_args=" ".join(str(a) for a in app_args),
        args=args,
        verbose=False,
        service_name=dispatcher['service']['name'],
        driver_role=dispatcher['roles']['executors'],
        spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None,
        principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None)

    return submission_id
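
COMMON_CONF is a module-level list of shared submit arguments used by both the consumer and producer helpers; it is not shown in this excerpt. A hypothetical value, reusing settings that appear elsewhere in these examples:

COMMON_CONF = [
    "--conf", "spark.mesos.containerizer=mesos",
    "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
    "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=240s",
]
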
Code Example #11
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret, spark_service_name, jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker', json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    if kerberized:
        if jaas_uri is None:
            jaas_uri = upload_jaas()
        # Only append the JAAS URI when Kerberos is enabled; otherwise the
        # original else branch would append the literal string "None".
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = get_kerberized_kafka_spark_conf(spark_service_name, keytab_secret)

    producer_config = ["--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
                       "--class", "KafkaFeeder"] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = ["--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
                       "--class", "KafkaConsumer"] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    try:
        utils.run_tests(app_url=jar_uri,
                    app_args=consumer_args,
                    expected_output="Read {} words".format(stop_count),
                    service_name=spark_service_name,
                    args=consumer_config)
    finally:
        utils.kill_driver(producer_id, spark_service_name)
Code Example #12
def test_mesos_label_support():
    driver_task_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                      app_args="150",
                                      args=["--conf spark.cores.max=1",
                                            "--conf spark.mesos.driver.labels=foo:bar", # pass a test label
                                            "--class org.apache.spark.examples.SparkPi"])

    driver_task_info = sdk_cmd._get_task_info(driver_task_id)
    expected = {'key': 'foo', 'value': 'bar'}
    assert expected in driver_task_info['labels']
Code Example #13
def feed_sample_data(jar_uri, kafka_brokers, topic, common_args, messages):
    producer_args = ["--class", "KerberizedKafkaProducer"] + common_args
    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args="{} {} {} {}".format(
                                       "kafka", kafka_brokers, topic,
                                       ' '.join(messages)),
                                   service_name=utils.SPARK_SERVICE_NAME,
                                   args=producer_args)

    # validating producer output
    utils.check_job_output(producer_id,
                           "{} messages sent to Kafka".format(len(messages)))
Code Example #14
def test_supervise_conflict_frameworkid():
    job_service_name = "MockTaskRunner"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class", "MockTaskRunner", "--conf",
        "spark.cores.max=1", "--conf", "spark.executors.cores=1"
    ]

    # Submit outside the try block so the finally clause never references an
    # undefined driver_id if submission itself fails.
    driver_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                 app_args="1 1800",
                                 service_name=utils.SPARK_SERVICE_NAME,
                                 args=job_args)
    log.info("Started supervised driver {}".format(driver_id))

    try:
        wait_job_present(True)
        log.info("Job has registered")

        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has running executors")

        service_info = shakedown.get_service(job_service_name).dict()
        driver_regex = "spark.mesos.driver.frameworkId={}".format(
            service_info['id'])
        kill_status = sdk_cmd.kill_task_with_pattern(driver_regex,
                                                     service_info['hostname'])

        wait_job_present(False)

        wait_job_present(True)
        log.info("Job has re-registered")
        sdk_tasks.check_running(job_service_name, 1)
        log.info("Job has re-started")

        restarted_service_info = shakedown.get_service(job_service_name).dict()
        assert service_info['id'] != restarted_service_info[
            'id'], "Job has restarted with same framework Id"
    finally:
        kill_info = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
        log.info("{}".format(kill_info))
        assert json.loads(kill_info)["success"], "Failed to kill spark job"
        wait_job_present(False)
Code Example #15
def _launch_test_task(app_name):
    log.info('Submitting a Spark Application with 1 executor')

    driver_task_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                      app_args="1 5",
                                      args=["--conf spark.cores.max=1",
                                            "--conf spark.executor.cores=1",
                                            "--conf spark.mesos.containerizer=mesos",
                                            "--conf spark.mesos.rejectOfferDuration=1s",
                                            f"--conf spark.mesos.executor.docker.image={utils.SPARK_DOCKER_IMAGE}",
                                            f"--class {app_name}"
                                            ])
    sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
    return driver_task_id
Code Example #16
def test_task_not_lost():
    driver_task_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
                                      app_args="1500",   # Long enough to examine the Executor's task info
                                      args=["--conf spark.cores.max=1",
                                            "--class org.apache.spark.examples.SparkPi"])

    # Wait until executor is running
    sdk_tasks.check_running(SPARK_PI_FW_NAME, 1, timeout_seconds=600)

    # Check Executor task ID - should end with 0, the first task.
    # If it's > 0, that means the first task was lost.
    assert sdk_tasks.get_task_ids(SPARK_PI_FW_NAME, '')[0].endswith('-0')

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
Code Example #17
def _submit_producer(name,
                     spark_executor_docker_image,
                     jar,
                     kafka_broker_dns,
                     dispatcher,
                     kafka_topics,
                     number_of_words,
                     words_per_second,
                     spark_cores_max,
                     spark_executor_cores,
                     must_fail: bool):
    app_args = ["--appName",        name,
                "--brokers",        ",".join(kafka_broker_dns),
                "--topics",         kafka_topics,
                "--numberOfWords",  str(number_of_words),
                "--wordsPerSecond", str(words_per_second)]

    if must_fail:
        app_args.extend(["--mustFailDueToInvalidArgument", ])

    app_config = ["--conf",  "spark.cores.max={}".format(spark_cores_max),
                  "--conf",  "spark.executor.cores={}".format(spark_executor_cores),
                  "--name",  name,
                  "--class", PRODUCER_CLASS_NAME]

    # `number_of_words == 0` means infinite stream, so we'd like to have it
    # restarted in the case of failures.
    if number_of_words == 0:
        app_config.extend(["--supervise"])

    if spark_executor_docker_image:
        app_config.extend([
            "--conf", "spark.mesos.executor.docker.image={}".format(spark_executor_docker_image)
        ])

    args = app_config + COMMON_CONF

    submission_id = spark_utils.submit_job(
        app_url=jar,
        app_args=" ".join(str(a) for a in app_args),
        args=args,
        verbose=False,
        service_name=dispatcher['service']['name'],
        driver_role=dispatcher['roles']['executors'],
        spark_user=dispatcher['service']['user'] if sdk_utils.is_strict_mode() else None,
        principal=dispatcher['service']['service_account'] if sdk_utils.is_strict_mode() else None)

    return submission_id
Code Example #18
File: test_hdfs.py Project: zencircle/spark-build
def test_supervise():
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
Code Example #19
def _submit_job_and_get_tasks(extra_args=[]):
    submit_args = [
        "--conf spark.driver.cores={}".format(driver_cpus),
        "--conf spark.cores.max={}".format(executor_cpus),
        "--conf spark.executor.cores={}".format(executor_cpus),
        "--class {}".format(app_name)
    ] + extra_args

    driver_task_id = utils.submit_job(app_url=utils.dcos_test_jar_url(),
                                      app_args="1 600",
                                      args=submit_args)

    sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    executor_task = shakedown.get_service_tasks(app_name)[0]

    return (driver_task_id, driver_task, executor_task)
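
This helper relies on module-level names (driver_cpus, executor_cpus, app_name) defined outside the excerpt. Hypothetical values consistent with the other MockTaskRunner examples:

app_name = "MockTaskRunner"  # main class submitted via --class
driver_cpus = 1              # spark.driver.cores for the driver
executor_cpus = 1            # spark.cores.max and spark.executor.cores, i.e. a single one-core executor
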
Code Example #20
File: test_quota.py Project: mediapills/spark-build
def _verify_submission_rejected(service_name, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    submission_id = None
    error = None
    try:
        submission_id = utils.submit_job(service_name=service_name,
                                         app_url=utils.dcos_test_jar_url(),
                                         driver_role=driver_role,
                                         app_args="1 300",
                                         args=submit_args)
    except Exception as err:
        error = err
    finally:
        if submission_id:
            utils.kill_driver(submission_id, service_name=service_name)

    assert error is not None
Code Example #21
def _submit_shuffle_job(sleep=0, extra_args=[], use_cli=True):
    num_unique_keys = SHUFFLE_JOB_EXPECTED_GROUPS_COUNT
    num_mappers = 4
    value_size_bytes = 100
    num_reducers = 4
    # Usage: ShuffleApp [numMappers] [numPairs] [valueSize] [numReducers] [sleepBeforeShutdown]
    return utils.submit_job(
        app_url=utils.dcos_test_jar_url(),
        use_cli=use_cli,
        app_args="{} {} {} {} {}".format(num_mappers, num_unique_keys,
                                         value_size_bytes, num_reducers,
                                         sleep),
        args=[
            "--conf spark.executor.cores=1",
            "--conf spark.cores.max={}".format(SHUFFLE_JOB_NUM_EXECUTORS),
            "--conf spark.scheduler.minRegisteredResourcesRatio=1",
            "--conf spark.scheduler.maxRegisteredResourcesWaitingTime=3m",
            "--class ShuffleApp"
        ] + extra_args)
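
The two SHUFFLE_JOB_* constants are module-level and not included in this excerpt. Hypothetical values plus a sample call:

SHUFFLE_JOB_EXPECTED_GROUPS_COUNT = 12000  # unique keys the reducers should produce (illustrative)
SHUFFLE_JOB_NUM_EXECUTORS = 3              # spark.cores.max with one core per executor (illustrative)

# Keep the driver alive for five minutes after the shuffle finishes.
driver_id = _submit_shuffle_job(sleep=300)
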
Code Example #22
def test_task_not_lost():
    driver_task_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="1500",  # Long enough to examine the Executor's task info
        args=[
            "--conf", "spark.cores.max=1", "--class",
            "org.apache.spark.examples.SparkPi"
        ])

    # Wait until executor is running
    utils.wait_for_executors_running(SPARK_PI_FW_NAME, 1)

    # Check Executor task ID - should be 0, the first task.
    # If it's > 0, that means the first task was lost.
    executor_task = shakedown.get_service_tasks(SPARK_PI_FW_NAME)[0]
    assert executor_task['id'] == "0"

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
Code Example #23
File: test_quota.py Project: mediapills/spark-build
def _submit_job_and_verify_role(service_name, expected_role, driver_role=None):
    app_name = "MockTaskRunner"
    submit_args = ["--conf spark.cores.max=1", "--class {}".format(app_name)]

    submission_id = utils.submit_job(service_name=service_name,
                                     app_url=utils.dcos_test_jar_url(),
                                     app_args="1 300",
                                     driver_role=driver_role,
                                     args=submit_args)

    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_framework = dcos_utils.get_framework_json(app_name,
                                                         completed=False)
        log.info("Driver framework:\n{}".format(driver_framework))
        assert expected_role == driver_framework["role"], \
            "Expected role '{}' but got '{}'".format(expected_role, driver_framework["role"])

    finally:
        # Always clean up the driver; a bare except here would swallow
        # assertion failures and leak the driver on success.
        log.info(f"Cleaning up. Attempting to kill driver: {submission_id}")
        utils.kill_driver(submission_id, service_name=service_name)
Code Example #24
def _submit_job_and_verify_users(user, use_ucr_for_spark_submit, extra_args=[]):
    app_name = "MockTaskRunner"

    submit_args = ["--conf spark.cores.max=1",
                   "--class {}".format(app_name)] + extra_args

    driver_task_id = utils.submit_job(service_name=SERVICE_NAME,
                                      app_url=utils.dcos_test_jar_url(),
                                      app_args="1 300",
                                      args=submit_args)
    try:
        sdk_tasks.check_running(app_name, 1, timeout_seconds=300)
        driver_task = shakedown.get_task(driver_task_id, completed=False)
        executor_tasks = shakedown.get_service_tasks(app_name)

        for task in [driver_task] + executor_tasks:
            log.info(f"Checking task '{task['id']}'")
            _check_task_user(task, user, use_ucr_for_spark_submit)

    finally:
        log.info(f"Cleaning up. Attempting to kill driver: {driver_task_id}")
        utils.kill_driver(driver_task_id, service_name=SERVICE_NAME)
Code Example #25
def test_pipeline(kerberos_flag,
                  stop_count,
                  jar_uri,
                  keytab_secret,
                  spark_app_name,
                  jaas_uri=None):
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = _kafka_broker_dns()
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris=http://norvig.com/big.txt"

    if kerberized:
        if jaas_uri is None:
            jaas_path = os.path.join(THIS_DIR, "resources",
                                     "spark-kafka-client-jaas.conf")
            s3.upload_file(jaas_path)
            jaas_uri = s3.s3_http_url("spark-kafka-client-jaas.conf")
        # Only append the JAAS URI when Kerberos is enabled; otherwise the
        # original else branch would append the literal string "None".
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ]

    kerberos_args = [
        "--conf",
        "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.task.labels=DCOS_SPACE:{}".format(utils.SPARK_APP_NAME),
        "--conf",
        "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
        "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf",
        "spark.executor.extraJavaOptions="
        "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = [
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder"
    ] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   app_name=spark_app_name,
                                   args=producer_config)

    shakedown.wait_for(lambda: _producer_launched(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    shakedown.wait_for(lambda: utils.is_service_ready(KAFKA_SERVICE_NAME, 1),
                       ignore_exceptions=False,
                       timeout_seconds=600)

    consumer_config = [
        "--conf", "spark.cores.max=4", "--class", "KafkaConsumer"
    ] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    utils.run_tests(app_url=jar_uri,
                    app_args=consumer_args,
                    expected_output="Read {} words".format(stop_count),
                    app_name=spark_app_name,
                    args=consumer_config)

    utils.kill_driver(producer_id, spark_app_name)
Code Example #26
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME,
                        KAFKA_SERVICE_NAME,
                        'endpoints broker',
                        json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(
        utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_a)

    spark_submit_args = [
        "--supervise", "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2", "--conf",
        "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(kafka_brokers, KAFKA_TEST_TOPIC,
                                            HDFS_CHECKPOINT_DIR,
                                            SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS +
                                            spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}|  {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'],
                                   pattern=driver_regex)

    # sending more data to Kafka
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC,
                     common_args + kafka_kerberos_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}|  {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))

    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
Code Example #27
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs:///users/alice"
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    host = service_info["hostname"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info["id"])
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Code Example #28
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])

    status, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)

    pids = [p.strip().split()[1] for p in stdout.splitlines()]

    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)

        if status:
            print("Killed pid: {}".format(pid))
        else:
            print("Unable to killed pid: {}".format(pid))

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)