def test_canary_fourth():
    """Issue ``plan continue deploy world-deploy`` and verify the deploy plan completes.

    Afterwards all eight expected pods must be listed, and the plan, both of its
    phases and all four steps in each phase must report ``COMPLETE``.
    """
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy world-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    pod_list = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True)
    assert pod_list == expected_tasks

    plan = sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'deploy')
    log.info(plan)

    assert plan['status'] == 'COMPLETE'
    assert len(plan['phases']) == 2

    # Both phases have the same expectation: complete, with four complete steps.
    for phase_index in (0, 1):
        phase = plan['phases'][phase_index]
        assert phase['status'] == 'COMPLETE'
        phase_steps = phase['steps']
        assert len(phase_steps) == 4
        for step in phase_steps:
            assert step['status'] == 'COMPLETE'
Beispiel #2
0
def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    """Block until the service's deployment and recovery plans are complete and tasks run.

    Args:
        service_name: name of the service to check.
        count: task count handed to ``sdk_tasks.check_running``.
        recovery_expected: when true, first wait for a recovery to be kicked off
            before waiting for it to complete.
    """
    # Every plan wait below shares the same 25 minute budget.
    plan_timeout_seconds = 25 * 60

    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=plan_timeout_seconds)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=plan_timeout_seconds)
    sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=plan_timeout_seconds)
    sdk_tasks.check_running(service_name, count)
def setup_constraint_switch():
    """Install one ``hello`` pod pinned to one agent, then re-pin it to another agent.

    Returns:
        A tuple ``(some_agent, other_agent, hello_ids)``: the hostname the pod was
        first pinned to, the hostname it is pinned to after the config update, and
        the ``hello`` task ids captured before the update.
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    private_agents = sdk_agents.get_private_agents()
    some_agent = private_agents[0]["hostname"]
    other_agent = private_agents[1]["hostname"]
    log.info("Agents: %s %s", some_agent, other_agent)
    assert some_agent != other_agent

    # Marathon-style constraint template; the hostname is filled in per agent.
    placement_template = '[["hostname", "LIKE", "{}"]]'

    install_options = {
        "service": {"yaml": "marathon_constraint"},
        # First, we stick the pod to some_agent
        "hello": {"count": 1, "placement": placement_template.format(some_agent)},
        "world": {"count": 0},
    }
    options = _escape_placement_for_1_9(install_options)
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")

    # Now, stick it to other_agent
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config["env"]["HELLO_PLACEMENT"] = placement_template.format(other_agent)
    sdk_marathon.update_app(marathon_config)
    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return some_agent, other_agent, hello_ids
def test_canary_third():
    """Issue ``plan continue deploy hello-deploy`` and verify the first phase completes.

    The plan as a whole must still be ``WAITING``: the first phase is complete while
    the second phase has one complete step, one waiting step and two pending steps.
    """
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "plan continue deploy hello-deploy")

    expected_tasks = ["hello-0", "hello-1", "hello-2", "hello-3", "world-0"]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod list")
    assert rc == 0, "Pod list failed"
    assert json.loads(stdout) == expected_tasks

    plan = sdk_plan.wait_for_completed_phase(config.SERVICE_NAME, "deploy", "hello-deploy")
    log.info(plan)

    assert plan["status"] == "WAITING"
    assert len(plan["phases"]) == 2

    # (expected phase status, expected per-step statuses), in phase order.
    expected_phases = (
        ("COMPLETE", ("COMPLETE", "COMPLETE", "COMPLETE", "COMPLETE")),
        ("WAITING", ("COMPLETE", "WAITING", "PENDING", "PENDING")),
    )
    for phase, (phase_status, step_statuses) in zip(plan["phases"], expected_phases):
        assert phase["status"] == phase_status
        phase_steps = phase["steps"]
        assert len(phase_steps) == 4
        for step, step_status in zip(phase_steps, step_statuses):
            assert step["status"] == step_status
def test_hostname_unique():
    """Deploy hello and world with a ``hostname UNIQUE`` placement and replace ``hello-0``.

    Both pod types are sized to the number of private agents. After ``hello-0`` is
    replaced and recovery completes, each agent is checked for one hello and one
    world task.
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    unique_placement = '[["hostname", "UNIQUE"]]'
    raw_options = {
        "service": {"yaml": "marathon_constraint"},
        "hello": {"count": get_num_private_agents(), "placement": unique_placement},
        "world": {"count": get_num_private_agents(), "placement": unique_placement},
    }
    options = _escape_placement_for_1_9(raw_options)

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        get_num_private_agents() * 2,
        additional_options=options,
    )

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    replaced_pod = "hello-0"
    ids_before_replace = sdk_tasks.get_task_ids(config.SERVICE_NAME, replaced_pod)
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(replaced_pod)
    )
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, replaced_pod, ids_before_replace)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    # Short check for all-but-one task first, then the regular check for the full count.
    sdk_tasks.check_running(
        config.SERVICE_NAME, get_num_private_agents() * 2 - 1, timeout_seconds=10
    )
    sdk_tasks.check_running(config.SERVICE_NAME, get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def setup_constraint_switch():
    """Install one ``hello`` pod pinned to one agent, then re-pin it to another agent.

    Returns:
        A tuple ``(some_agent, other_agent, hello_ids)``: the agent the pod was first
        pinned to, the agent it is pinned to after the config update, and the
        ``hello`` task ids captured before the update.
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    private_agents = shakedown.get_private_agents()
    some_agent = private_agents[0]
    other_agent = private_agents[1]
    log.info('Agents: %s %s', some_agent, other_agent)
    assert some_agent != other_agent

    # Marathon-style constraint template; the agent is filled in per placement.
    placement_template = "[[\"hostname\", \"LIKE\", \"{}\"]]"

    install_options = {
        "service": {"yaml": "marathon_constraint"},
        # First, we stick the pod to some_agent
        "hello": {"count": 1, "placement": placement_template.format(some_agent)},
        "world": {"count": 0},
    }
    options = _escape_placement_for_1_9(install_options)
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello')

    # Now, stick it to other_agent
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['HELLO_PLACEMENT'] = placement_template.format(other_agent)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return some_agent, other_agent, hello_ids
def test_no_change():
    """Re-submit the unchanged Marathon config and verify that nothing is redeployed.

    The plan shown before and after the no-op update must be identical, the broker
    tasks must keep their ids, and the plan, its first phase and each broker step
    must report ``COMPLETE``.

    Raises:
        AssertionError: if the plan changed, or the tasks were updated.
    """
    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)
    show_plan_cmd = 'plan show {}'.format(DEFAULT_PLAN_NAME)

    broker_ids = tasks.get_task_ids(SERVICE_NAME, pod_prefix)
    plan1 = service_cli(show_plan_cmd)

    config = marathon.get_config(SERVICE_NAME)
    marathon.update_app(SERVICE_NAME, config)

    plan2 = service_cli(show_plan_cmd)

    assert plan1 == plan2
    try:
        tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, broker_ids, timeout_seconds=60)
    except AssertionError:
        raise
    except Exception:
        # Expected path: the update check gives up because no task was relaunched.
        # Only Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        # Raised explicitly (not via `assert False`) so the guard survives `python -O`.
        raise AssertionError("Should not restart tasks now")

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    assert plan2['status'] == 'COMPLETE'
    first_phase = plan2['phases'][0]
    assert first_phase['status'] == 'COMPLETE'

    for step in range(DEFAULT_BROKER_COUNT):
        assert first_phase['steps'][step]['status'] == 'COMPLETE'
def install(package_name, running_task_count, service_name=None, additional_options=None, package_version=None):
    """Install a package and wait until its tasks run and its Marathon deployment ends.

    Args:
        package_name: name of the package to install.
        running_task_count: task count handed to ``sdk_tasks.check_running``.
        service_name: name of the installed service; defaults to ``package_name``.
        additional_options: extra options merged via ``get_package_options``.
            ``None`` (the default) is treated as an empty dict.
        package_version: optional package version passed to shakedown.
    """
    if not service_name:
        service_name = package_name
    # A fresh dict per call avoids sharing one mutable default across invocations.
    if additional_options is None:
        additional_options = {}
    start = time.time()
    merged_options = get_package_options(additional_options)
    print('Installing {} with options={} version={}'.format(package_name, merged_options, package_version))
    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(package_name, package_version=package_version, options_json=merged_options)
    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)
    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def fn():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set()
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for d in deployments:
            print("Deployment: {}".format(d))
            for a in d.get('affectedApps', []):
                print("Adding {}".format(a))
                deploying_apps.add(a)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps))
        # Done once our app no longer appears in any in-flight deployment.
        return '/{}'.format(service_name) not in deploying_apps
    sdk_spin.time_wait_noisy(fn, timeout_seconds=30)
    print('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
def test_shutdown_host():
    """Shut down the agent of one node task, replace its pod and verify it moves agents.

    A candidate ``node-N-server`` task is picked, its host is shut down, the pod is
    replaced, and after recovery completes the relaunched task must report a
    different agent than the original one.
    """
    server_task_pattern = re.compile('^node-[0-9]+-server$')
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, server_task_pattern)
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'

    # Cassandra nodes should never share a machine
    distinct_hosts = {task.host for task in candidate_tasks}
    assert len(candidate_tasks) == len(distinct_hosts), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)

    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    # The pod name is the task name without its trailing '-server'.
    task_suffix = '-server'
    replace_pod_name = replace_task.name[:-len(task_suffix)]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    replace_cmd = 'pod replace {}'.format(replace_pod_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, replace_cmd)
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    relaunched_tasks = [
        candidate for candidate in sdk_tasks.get_summary()
        if candidate.name == replace_task.name and candidate.id != replace_task.id]
    new_task = relaunched_tasks[0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Beispiel #10
0
def test_port_static_to_dynamic_port():
    """Set ``BROKER_PORT`` to ``'0'`` and verify no broker stays on port 9092.

    After the brokers are relaunched, neither ``broker get`` nor the ``address`` and
    ``dns`` lists of ``endpoints broker`` may report port 9092.
    """
    static_port = 9092
    broker_count = config.DEFAULT_BROKER_COUNT
    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)

    sdk_tasks.check_running(config.SERVICE_NAME, broker_count)

    ids_before_update = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, ids_before_update)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, broker_count)

    for broker_id in range(broker_count):
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME, 'broker get {}'.format(broker_id), json=True)
        assert broker_info['port'] != static_port

    endpoints = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    endpoint_kinds = ('address', 'dns')
    for kind in endpoint_kinds:
        assert len(endpoints[kind]) == broker_count

    # Each entry ends in ':<port>'; the port is the text after the last colon.
    for kind in endpoint_kinds:
        for entry in endpoints[kind]:
            assert int(entry.split(':')[-1]) != static_port
def test_secrets_basic():
    """Install the service with secrets and verify they still work after pod replace.

    Steps:
      1) create Secrets
      2) install examples/secrets.yml
      3) if secret file is not created, tasks will fail
      4) wait till deployment finishes
      5) do replace operation
      6) ensure all tasks are running
      7) delete Secrets
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD, additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    # Must use the same 'world-0-server' name as the update check below; the previous
    # 'word-0-server' typo captured ids for a different name, making that check vacuous.
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0-server', world_tasks_0)

    # tasks will fail if secret files are not created by mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_increase_decrease_world_nodes():
    """Add two world nodes, remove them again, and verify existing tasks are untouched.

    After scaling back down, the ``decommission`` plan must complete, the task count
    must return to the original total, and the original hello and world task ids
    must be unchanged.
    """
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    original_hello_ids = sdk_tasks.get_task_ids(foldered_name, 'hello')
    original_world_ids = sdk_tasks.get_task_ids(foldered_name, 'world')
    log.info('world ids: {}'.format(original_world_ids))

    world_delta = 2

    # add 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', world_delta)

    config.check_running(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'world', original_world_ids)

    # check 2 world tasks added:
    scaled_up_world_ids = sdk_tasks.get_task_ids(foldered_name, 'world')
    assert len(scaled_up_world_ids) == len(original_world_ids) + world_delta

    # subtract 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', -world_delta)

    config.check_running(foldered_name)
    # wait for the decommission plan for this subtraction to be complete
    sdk_plan.wait_for_completed_plan(foldered_name, 'decommission')
    # check that the total task count is back to original
    original_total = len(original_hello_ids) + len(original_world_ids)
    sdk_tasks.check_running(foldered_name, original_total, allow_more=False)
    # check that original tasks weren't affected/relaunched in the process
    for pod_type, original_ids in (('hello', original_hello_ids), ('world', original_world_ids)):
        sdk_tasks.check_tasks_not_updated(foldered_name, pod_type, original_ids)

    # check that the world tasks are back to their prior state (also without changing task ids)
    final_world_ids = sdk_tasks.get_task_ids(foldered_name, 'world')
    assert final_world_ids == original_world_ids
def test_marathon_rack_not_found():
    """Install with an unsatisfiable rack constraint and verify the plan stays stuck.

    The install must not bring up a running task within 120 seconds; the plan and
    its first phase must then be ``IN_PROGRESS`` with the first step ``PREPARED`` or
    ``PENDING`` and the next two steps ``PENDING``.

    Raises:
        AssertionError: if a task came up despite the constraint, or the plan is in
            an unexpected state.
    """
    def fun():
        # Hand back the plan, or False when the CLI call fails; presumably
        # spin.time_wait_return keeps polling on a falsy value.
        try:
            return service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))
        except Exception:
            return False

    shakedown.install_package(PACKAGE_NAME,
                              service_name=SERVICE_NAME,
                              options_json=install.get_package_options(
                                  additional_options={'service':{'placement_constraint':'rack_id:LIKE:rack-foo-.*'}}
                              ),
                              wait_for_completion=False)
    try:
        tasks.check_running(PACKAGE_NAME, 1, timeout_seconds=120)
    except AssertionError:
        raise
    except Exception:
        # expected to fail; only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        # Raised explicitly (not via `assert False`) so the guard survives `python -O`.
        raise AssertionError("Should have failed to install")

    pl = spin.time_wait_return(fun)

    # check that first node is still (unsuccessfully) looking for a match:
    assert pl['status'] == 'IN_PROGRESS'
    assert pl['phases'][0]['status'] == 'IN_PROGRESS'

    # if so early, it can be PREPARED ?
    assert pl['phases'][0]['steps'][0]['status'] in ('PREPARED', 'PENDING')
    assert pl['phases'][0]['steps'][1]['status'] == 'PENDING'
    assert pl['phases'][0]['steps'][2]['status'] == 'PENDING'
    install.uninstall(SERVICE_NAME, PACKAGE_NAME)
Beispiel #14
0
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    """Add one ingest and one coordinator node; master and data tasks must not restart.

    Also bumps the module-level ``current_expected_task_count`` by two.
    """
    global current_expected_task_count

    master_ids_before = sdk_tasks.get_task_ids(service_name, "master")
    data_ids_before = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    scaled_up_counts = {
        "ingest_nodes": {"count": ingest_nodes_count + 1},
        "coordinator_nodes": {"count": coordinator_nodes_count + 1},
    }
    sdk_service.update_configuration(
        package_name,
        service_name,
        scaled_up_counts,
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)
    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", master_ids_before)
    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", data_ids_before)
def test_canary_first():
    """Issue ``plan continue deploy hello-deploy`` once and verify only ``hello-0`` deploys.

    After the ``hello-0:[server]`` step completes, the plan and both phases must be
    ``WAITING``, with the remaining steps in the waiting/pending states checked below.
    """
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    expected_tasks = ['hello-0']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    pod_list = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True)
    assert pod_list == expected_tasks

    # do not use service_plan always
    # when here, plan should always return properly
    plan = sdk_plan.wait_for_completed_step(
        config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]')
    log.info(plan)

    assert plan['status'] == 'WAITING'
    assert len(plan['phases']) == 2

    # (expected phase status, expected per-step statuses), in phase order.
    expected_phases = (
        ('WAITING', ('COMPLETE', 'WAITING', 'PENDING', 'PENDING')),
        ('WAITING', ('WAITING', 'WAITING', 'PENDING', 'PENDING')),
    )
    for phase, (phase_status, step_statuses) in zip(plan['phases'], expected_phases):
        assert phase['status'] == phase_status
        phase_steps = phase['steps']
        assert len(phase_steps) == 4
        for step, step_status in zip(phase_steps, step_statuses):
            assert step['status'] == step_status
def test_enable():
    """Set the test boolean to ``"true"`` and verify the task count goes from 3 to 6."""
    service = config.SERVICE_NAME

    sdk_plan.wait_for_completed_deployment(service)
    # NOTE(review): the return value is discarded; if this helper returns a bool
    # instead of asserting, this call checks nothing - confirm.
    sdk_plan.recovery_plan_is_empty(service)
    sdk_tasks.check_running(service, 3, timeout_seconds=30, allow_more=False)

    set_test_boolean("true")

    sdk_plan.wait_for_completed_deployment(service)
    sdk_tasks.check_running(service, 6, timeout_seconds=30, allow_more=False)
    sdk_plan.recovery_plan_is_empty(service)
def test_enable():
    """Set the test boolean to ``'true'`` and verify the task count goes from 3 to 6."""
    service = config.SERVICE_NAME

    sdk_plan.wait_for_completed_deployment(service)
    # NOTE(review): the return value is discarded; if this helper returns a bool
    # instead of asserting, this call checks nothing - confirm.
    sdk_plan.recovery_plan_is_empty(service)
    sdk_tasks.check_running(service, 3)

    set_test_boolean('true')

    sdk_plan.wait_for_completed_deployment(service)
    sdk_tasks.check_running(service, 6)
    sdk_plan.recovery_plan_is_empty(service)
Beispiel #18
0
def test_static_port_comes_online():
    """Install with ``STATIC_PORT_OPTIONS_DICT`` and verify all brokers are running."""
    broker_count = config.DEFAULT_BROKER_COUNT

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        broker_count,
        additional_options=STATIC_PORT_OPTIONS_DICT)

    sdk_tasks.check_running(config.SERVICE_NAME, broker_count)
def test_service_startup_rapid():
    """Restart pod ``kafka-0`` and verify the broker starts within the expected time.

    The start-up duration is the difference between the timestamps of the
    ' starting (kafka.server.KafkaServer)' and ' started (kafka.server.KafkaServer)'
    lines in the restarted task's log. If no 'starting' line is found, the time
    recorded right after issuing the restart is used instead.

    Raises:
        AssertionError: if the restart response is unexpected, no 'started' line is
            found within the retry budget, or start-up took longer than
            ``EXPECTED_KAFKA_STARTUP_SECONDS``.
    """
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    for _ in range(15):
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)
            # Spend the retry budget; without this the loop would spin forever
            # whenever the expected log lines never show up.
            retry_seconds_remaining -= retry_delay_seconds

    if started_time is None or starting_time is None:
        # Keep the last fetched log around for post-mortem debugging.
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
Beispiel #20
0
def replace_broker_pod(service_name=config.SERVICE_NAME):
    """Replace broker pod 0 and wait until it is relaunched and all brokers are back.

    Args:
        service_name: name of the service whose first broker pod is replaced.
    """
    pod_name = '{}-0'.format(config.DEFAULT_POD_TYPE)
    task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
    expected_brokers = config.DEFAULT_BROKER_COUNT

    ids_before_replace = sdk_tasks.get_task_ids(service_name, task_name)
    replace_cmd = 'pod replace {}'.format(pod_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, replace_cmd)

    sdk_tasks.check_tasks_updated(service_name, task_name, ids_before_replace)
    sdk_tasks.check_running(service_name, expected_brokers)
    # wait till all brokers register
    broker_count_check(expected_brokers, service_name=service_name)
Beispiel #21
0
def restart_broker_pods(service_name=config.SERVICE_NAME):
    """Restart every broker pod in turn, waiting for each to come back before the next.

    Args:
        service_name: name of the service whose broker pods are restarted.
    """
    broker_count = config.DEFAULT_BROKER_COUNT
    for broker_index in range(broker_count):
        pod_name = '{}-{}'.format(config.DEFAULT_POD_TYPE, broker_index)
        task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
        ids_before_restart = sdk_tasks.get_task_ids(service_name, task_name)

        restart_cmd = 'pod restart {}'.format(pod_name)
        restart_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, restart_cmd, json=True)
        assert len(restart_info) == 2
        restarted_tasks = restart_info['tasks']
        assert restarted_tasks[0] == task_name

        sdk_tasks.check_tasks_updated(service_name, task_name, ids_before_restart)
        sdk_tasks.check_running(service_name, broker_count)
Beispiel #22
0
def test_uninstall():
    """Set ``SDK_UNINSTALL`` in the Marathon env and verify the task count check for 0 passes."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['SDK_UNINSTALL'] = 'w00t'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    expected_running = 0
    sdk_tasks.check_running(config.SERVICE_NAME, expected_running)
def test_uninstall():
    """Set ``SDK_UNINSTALL`` in the Marathon env and verify that exactly 0 tasks remain."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config["env"]["SDK_UNINSTALL"] = "w00t"
    sdk_marathon.update_app(marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    expected_running = 0
    sdk_tasks.check_running(config.SERVICE_NAME, expected_running, allow_more=False)
Beispiel #24
0
def test_port_dynamic_to_dynamic_port():
    """Bump ``BROKER_CPUS`` and verify all brokers are relaunched and running again."""
    service = config.SERVICE_NAME
    broker_count = config.DEFAULT_BROKER_COUNT
    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)

    sdk_tasks.check_running(service, broker_count)

    ids_before_bump = sdk_tasks.get_task_ids(service, pod_prefix)

    sdk_marathon.bump_cpu_count_config(service, 'BROKER_CPUS')

    sdk_tasks.check_tasks_updated(service, pod_prefix, ids_before_bump)
    # all tasks are running
    sdk_tasks.check_running(service, broker_count)
Beispiel #25
0
def test_overlay_network_deployment_and_endpoints():
    """Verify broker and zookeeper endpoints are exposed and brokers are on the overlay."""
    package = config.PACKAGE_NAME
    service = config.SERVICE_NAME

    # double check
    sdk_tasks.check_running(service, config.DEFAULT_BROKER_COUNT)

    endpoints = sdk_networks.get_and_test_endpoints(package, service, "", 2)
    assert "broker" in endpoints, "broker is missing from endpoints {}".format(endpoints)
    assert "zookeeper" in endpoints, "zookeeper missing from endpoints {}".format(endpoints)

    broker_endpoints = sdk_networks.get_and_test_endpoints(package, service, "broker", 3)
    sdk_networks.check_endpoints_on_overlay(broker_endpoints)

    expected_zookeeper = 'master.mesos:2181/{}'.format(sdk_utils.get_zk_path(service))
    zookeeper = sdk_cmd.svc_cli(package, service, 'endpoints zookeeper')
    assert zookeeper.rstrip() == expected_zookeeper
def test_bump_node_counts():
    """Raise the data, ingest and coordinator node counts by one each and check tasks."""
    # Run this test last, as it changes the task count
    config = marathon.get_config(PACKAGE_NAME)
    env = config['env']

    count_keys = ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT')
    for count_key in count_keys:
        # Env values are strings: parse, increment, and store back as a string.
        env[count_key] = str(int(env[count_key]) + 1)

    marathon.update_app(PACKAGE_NAME, config)
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT + 3)
Beispiel #27
0
def test_changing_discovery_replaces_certificate_sans(hello_world_service):
    """
    Update service configuration to change discovery prefix of a task.
    Scheduler should update task and new SANs should be generated.

    The ``discovery`` task's ``server.crt`` is read before and after the update, and
    its subject alternative names must contain the autoip name built from the
    discovery prefix in effect at that time.
    """
    def read_certificate_sans(cert_task_id):
        # Load the end-entity certificate from the PEM encoded file printed by the task
        certificate = x509.load_pem_x509_certificate(
            task_exec(cert_task_id, 'cat server.crt').encode('ascii'),
            DEFAULT_BACKEND)
        san_extension = certificate.extensions.get_extension_for_oid(
            ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
        return [
            san.value for san in san_extension.value._general_names._general_names]

    def expected_san_for(prefix):
        return '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=prefix,
            service_name=config.SERVICE_NAME)

    # Look the tasks up by service name, consistent with the later lookups in this test.
    original_tasks = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'discovery')
    assert len(original_tasks) == 1, 'Expecting exactly one task ID'

    task_id = original_tasks[0]
    assert task_id

    assert expected_san_for(DISCOVERY_TASK_PREFIX) in read_certificate_sans(task_id)

    # Run task update with new discovery prefix
    new_prefix = DISCOVERY_TASK_PREFIX + '-new'
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['DISCOVERY_TASK_PREFIX'] = new_prefix
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'discovery', original_tasks)
    sdk_tasks.check_running(config.SERVICE_NAME, 4)
    new_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, "discovery")[0]
    assert task_id != new_task_id

    assert expected_san_for(new_prefix) in read_certificate_sans(new_task_id)
def test_port_dynamic_to_dynamic_port():
    """Raise ``BROKER_CPUS`` by 0.1 and verify all brokers are relaunched and running."""
    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    broker_ids = tasks.get_task_ids(SERVICE_NAME, pod_prefix)

    config = marathon.get_config(SERVICE_NAME)
    # Parse as float: int() raises ValueError on fractional values such as "0.5",
    # while float() yields the same result as before for integer strings.
    broker_cpus = float(config['env']['BROKER_CPUS'])
    config['env']['BROKER_CPUS'] = str(broker_cpus + 0.1)
    marathon.update_app(SERVICE_NAME, config)

    tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, broker_ids)
    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
Beispiel #29
0
def test_bump_node_counts():
    """Raise the data, ingest and coordinator node counts by one each and check tasks."""
    # Run this test last, as it changes the task count
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    env = marathon_config['env']

    count_keys = ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT')
    for count_key in count_keys:
        # Env values are strings: parse, increment, and store back as a string.
        env[count_key] = str(int(env[count_key]) + 1)

    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)
    expected_task_count = config.DEFAULT_TASK_COUNT + 3
    sdk_tasks.check_running(FOLDERED_SERVICE_NAME, expected_task_count)
def test_placement_max_one_per_hostname():
    """Install with a ``hostname:MAX_PER:1`` constraint and verify the plan completes."""
    placement_options = {'service': {'placement_constraint': 'hostname:MAX_PER:1'}}
    install.install(
        PACKAGE_NAME,
        DEFAULT_BROKER_COUNT,
        service_name=SERVICE_NAME,
        additional_options=placement_options,
    )
    # double check
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    plan = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))
    assert plan['status'] == 'COMPLETE'
    install.uninstall(SERVICE_NAME, PACKAGE_NAME)
Beispiel #31
0
def pre_test_setup():
    """Check the default number of tasks are running and the expected nodes exist."""
    expected_count = config.DEFAULT_TASK_COUNT
    sdk_tasks.check_running(config.SERVICE_NAME, expected_count)
    config.wait_for_expected_nodes_to_exist(task_count=expected_count)
Beispiel #32
0
def setup_function(function):
    """Check that the default number of tasks is running.

    Args:
        function: unused by this body.
    """
    expected_count = DEFAULT_TASK_COUNT
    tasks.check_running(PACKAGE_NAME, expected_count)
Beispiel #33
0
def check_running(service_name=PACKAGE_NAME):
    """Check that the service runs the task count given by ``configured_task_count``.

    Args:
        service_name: name of the service to check; defaults to ``PACKAGE_NAME``.
    """
    expected_count = configured_task_count(service_name)
    tasks.check_running(service_name, expected_count)
Beispiel #34
0
def check_running(count=DEFAULT_TASK_COUNT):
    """Check that ``count`` tasks of ``PACKAGE_NAME`` are running.

    Args:
        count: task count handed to ``tasks.check_running``; defaults to
            ``DEFAULT_TASK_COUNT``.
    """
    service_name = PACKAGE_NAME
    tasks.check_running(service_name, count)
Beispiel #35
0
def test_increase_cpu():
    """Bump ``HELLO_CPUS`` and walk the ``deploy`` plan through hello-0 and hello-1.

    After the config change the plan is expected to be ``WAITING`` with no task
    replaced yet. Each ``plan continue deploy hello-deploy`` is then expected to
    replace the next hello task, and the test finally waits for the deployment
    to complete.
    """

    def verify_phase(phase, phase_status, step_statuses):
        # Equivalent to one length assert followed by one assert per step index.
        assert phase['status'] == phase_status
        phase_steps = phase['steps']
        assert len(phase_steps) == len(step_statuses)
        for phase_step, wanted in zip(phase_steps, step_statuses):
            assert phase_step['status'] == wanted

    def verify_plan(plan, plan_status, hello_status, hello_step_statuses):
        assert plan['status'] == plan_status
        assert len(plan['phases']) == 2
        verify_phase(plan['phases'][0], hello_status, hello_step_statuses)
        # The second phase is expected to stay fully complete throughout.
        verify_phase(plan['phases'][1], 'COMPLETE', ['COMPLETE'] * 4)

    continue_hello_deploy = 'plan continue deploy hello-deploy'

    hello_0_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0-server')
    sdk_marathon.bump_cpu_count_config(config.SERVICE_NAME, 'HELLO_CPUS')

    waiting_plan = sdk_plan.wait_for_plan_status(
        config.SERVICE_NAME, 'deploy', 'WAITING')
    log.info(waiting_plan)
    verify_plan(
        waiting_plan, 'WAITING', 'WAITING',
        ['WAITING', 'WAITING', 'PENDING', 'PENDING', 'PENDING'])

    # check that all prior tasks are still running, no changes yet
    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3', 'hello-4',
        'world-0', 'world-1', 'world-2', 'world-3',
    ]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    listed_pods = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True)
    assert listed_pods == expected_tasks
    assert hello_0_ids == sdk_tasks.get_task_ids(
        config.SERVICE_NAME, 'hello-0-server')

    # First continue: hello-0-server should get new task ids.
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, continue_hello_deploy)

    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME, 'hello-0-server', hello_0_ids)
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))

    after_first_step = sdk_plan.wait_for_step_status(
        config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]',
        'COMPLETE')
    log.info(after_first_step)
    verify_plan(
        after_first_step, 'WAITING', 'WAITING',
        ['COMPLETE', 'WAITING', 'PENDING', 'PENDING', 'PENDING'])

    # Second continue: hello-1-server should get new task ids.
    hello_1_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-1-server')
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, continue_hello_deploy)
    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME, 'hello-1-server', hello_1_ids)

    finished_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info(finished_plan)
    verify_plan(finished_plan, 'COMPLETE', 'COMPLETE', ['COMPLETE'] * 5)
Beispiel #36
0
def test_secrets_update():
    """Check that restarted pods see updated secret values.

    Steps:
      1) create Secrets
      2) install examples/secrets.yml
      3) update Secrets
      4) restart task
      5) verify Secrets content (updated after restart)
      6) delete Secrets
    """
    secrets_path = "{}/".format(config.SERVICE_NAME)

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets(secrets_path)

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        NUM_HELLO + NUM_WORLD,
        additional_options=secret_options,
    )

    # tasks will fail if secret file is not created
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    update_templates = (
        "security secrets update --value={} {}/secret1",
        "security secrets update --value={} {}/secret2",
        "security secrets update --value={} {}/secret3",
    )
    for template in update_templates:
        sdk_cmd.run_cli(
            template.format(secret_content_alternative, config.SERVICE_NAME))

    # Verify with hello-0 and world-0, just check with one of the pods
    old_hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    old_world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # restart pods to retrieve new secret's content
    for restart_command in ('pod restart hello-0', 'pod restart world-0'):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME, restart_command)

    # wait pod restart to complete
    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME, "hello-0-server", old_hello_ids)
    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME, "world-0-server", old_world_ids)

    # wait till it is running
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # make sure content is changed, world pod first and then hello pod
    secret_reads = (
        ("world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'"),
        ("world-0-server", "cat WORLD_SECRET2_FILE"),
        ("world-0-server", "cat {}/secret3".format(config.SERVICE_NAME)),
        ("hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'"),
        ("hello-0-server", "cat HELLO_SECRET1_FILE"),
        ("hello-0-server", "cat HELLO_SECRET2_FILE"),
    )
    for task_name, command in secret_reads:
        assert secret_content_alternative == read_secret(task_name, command)

    # clean up and delete secrets
    delete_secrets(secrets_path)
Beispiel #37
0
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Wait for completed deployment and recovery, then check ``count`` running tasks."""
    plan_timeout_seconds = 25 * 60
    sdk_plan.wait_for_completed_deployment(
        PACKAGE_NAME, timeout_seconds=plan_timeout_seconds)
    sdk_plan.wait_for_completed_recovery(
        PACKAGE_NAME, timeout_seconds=plan_timeout_seconds)
    sdk_tasks.check_running(PACKAGE_NAME, count)
Beispiel #38
0
def pre_test_setup():
    """Check the foldered service runs the currently expected tasks and nodes."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_tasks.check_running(service, current_expected_task_count)
    config.wait_for_expected_nodes_to_exist(
        service_name=service,
        task_count=current_expected_task_count,
    )
Beispiel #39
0
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Run the deploy and recovery plan checks, then check ``count`` running tasks."""
    for plan_name in ("deploy", "recovery"):
        service_plan_complete(plan_name)
    tasks.check_running(PACKAGE_NAME, count)
Beispiel #40
0
def check_running(service_name=SERVICE_NAME):
    """Check that ``service_name`` runs as many tasks as ``configured_task_count`` reports."""
    expected_count = configured_task_count(service_name)
    sdk_tasks.check_running(service_name, expected_count)
Beispiel #41
0
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    """Check that a supervised Structured Streaming job resumes after its driver is killed.

    The test feeds one message set to Kafka, submits the streaming job with
    ``--supervise``, waits for the first output, kills the driver process,
    feeds a second message set and then waits for both outputs again.

    ``kerberized_spark`` and ``kerberized_kafka`` are not referenced in the
    body; presumably they are fixtures requested for their setup side effects.
    """
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME,
                        KAFKA_SERVICE_NAME,
                        'endpoints broker',
                        json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(
        utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    # common_args already carries the Kafka Kerberos flags; do not append them
    # a second time at the call sites below.
    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_a)

    spark_submit_args = [
        "--supervise", "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2", "--conf",
        "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(kafka_brokers, KAFKA_TEST_TOPIC,
                                            HDFS_CHECKPOINT_DIR,
                                            SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS +
                                            spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}|  {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'],
                                   pattern=driver_regex)

    # sending more data to Kafka; same arguments as the first producer run
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}|  {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))

    # NOTE(review): driver_task_id was obtained before the driver was killed;
    # confirm it still identifies the restarted driver's output.
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
Beispiel #42
0
def setup_function(function):
    """Check the default task count, then wait for the expected nodes; ``function`` is unused."""
    expected_count = DEFAULT_TASK_COUNT
    tasks.check_running(PACKAGE_NAME, expected_count)
    wait_for_expected_nodes_to_exist()
Beispiel #43
0
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill ``scheduler.Main`` on the Marathon host and expect unchanged "master" task ids."""
    ids_before_kill = tasks.get_task_ids(PACKAGE_NAME, "master")

    marathon_host = get_marathon_host()
    shakedown.kill_process_on_host(marathon_host, "scheduler.Main")
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)

    ids_after_kill = tasks.get_task_ids(PACKAGE_NAME, "master")
    assert ids_before_kill == ids_after_kill
Beispiel #44
0
def pre_test_setup():
    """Check ``foldered_name`` runs the currently expected tasks and nodes."""
    expected_count = current_expected_task_count
    sdk_tasks.check_running(foldered_name, expected_count)
    config.wait_for_expected_nodes_to_exist(
        service_name=foldered_name,
        task_count=expected_count,
    )
Beispiel #45
0
def update_app(service_name, options, expected_task_count):
    """Merge ``options`` into the app's env, push the update and wait for the deployment.

    Afterwards checks that ``expected_task_count`` tasks are running.
    """
    app_definition = sdk_marathon.get_config(service_name)
    app_env = app_definition['env']
    app_env.update(options)
    sdk_marathon.update_app(service_name, app_definition)
    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_tasks.check_running(service_name, expected_task_count)
Beispiel #46
0
def test_increase_count():
    """Bump ``HELLO_COUNT`` and check the new pod only starts after ``plan continue``.

    The extra hello task must not appear while the deploy plan is ``WAITING``.
    After ``plan continue deploy hello-deploy`` the ninth task is expected to
    run and the plan is expected to reach ``COMPLETE``.
    """

    def verify_phase(phase, phase_status, step_statuses):
        # One length assert followed by one status assert per step.
        assert phase['status'] == phase_status
        steps = phase['steps']
        assert len(steps) == len(step_statuses)
        for step, wanted in zip(steps, step_statuses):
            assert step['status'] == wanted

    def verify_plan(plan, plan_status, hello_status, hello_step_statuses):
        assert plan['status'] == plan_status
        assert len(plan['phases']) == 2
        verify_phase(plan['phases'][0], hello_status, hello_step_statuses)
        # The second phase is expected to remain fully complete.
        verify_phase(plan['phases'][1], 'COMPLETE', ['COMPLETE'] * 4)

    sdk_marathon.bump_task_count_config(config.SERVICE_NAME, 'HELLO_COUNT')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3', 'world-0', 'world-1',
        'world-2', 'world-3'
    ]
    try:
        sdk_tasks.check_running(config.SERVICE_NAME,
                                len(expected_tasks) + 1,
                                timeout_seconds=60)
    except AssertionError:
        # An assertion failure is a real test failure, not the expected timeout.
        raise
    except Exception:
        # Expected to fail: the extra task must not start yet. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        pass
    else:
        raise AssertionError("Should not start task now")

    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME,
                           config.SERVICE_NAME,
                           'pod list',
                           json=True) == expected_tasks

    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy',
                                       'WAITING')
    log.info(pl)
    # Only the newly added fifth hello step is still waiting.
    verify_plan(pl, 'WAITING', 'WAITING',
                ['COMPLETE', 'COMPLETE', 'COMPLETE', 'COMPLETE', 'WAITING'])

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'plan continue deploy hello-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3', 'hello-4', 'world-0',
        'world-1', 'world-2', 'world-3'
    ]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME,
                           config.SERVICE_NAME,
                           'pod list',
                           json=True) == expected_tasks

    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy',
                                       'COMPLETE')
    log.info(pl)
    verify_plan(pl, 'COMPLETE', 'COMPLETE', ['COMPLETE'] * 5)
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Wait for completed deployment and recovery of the foldered service, then check ``count`` tasks."""
    plan_timeout_seconds = 20 * 60
    plan.wait_for_completed_deployment(
        FOLDERED_SERVICE_NAME, timeout_seconds=plan_timeout_seconds)
    plan.wait_for_completed_recovery(
        FOLDERED_SERVICE_NAME, timeout_seconds=plan_timeout_seconds)
    tasks.check_running(FOLDERED_SERVICE_NAME, count)
Beispiel #48
0
def test_pipeline(kerberos_flag,
                  stop_count,
                  jar_uri,
                  keytab_secret,
                  spark_service_name,
                  jaas_uri=None):
    """Run a Kafka producer job and a consumer job through Spark and check the consumer output.

    Args:
        kerberos_flag: the string ``"true"`` enables the Kerberos settings; it
            is also passed through to both applications as an argument.
        stop_count: word count passed to the consumer and expected in its
            ``"Read {} words"`` output.
        jar_uri: URI of the application jar submitted for both jobs.
        keytab_secret: secret name used for the driver and executor keytab.
        spark_service_name: Spark service the jobs are submitted to.
        jaas_uri: optional URI of a JAAS config added to ``spark.mesos.uris``.
            When kerberized and not given, the bundled
            ``spark-kafka-client-jaas.conf`` is uploaded and used instead.
    """
    stop_count = str(stop_count)
    kerberized = kerberos_flag == "true"
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME,
                                 KAFKA_SERVICE_NAME,
                                 'endpoints broker',
                                 json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    if kerberized and jaas_uri is None:
        jaas_path = os.path.join(THIS_DIR, "resources",
                                 "spark-kafka-client-jaas.conf")
        s3.upload_file(jaas_path)
        _uri = s3.http_url("spark-kafka-client-jaas.conf")
        uris += ",{}".format(_uri)
    elif jaas_uri is not None:
        # Only append a caller-supplied URI; without this guard a non-kerberized
        # run with no jaas_uri would add the literal string "None" to the list.
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ]

    kerberos_args = [
        "--conf",
        "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf",
        "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf",
        "spark.mesos.task.labels=DCOS_SPACE:/{}".format(spark_service_name),
        "--conf",
        "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf",
        "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
        "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf",
        "spark.executor.extraJavaOptions="
        "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = [
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder"
    ] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = [
        "--conf", "spark.cores.max=4", "--class", "KafkaConsumer"
    ] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    utils.run_tests(app_url=jar_uri,
                    app_args=consumer_args,
                    expected_output="Read {} words".format(stop_count),
                    service_name=spark_service_name,
                    args=consumer_config)

    # The producer job is killed explicitly once the consumer check is done.
    utils.kill_driver(producer_id, spark_service_name)
def test_profile_mount_volumes():
    """Check that ``NUM_HELLO`` tasks of the service are running."""
    expected_count = NUM_HELLO
    sdk_tasks.check_running(config.SERVICE_NAME, expected_count)
Beispiel #50
0
def test_secrets_config_update():
    """Check that pods read re-created secrets after a config update.

    Steps:
      1) install examples/secrets.yml
      2) create new Secrets, delete old Secrets
      3) update configuration with new Secrets
      4) verify secret content (using new Secrets after config update)
    """

    def verify_secret_content(expected_content, world_secret3_command):
        # get task ids - only first pods, looked up anew on every call
        hello_ids = tasks.get_task_ids(PACKAGE_NAME, "hello-0")
        world_ids = tasks.get_task_ids(PACKAGE_NAME, "world-0")
        # One check per secret, world pod first; hello tasks has container image
        checks = (
            (world_ids, "bash -c 'echo $WORLD_SECRET1_ENV'"),
            (world_ids, "cat WORLD_SECRET2_FILE"),
            (world_ids, world_secret3_command),
            (hello_ids, "bash -c 'echo $HELLO_SECRET1_ENV'"),
            (hello_ids, "cat HELLO_SECRET1_FILE"),
            (hello_ids, "cat HELLO_SECRET2_FILE"),
        )
        for task_ids, command in checks:
            assert expected_content == task_exec(task_ids[0], command)

    secrets_path = "{}/".format(PACKAGE_NAME)

    install.uninstall(PACKAGE_NAME)

    create_secrets(secrets_path)

    install.install(
        PACKAGE_NAME,
        NUM_HELLO + NUM_WORLD,
        additional_options=secret_options,
    )

    # launch will fail if secrets are not available or not accessible
    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # tasks will fail if secret file is not created
    tasks.check_running(PACKAGE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content, one from each pod type: default values first
    verify_secret_content(secret_content_default,
                          "cat {}/secret3".format(PACKAGE_NAME))

    # clean up and delete secrets (defaults)
    delete_secrets(secrets_path)

    # create new secrets with new content -- New Value
    create_secrets(secret_content_arg=secret_content_alternative)

    marathon_config = marathon.get_config(PACKAGE_NAME)
    new_secret_env = (
        ('HELLO_SECRET1', 'secret1'),
        ('HELLO_SECRET2', 'secret2'),
        ('WORLD_SECRET1', 'secret1'),
        ('WORLD_SECRET2', 'secret2'),
        ('WORLD_SECRET3', 'secret3'),
    )
    for env_key, secret_name in new_secret_env:
        marathon_config['env'][env_key] = secret_name

    # config update
    marathon.update_app(PACKAGE_NAME, marathon_config)

    # wait till plan is complete - pods are supposed to restart
    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # all tasks are running
    tasks.check_running(PACKAGE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content is changed
    verify_secret_content(secret_content_alternative, "cat secret3")

    # clean up and delete secrets
    delete_secrets()
Beispiel #51
0
def test_secrets_config_update():
    """Check that pods read re-created secrets after a config update.

    Steps:
      1) install examples/secrets.yml
      2) create new Secrets, delete old Secrets
      3) update configuration with new Secrets
      4) verify secret content (using new Secrets after config update)
    """

    def verify_secret_content(expected_content, world_secret3_command):
        # One check per secret, world pod first; hello tasks has container image
        checks = (
            ("world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'"),
            ("world-0-server", "cat WORLD_SECRET2_FILE"),
            ("world-0-server", world_secret3_command),
            ("hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'"),
            ("hello-0-server", "cat HELLO_SECRET1_FILE"),
            ("hello-0-server", "cat HELLO_SECRET2_FILE"),
        )
        for task_name, command in checks:
            assert expected_content == read_secret(task_name, command)

    secrets_path = "{}/".format(config.SERVICE_NAME)

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets(secrets_path)

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        NUM_HELLO + NUM_WORLD,
        additional_options=secret_options,
    )

    # tasks will fail if secret file is not created
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content, one from each pod type: default values first
    verify_secret_content(secret_content_default,
                          "cat {}/secret3".format(config.SERVICE_NAME))

    # clean up and delete secrets (defaults)
    delete_secrets(secrets_path)

    # create new secrets with new content -- New Value
    create_secrets(secret_content_arg=secret_content_alternative)

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    new_secret_env = (
        ('HELLO_SECRET1', 'secret1'),
        ('HELLO_SECRET2', 'secret2'),
        ('WORLD_SECRET1', 'secret1'),
        ('WORLD_SECRET2', 'secret2'),
        ('WORLD_SECRET3', 'secret3'),
    )
    for env_key, secret_name in new_secret_env:
        marathon_config['env'][env_key] = secret_name

    # config update
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    # wait till plan is complete - pods are supposed to restart
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content is changed
    verify_secret_content(secret_content_alternative, "cat secret3")

    # clean up and delete secrets
    delete_secrets()
Beispiel #52
0
def test_soak_secrets_framework_alive():
    """Wait for the framework's deployment to complete and check all hello/world tasks run."""
    expected_count = NUM_HELLO + NUM_WORLD
    sdk_plan.wait_for_completed_deployment(FRAMEWORK_NAME)
    sdk_tasks.check_running(FRAMEWORK_NAME, expected_count)
Beispiel #53
0
def _set_xpack(service_name, is_enabled):
    """Set ``TASKCFG_ALL_XPACK_ENABLED`` to ``is_enabled`` and wait for the redeploy.

    Afterwards checks that ``DEFAULT_TASK_COUNT`` tasks are running.
    """
    app_definition = marathon.get_config(service_name)
    app_env = app_definition['env']
    app_env['TASKCFG_ALL_XPACK_ENABLED'] = is_enabled
    marathon.update_app(service_name, app_definition)
    sdk_plan.wait_for_completed_deployment(service_name)
    tasks.check_running(service_name, DEFAULT_TASK_COUNT)
Beispiel #54
0
def test_service_startup_rapid():
    """Restart ``kafka-0`` and check the broker start-up time from its task log.

    The time between the ``starting`` and ``started`` KafkaServer log lines
    must not exceed ``EXPECTED_KAFKA_STARTUP_SECONDS``. The log is polled every
    ``STARTUP_POLL_DELAY_SECONDS`` for at most
    ``EXPECTED_KAFKA_STARTUP_SECONDS + EXPECTED_DCOS_STARTUP_SECONDS``.
    """
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = "kafka-0"
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME,
                                              task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ""
    for _attempt in range(15):
        _, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                       config.SERVICE_NAME,
                                       "topic producer_test test 100")
        if "records sent" in stdout:
            break

    _, jsonobj, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        "pod restart {}".format(task_short_name),
        parse_json=True,
    )
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == task_short_name
    assert jsonobj["tasks"] == ["{}-broker".format(task_short_name)]

    # Used as the start time if the "starting" line is never found in the log.
    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME,
                                  "{}-".format(config.DEFAULT_POD_TYPE),
                                  [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME,
                                              task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None
                                             or started_time is None):
        # NOTE(review): svc_cli above is unpacked as a 3-tuple while this
        # run_cli result is used directly as a string -- confirm run_cli's
        # return type.
        stdout = sdk_cmd.run_cli(
            "task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split("\n")
        # Scan newest-first so the most recent matching lines are picked up.
        for log_line in reversed(task_lines):
            if starting_time is None and " starting (kafka.server.KafkaServer)" in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and " started (kafka.server.KafkaServer)" in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)
            # Consume the retry budget; without this the loop never times out
            # when the log lines do not show up.
            retry_seconds_remaining -= retry_delay_seconds

    if started_time is None or starting_time is None:
        # Keep the last fetched log output around for debugging.
        with open("/tmp/kafka_startup_stdout", "w") as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time -
            starting_time).total_seconds() <= max_restart_seconds
Beispiel #55
0
def pre_test_setup():
    """Check the foldered service runs the default task count and its expected nodes."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    expected_count = config.DEFAULT_TASK_COUNT
    sdk_tasks.check_running(service, expected_count)
    config.wait_for_expected_nodes_to_exist(service_name=service)
Beispiel #56
0
def pre_test_setup() -> None:
    """Check the expected tasks are running, then wait for the expected nodes.

    The node count passed on is the expected task count minus
    ``current_non_node_task_count``.
    """
    sdk_tasks.check_running(service_name, current_expected_task_count)
    node_task_count = current_expected_task_count - current_non_node_task_count
    config.wait_for_expected_nodes_to_exist(
        service_name=service_name, task_count=node_task_count)
Beispiel #57
0
def test_static_port_comes_online(static_port_config):
    """Check the default broker count is running; ``static_port_config`` is not read here."""
    expected_brokers = DEFAULT_BROKER_COUNT
    tasks.check_running(SERVICE_NAME, expected_brokers)
Beispiel #58
0
def xpack(is_enabled):
    """Set ``TASKCFG_ALL_XPACK_ENABLED`` to ``is_enabled`` and wait for the redeploy.

    Afterwards checks that ``DEFAULT_TASK_COUNT`` tasks are running.
    """
    app_definition = marathon.get_config(PACKAGE_NAME)
    app_env = app_definition['env']
    app_env['TASKCFG_ALL_XPACK_ENABLED'] = is_enabled
    marathon.update_app(PACKAGE_NAME, app_definition)
    sdk_plan.wait_for_completed_deployment(PACKAGE_NAME)
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)
Beispiel #59
0
def test_service_health():
    """Check the no-ingest task count and nodes, then assert the service is healthy."""
    expected_count = config.NO_INGEST_TASK_COUNT
    sdk_tasks.check_running(config.SERVICE_NAME, expected_count)
    config.wait_for_expected_nodes_to_exist(task_count=expected_count)
    healthy = shakedown.service_healthy(config.SERVICE_NAME)
    assert healthy
Beispiel #60
0
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    """Check that a ``--supervise`` streaming job comes back after its driver is killed.

    Steps:
      1. Submit ``RecoverableNetworkWordCount`` with ``--supervise``.
      2. Wait for the job to register and have one running task.
      3. ``kill -9`` every process on the job's host whose command line
         contains the job's framework id.
      4. Wait for the job to be registered and running again.
      5. Kill the driver via ``utils.kill_driver`` and wait for the job to
         disappear.

    ``kerberized_spark`` and ``hdfs_with_kerberos`` are not used in the
    body; presumably they are pytest fixtures requested for their setup
    side effects -- confirm against the fixture definitions.
    """
    job_service_name = "RecoverableNetworkWordCount"

    # Poll once per second for up to ten minutes; a falsy return value
    # makes `retrying` call the function again.
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        return svc is None

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver %s", driver_id)
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])

    # The success flag of the `ps` pipeline is intentionally not checked:
    # a grep with no matches simply yields no pids below.
    _, ps_output = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)

    # The pid is the second whitespace-separated column of `ps aux`.
    # Blank or truncated lines are skipped instead of raising IndexError.
    pids = [
        columns[1]
        for columns in (line.split() for line in ps_output.splitlines())
        if len(columns) > 1
    ]
    if not pids:
        log.warning("No process matching '%s' found on %s",
                    driver_regex, service_info['hostname'])

    for pid in pids:
        killed, _ = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)

        if killed:
            log.info("Killed pid: %s", pid)
        else:
            log.warning("Unable to kill pid: %s", pid)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    kill_output = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("%s", kill_output)
    kill_result = json.loads(kill_output)
    assert kill_result["success"], "Failed to kill spark streaming job"
    wait_job_present(False)