def test_modify_app_config():
    """Bump one app config value and verify the recovery plan is unchanged afterwards."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(service)
    recovery_before = sdk_plan.get_plan(service, "recovery")

    field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    # Snapshot the task ids of every pod type before the configuration changes.
    ids_before = [
        (pod_type, sdk_tasks.get_task_ids(service, pod_type))
        for pod_type in ('journal', 'name', 'data')
    ]

    app = sdk_marathon.get_config(service)
    log.info('marathon config: ')
    log.info(app)
    current_expiry = int(app['env'][field])
    app['env'][field] = str(current_expiry + 1)
    sdk_marathon.update_app(service, app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=service)
    for pod_type, old_ids in ids_before:
        sdk_tasks.check_tasks_updated(service, pod_type, old_ids)

    # The config rollout itself must not have altered the recovery plan.
    sdk_plan.wait_for_completed_recovery(service)
    recovery_after = sdk_plan.get_plan(service, "recovery")
    assert recovery_before == recovery_after
def test_secrets_basic():
    """Check that tasks keep running with their secrets after a pod replace.

    Steps:
      1) create Secrets
      2) install examples/secrets.yml
      3) if secret file is not created, tasks will fail
      4) wait till deployment finishes
      5) do replace operation
      6) ensure all tasks are running
      7) delete Secrets
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD, additional_options=secret_options)

    # Snapshot the ids of the tasks about to be replaced. Both lookups must use
    # the same task names that are passed to check_tasks_updated() below.
    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0-server', world_tasks_0)

    # tasks will fail if secret files are not created by mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_increase_decrease_world_nodes():
    """Scale the world node count up by two and back down without relaunching existing tasks."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(service)

    hello_ids_before = sdk_tasks.get_task_ids(service, 'hello')
    world_ids_before = sdk_tasks.get_task_ids(service, 'world')
    log.info('world ids: ' + str(world_ids_before))

    # Scale up: two additional world nodes.
    sdk_marathon.bump_task_count_config(service, 'WORLD_COUNT', 2)

    config.check_running(service)
    sdk_tasks.check_tasks_not_updated(service, 'world', world_ids_before)

    # The world task list must have grown by exactly two entries.
    world_ids_scaled_up = sdk_tasks.get_task_ids(service, 'world')
    assert 2 + len(world_ids_before) == len(world_ids_scaled_up)

    # Scale down: remove the two world nodes again.
    sdk_marathon.bump_task_count_config(service, 'WORLD_COUNT', -2)

    config.check_running(service)
    # The removal is carried out by the decommission plan; wait for it to finish.
    sdk_plan.wait_for_completed_plan(service, 'decommission')

    # The total task count must be exactly what it was at the start.
    expected_total = len(hello_ids_before) + len(world_ids_before)
    sdk_tasks.check_running(service, expected_total, allow_more=False)

    # None of the original tasks may have been relaunched along the way.
    for pod_type, ids in (('hello', hello_ids_before), ('world', world_ids_before)):
        sdk_tasks.check_tasks_not_updated(service, pod_type, ids)

    # The world tasks are back to their prior state, with unchanged task ids.
    world_ids_after = sdk_tasks.get_task_ids(service, 'world')
    assert world_ids_before == world_ids_after
Example #4
0
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    """Add one ingest and one coordinator node; master and data tasks must keep their ids."""
    global current_expected_task_count

    master_ids_before = sdk_tasks.get_task_ids(service_name, "master")
    data_ids_before = sdk_tasks.get_task_ids(service_name, "data")

    # Read the current node counts from the service description.
    _, described, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)
    ingest_count = get_in(["ingest_nodes", "count"], described)
    coordinator_count = get_in(["coordinator_nodes", "count"], described)

    new_options = {
        "ingest_nodes": {"count": ingest_count + 1},
        "coordinator_nodes": {"count": coordinator_count + 1},
    }
    # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
    # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
    # with `sdk_tasks.check_running`.
    sdk_service.update_configuration(
        package_name,
        service_name,
        new_options,
        current_expected_task_count,
        wait_for_deployment=False,
    )

    # One ingest plus one coordinator node: two more tasks than before.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)

    # Neither master nor data nodes should have restarted.
    for pod_type, ids in (("master", master_ids_before), ("data", data_ids_before)):
        sdk_tasks.check_tasks_not_updated(service_name, pod_type, ids)
Example #5
0
def test_soak_secrets_update():
    """Update the three secrets, restart the first pods and check the tasks see the new content."""
    secret_content_alternative = "hello-world-secret-data-alternative"
    test_soak_secrets_framework_alive()

    sdk_cmd.run_cli("package install --cli dcos-enterprise-cli --yes")
    sdk_cmd.run_cli("package install --cli hello-world --yes")

    # Point every secret at the alternative content.
    for update_template in (
        "security secrets update --value={} secrets/secret1",
        "security secrets update --value={} secrets/secret2",
        "security secrets update --value={} secrets/secret3",
    ):
        sdk_cmd.run_cli(update_template.format(secret_content_alternative))
    test_soak_secrets_restart_hello0()

    # get new task ids - only first pod
    hello_tasks = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_tasks = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # The world task must now see the alternative content everywhere.
    for command in (
        "bash -c 'echo $WORLD_SECRET1_ENV'",
        "cat WORLD_SECRET2_FILE",
        "cat secrets/secret3",
    ):
        assert secret_content_alternative == task_exec(world_tasks[0], command)

    # ... and so must the hello task.
    for command in (
        "bash -c 'echo $HELLO_SECRET1_ENV'",
        "cat HELLO_SECRET1_FILE",
        "cat HELLO_SECRET2_FILE",
    ):
        assert secret_content_alternative == task_exec(hello_tasks[0], command)

    # revert back to some other value
    for revert_command in (
        "security secrets update --value=SECRET1 secrets/secret1",
        "security secrets update --value=SECRET2 secrets/secret2",
        "security secrets update --value=SECRET3 secrets/secret3",
    ):
        sdk_cmd.run_cli(revert_command)
    test_soak_secrets_restart_hello0()
Example #6
0
def test_modify_app_config_rollback():
    """Change an app config value, roll it back, and check journal tasks restart while data tasks do not."""
    field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids_before = sdk_tasks.get_task_ids(foldered_name, "data")

    # Two separate fetches: one copy is kept pristine for the rollback, the other is modified.
    pristine_config = sdk_marathon.get_config(foldered_name)
    changed_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(changed_config)
    original_expiry = int(changed_config["env"][field])
    log.info("expiry ms: " + str(original_expiry))
    changed_config["env"][field] = str(original_expiry + 1)
    sdk_marathon.update_app(changed_config, timeout=15 * 60)

    # Wait until the journal nodes have picked up the change, then re-snapshot their ids.
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(pristine_config)
    # Rollback: push the untouched configuration again.
    sdk_marathon.update_app(pristine_config)

    # The journal nodes restart once more to return to the old configuration.
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    restored_config = sdk_marathon.get_config(foldered_name)
    assert int(restored_config["env"][field]) == original_expiry

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids_before)
def test_modify_app_config_rollback():
    """Apply a config change and then roll it back; journal tasks restart twice, data tasks never."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    expiry_key = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'

    journal_snapshot = sdk_tasks.get_task_ids(service, 'journal')
    data_snapshot = sdk_tasks.get_task_ids(service, 'data')

    # Keep an unmodified copy of the app definition to roll back to later.
    rollback_target = sdk_marathon.get_config(service)
    updated = sdk_marathon.get_config(service)
    log.info('marathon config: ')
    log.info(updated)
    expiry_before = int(updated['env'][expiry_key])
    log.info('expiry ms: ' + str(expiry_before))
    updated['env'][expiry_key] = str(expiry_before + 1)
    sdk_marathon.update_app(service, updated, timeout=15 * 60)

    # First restart: journal nodes pick up the modified value.
    sdk_tasks.check_tasks_updated(service, 'journal', journal_snapshot)
    journal_snapshot = sdk_tasks.get_task_ids(service, 'journal')

    log.info('old config: ')
    log.info(rollback_target)
    # Put the old config back (rollback)
    sdk_marathon.update_app(service, rollback_target)

    # Second restart: journal nodes return to their old configuration.
    sdk_tasks.check_tasks_updated(service, 'journal', journal_snapshot)
    config.check_healthy(service_name=service)

    current = sdk_marathon.get_config(service)
    assert int(current['env'][expiry_key]) == expiry_before

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(service, 'data', data_snapshot)
Example #8
0
def test_modify_app_config():
    """Check that modifying the app config restarts all tasks without changing the recovery plan."""
    sdk_plan.wait_for_completed_recovery(foldered_name)
    plan_before = sdk_plan.get_plan(foldered_name, "recovery")

    expiry_key = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    pod_types = ("journal", "name", "data")
    # Task ids per pod type as they were before the update.
    snapshots = [(pod_type, sdk_tasks.get_task_ids(foldered_name, pod_type)) for pod_type in pod_types]

    app_definition = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(app_definition)
    previous_expiry = int(app_definition["env"][expiry_key])
    app_definition["env"][expiry_key] = str(previous_expiry + 1)
    sdk_marathon.update_app(app_definition, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    for pod_type, previous_ids in snapshots:
        sdk_tasks.check_tasks_updated(foldered_name, pod_type, previous_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    plan_after = sdk_plan.get_plan(foldered_name, "recovery")
    assert plan_before == plan_after
def test_service_startup_rapid():
    """Restart broker kafka-0 and assert that its startup stays within the expected time.

    The startup duration is taken from the restarted broker's task log as the
    gap between the ``starting (kafka.server.KafkaServer)`` and
    ``started (kafka.server.KafkaServer)`` lines. If the ``starting`` line
    cannot be found, the time just after the restart command was issued is
    used instead.
    """
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    for _ in range(15):
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    # Poll against a wall-clock deadline so the loop is guaranteed to end even
    # when the expected log lines never show up.
    deadline = time.monotonic() + max_restart_seconds + startup_padding_seconds
    while time.monotonic() < deadline and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        # Walk the log newest-first so the most recent startup is the one measured.
        for log_line in reversed(stdout.split('\n')):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)

    if started_time is None or starting_time is None:
        # Keep the last output seen around for debugging the failure.
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
Example #10
0
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three node Cassandra ring)

    Args:
        package_name: Package name passed to the service CLI.
        service_name: Name of the service whose pod is replaced.
        pod_name: Pod on which `pod replace` is run.
        recovery_timeout_s: Timeout handed to the recovery-plan waits.
        pods_with_updated_tasks: Additional pods whose tasks are expected to
            be restarted as a consequence of the replace; None means none.

    Raises:
        AssertionError: If the `pod list` command returns a non-zero code.
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_to_update = set((pods_with_updated_tasks or []) + [pod_name])

    # Snapshot each pod's OWN task ids: they are compared per pod after the replace.
    tasks_to_replace = {
        pod: set(sdk_tasks.get_task_ids(service_name, pod)) for pod in pods_to_update
    }

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {
        pod: set(sdk_tasks.get_task_ids(service_name, pod))
        for pod in pod_list - pods_to_update
    }

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
Example #11
0
def test_kill_journal_node():
    """Kill the journalnode process of journal-0 and expect recovery without name/data relaunches."""
    service = FOLDERED_SERVICE_NAME
    killed_ids = sdk_tasks.get_task_ids(service, 'journal-0')
    # Pod types whose tasks must survive the kill untouched.
    bystanders = [
        (pod_type, sdk_tasks.get_task_ids(service, pod_type))
        for pod_type in ('name', 'data')
    ]

    journal_host = sdk_hosts.system_host(service, 'journal-0-node')
    sdk_tasks.kill_task_with_pattern('journalnode', journal_host)
    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'journal', killed_ids)
    for pod_type, ids in bystanders:
        sdk_tasks.check_tasks_not_updated(service, pod_type, ids)
Example #12
0
def test_bump_journal_cpus():
    """Raise the journal CPU setting; journal tasks must restart, name tasks must not."""
    service = FOLDERED_SERVICE_NAME
    journal_before = sdk_tasks.get_task_ids(service, 'journal')
    name_before = sdk_tasks.get_task_ids(service, 'name')
    log.info('journal ids: ' + str(journal_before))

    sdk_marathon.bump_cpu_count_config(service, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(service, 'journal', journal_before)
    # Updating the journal nodes should not crash (and thereby relaunch) any name node.
    sdk_tasks.check_tasks_not_updated(service, 'name', name_before)
    config.check_healthy(service_name=service)
Example #13
0
def test_kill_data_node():
    """Kill the datanode process of data-0 and expect recovery without journal/name relaunches."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    killed_ids = sdk_tasks.get_task_ids(service, 'data-0')
    # Pod types that must keep their task ids across the kill.
    bystanders = [
        (pod_type, sdk_tasks.get_task_ids(service, pod_type))
        for pod_type in ('journal', 'name')
    ]

    data_host = sdk_hosts.system_host(service, 'data-0-node')
    sdk_cmd.kill_task_with_pattern('datanode', data_host)
    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'data', killed_ids)
    for pod_type, ids in bystanders:
        sdk_tasks.check_tasks_not_updated(service, pod_type, ids)
Example #14
0
def test_kill_data_node():
    """Kill the datanode process on the host of the first data-0 task and expect only it to be relaunched."""
    victim = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_before = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=victim.host)

    config.expect_recovery(service_name=foldered_name)

    # Only the killed data task gets a new id; journal and name tasks keep theirs.
    sdk_tasks.check_tasks_updated(foldered_name, "data", [victim.id])
    for pod_type, ids in (("journal", journal_before), ("name", name_before)):
        sdk_tasks.check_tasks_not_updated(foldered_name, pod_type, ids)
Example #15
0
def test_kill_all_journalnodes():
    """Restart every journal pod, then check journal tasks changed and data tasks did not."""
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_before = sdk_tasks.get_task_ids(foldered_name, "data")

    for pod in config.get_pod_type_instances("journal", foldered_name):
        restart_command = "pod restart {}".format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, restart_command)

    config.expect_recovery(service_name=foldered_name)

    # Name nodes fail and restart as a side effect, so they are deliberately not checked.
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_before)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_before)
Example #16
0
def _task_certificate_sans(task_id):
    """Return the subject alternative name values of `server.crt` as read from the given task."""
    # Load end-entity certificate from PEM encoded file
    certificate = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat server.crt').encode('ascii'),
        DEFAULT_BACKEND)
    san_extension = certificate.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    # Iterate the extension value through its public sequence interface rather
    # than reaching into the private `_general_names` attributes.
    return [san.value for san in san_extension.value]


def test_changing_discovery_replaces_certificate_sans(hello_world_service):
    """
    Update service configuration to change discovery prefix of a task.
    Scheduler should update task and new SANs should be generated.
    """
    san_template = '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'

    # Look the task up under the service name, like every other call in this test.
    original_tasks = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'discovery')
    assert len(original_tasks) == 1, 'Expecting exactly one task ID'

    task_id = original_tasks[0]
    assert task_id

    expected_san = san_template.format(
        name=DISCOVERY_TASK_PREFIX,
        service_name=config.SERVICE_NAME)
    assert expected_san in _task_certificate_sans(task_id)

    # Run task update with new discovery prefix
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['DISCOVERY_TASK_PREFIX'] = DISCOVERY_TASK_PREFIX + '-new'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'discovery', original_tasks)
    sdk_tasks.check_running(config.SERVICE_NAME, 4)
    new_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, "discovery")[0]
    assert task_id != new_task_id

    # The relaunched task must carry a certificate issued for the new prefix.
    expected_san = san_template.format(
        name=DISCOVERY_TASK_PREFIX + '-new',
        service_name=config.SERVICE_NAME)
    assert expected_san in _task_certificate_sans(new_task_id)
def test_kill_all_journalnodes(hdfs_server):
    """Restart the journal pods one at a time, expecting recovery after each restart."""
    service = hdfs_server["service"]["name"]
    # Task ids per pod type before any restart, in the order journal, name, data.
    journal_before, name_before, data_before = [
        sdk_tasks.get_task_ids(service, pod_type) for pod_type in ("journal", "name", "data")
    ]

    for pod in config.get_pod_type_instances("journal", service):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, "pod restart {}".format(pod))
        # Recovery is awaited inside the loop, i.e. before the next pod is restarted.
        config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, "journal", journal_before)
    for pod_type, ids in (("name", name_before), ("data", data_before)):
        sdk_tasks.check_tasks_not_updated(service, pod_type, ids)
Example #18
0
def test_kill_all_datanodes():
    """Restart every data pod, then check data tasks changed while journal and name tasks did not."""
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_before = sdk_tasks.get_task_ids(foldered_name, "name")
    data_before = sdk_tasks.get_task_ids(foldered_name, "data")

    for pod in config.get_pod_type_instances("data", foldered_name):
        restart_command = "pod restart {}".format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, restart_command)

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", data_before)
    for pod_type, ids in (("journal", journal_before), ("name", name_before)):
        sdk_tasks.check_tasks_not_updated(foldered_name, pod_type, ids)
Example #19
0
def test_bump_journal_cpus():
    """Bump JOURNAL_CPUS and verify journal tasks restart while name tasks stay untouched."""
    journal_before = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_before = sdk_tasks.get_task_ids(foldered_name, "name")
    log.info("journal ids: " + str(journal_before))

    sdk_marathon.bump_cpu_count_config(foldered_name, "JOURNAL_CPUS")

    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_before)
    # A journal node update should not cause any of the name nodes to crash.
    # If the name nodes did crash, it implies the journal nodes were updated in
    # parallel when they should have been updated serially. For journal nodes the
    # deploy plan is parallel while the update plan is serial -- maybe the deploy
    # plan was mistakenly used?
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_before)
    config.check_healthy(service_name=foldered_name)
Example #20
0
def test_kill_all_journalnodes():
    """Restart all journal pods and verify journal tasks were relaunched but data tasks were not."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_before = sdk_tasks.get_task_ids(service, 'journal')
    data_service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_before = sdk_tasks.get_task_ids(data_service, 'data')

    journal_pods = config.get_pod_type_instances("journal", service)
    for pod in journal_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, 'pod restart {}'.format(pod))

    config.expect_recovery(service_name=service)

    # Name nodes fail and restart as a consequence, so their ids are not checked.
    sdk_tasks.check_tasks_updated(service, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(service, 'data', data_before)
Example #21
0
def test_kill_all_datanodes():
    """Restart all data pods; only the data tasks are expected to get new task ids."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    # Snapshot in the order journal, name, data.
    journal_before, name_before, data_before = [
        sdk_tasks.get_task_ids(service, pod_type) for pod_type in ('journal', 'name', 'data')
    ]

    for pod in config.get_pod_type_instances("data", service):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, 'pod restart {}'.format(pod))

    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'data', data_before)
    sdk_tasks.check_tasks_not_updated(service, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(service, 'name', name_before)
Example #22
0
def test_kill_all_journalnodes():
    """Restart each journal pod and check that journal tasks changed while data tasks did not."""
    service = FOLDERED_SERVICE_NAME
    journal_before = sdk_tasks.get_task_ids(service, 'journal')
    data_before = sdk_tasks.get_task_ids(service, 'data')

    for pod in config.get_pod_type_instances("journal", service):
        restart_command = 'pod restart {}'.format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, restart_command)

    config.expect_recovery(service_name=service)

    # Name nodes fail and restart along the way, so they are not checked here.
    sdk_tasks.check_tasks_updated(service, 'journal', journal_before)
    sdk_tasks.check_tasks_not_updated(service, 'data', data_before)
def test_secrets_update():
    """Check that updated secret content is visible to tasks after a pod restart.

    Steps:
      1) create Secrets
      2) install examples/secrets.yml
      3) update Secrets
      4) restart task
      5) verify Secrets content (updated after restart)
      6) delete Secrets
    """
    service = config.SERVICE_NAME
    package = config.PACKAGE_NAME
    expected_tasks = NUM_HELLO + NUM_WORLD

    sdk_install.uninstall(package, service)

    create_secrets("{}/".format(service))

    sdk_install.install(package, service, expected_tasks, additional_options=secret_options)

    # tasks will fail if secret file is not created
    sdk_tasks.check_running(service, expected_tasks)

    for update_template in (
        "security secrets update --value={} {}/secret1",
        "security secrets update --value={} {}/secret2",
        "security secrets update --value={} {}/secret3",
    ):
        sdk_cmd.run_cli(update_template.format(secret_content_alternative, service))

    # Verification is done on hello-0 and world-0 only; one pod of each type suffices.
    hello_before = sdk_tasks.get_task_ids(service, "hello-0-server")
    world_before = sdk_tasks.get_task_ids(service, "world-0-server")

    # restart pods to retrieve new secret's content
    for restart_command in ('pod restart hello-0', 'pod restart world-0'):
        sdk_cmd.svc_cli(package, service, restart_command)

    # wait pod restart to complete
    sdk_tasks.check_tasks_updated(service, "hello-0-server", hello_before)
    sdk_tasks.check_tasks_updated(service, 'world-0-server', world_before)

    # wait till it is running
    sdk_tasks.check_running(service, expected_tasks)

    # make sure content is changed, first in world-0 and then in hello-0
    secret_reads = (
        ("world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'"),
        ("world-0-server", "cat WORLD_SECRET2_FILE"),
        ("world-0-server", "cat {}/secret3".format(service)),
        ("hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'"),
        ("hello-0-server", "cat HELLO_SECRET1_FILE"),
        ("hello-0-server", "cat HELLO_SECRET2_FILE"),
    )
    for task_name, command in secret_reads:
        assert secret_content_alternative == read_secret(task_name, command)

    # clean up and delete secrets
    delete_secrets("{}/".format(service))
Example #24
0
def test_soak_secrets_restart_hello0():
    """Restart the hello-0 and world-0 pods and wait until all tasks are running again."""
    ids_before = [
        (pod, sdk_tasks.get_task_ids(FRAMEWORK_NAME, pod))
        for pod in ("hello-0", "world-0")
    ]

    # restart pods to retrieve new secret's content
    for restart_command in ('pod restart hello-0', 'pod restart world-0'):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, FRAMEWORK_NAME, restart_command)

    # wait pod restart to complete
    for pod, old_ids in ids_before:
        sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, pod, old_ids)

    # wait until everything is running
    sdk_tasks.check_running(FRAMEWORK_NAME, NUM_HELLO + NUM_WORLD)
Example #25
0
def test_kill_all_datanodes():
    """Restart every data pod and check that only data tasks received new task ids."""
    service = FOLDERED_SERVICE_NAME
    journal_before = sdk_tasks.get_task_ids(service, 'journal')
    name_before = sdk_tasks.get_task_ids(service, 'name')
    data_before = sdk_tasks.get_task_ids(service, 'data')

    for pod in config.get_pod_type_instances("data", service):
        restart_command = 'pod restart {}'.format(pod)
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, restart_command)

    config.expect_recovery(service_name=service)

    sdk_tasks.check_tasks_updated(service, 'data', data_before)
    for pod_type, ids in (('journal', journal_before), ('name', name_before)):
        sdk_tasks.check_tasks_not_updated(service, pod_type, ids)
def test_kill_hello_node():
    """Kill the hello process on the hello-0 host and expect the hello-0 task to be relaunched."""
    config.check_running()
    ids_before_kill = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')

    target_host = 'hello-0-server.hello-world.mesos'
    sdk_cmd.kill_task_with_pattern('hello', target_host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', ids_before_kill)

    config.check_running()
def test_config_updates_then_all_executors_killed():
    """Bump world CPUs, kill the executor process on every service host, and expect world tasks to be relaunched."""
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # Plain loop: the kills are issued for their side effect only, so there is
    # no point in collecting their results into a throwaway list.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_scheduler_died():
    """Bump world CPUs, kill the scheduler process, and expect the world tasks to still be updated."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')
    scheduler_host = sdk_marathon.get_scheduler_host(service)

    config.bump_world_cpus()
    # Take the scheduler down right after the config change was submitted.
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', scheduler_host)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
def test_pods_restart_graceful_shutdown():
    """Restart world-0 with a kill grace period and look for the clean-shutdown log line.

    The service is reinstalled with ``world.kill_grace_period`` set to 30, the
    world-0 pod is restarted through the service CLI, and the completed log of
    the old task is searched for an 'all clean' line.
    """
    grace_options = {"world": {"kill_grace_period": 30}}

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        config.DEFAULT_TASK_COUNT,
        additional_options=grace_options)

    ids_before_restart = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    restart_reply = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0', json=True)
    # The reply is expected to hold exactly two entries: the pod and its tasks.
    assert len(restart_reply) == 2
    assert restart_reply['pod'] == 'world-0'
    restarted_tasks = restart_reply['tasks']
    assert len(restarted_tasks) == 1
    assert restarted_tasks[0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', ids_before_restart)
    config.check_running()

    # The SIGTERM handler of the world service prints "all clean". Lines that
    # also contain "echo" are the shell command itself, not the handler output,
    # so they do not count.
    log_command = "task log --completed --lines=1000 {}".format(ids_before_restart[0])
    completed_log = sdk_cmd.run_cli(log_command)
    saw_clean_shutdown = any(
        'all clean' in line and 'echo' not in line
        for line in completed_log.split('\n'))

    assert saw_clean_shutdown
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill one world task, and expect all world tasks to be replaced."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')

    config.bump_world_cpus()
    # Only world-0 is targeted, i.e. 1 of the 2 world tasks.
    kill_target = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('world', kill_target)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
Example #31
0
def test_adding_data_node_only_restarts_masters():
    """Add one data node and check that only master tasks get new ids.

    DATA_NODE_COUNT in the marathon app env is incremented by one, and the
    module-level ``current_expected_task_count`` is incremented to match.
    Master tasks are checked as updated; data and coordinator tasks are
    checked as not updated.
    """
    global current_expected_task_count

    # Snapshot ids per task prefix before the config change.
    ids_before = {
        prefix: sdk_tasks.get_task_ids(foldered_name, prefix)
        for prefix in ("master", "data", "coordinator")
    }

    app_definition = sdk_marathon.get_config(foldered_name)
    previous_count = int(app_definition['env']['DATA_NODE_COUNT'])
    app_definition['env']['DATA_NODE_COUNT'] = str(previous_count + 1)
    sdk_marathon.update_app(foldered_name, app_definition)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)

    sdk_tasks.check_tasks_updated(foldered_name, "master", ids_before["master"])
    for unchanged in ("data", "coordinator"):
        sdk_tasks.check_tasks_not_updated(foldered_name, unchanged, ids_before[unchanged])

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #32
0
def test_config_update_then_executor_killed():
    """Bump world CPUs, kill the world-0 executor, and expect new world tasks."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')

    bump_world_cpus()
    kill_target = 'world-0-server.{}.mesos'.format(PACKAGE_NAME)
    tasks.kill_task_with_pattern('helloworld.executor.Main', kill_target)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
Example #33
0
def test_soak_secrets_restart_hello0():
    """Restart the hello-0 and world-0 pods and wait for both to come back.

    Per the original note, the restart lets the pods retrieve the new
    content of their secrets.
    """
    hello_before = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_before = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # Issue both restarts first, then wait on each pod below.
    restart_templates = (
        'hello-world --name={} pod restart hello-0',
        'hello-world --name={} pod restart world-0',
    )
    for template in restart_templates:
        sdk_cmd.run_cli(template.format(FRAMEWORK_NAME))

    # Wait for the pod restarts to complete.
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "hello-0", hello_before)
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, 'world-0', world_before)

    # Wait until the full task set is running again.
    expected_total = NUM_HELLO + NUM_WORLD
    sdk_tasks.check_running(FRAMEWORK_NAME, expected_total)
Example #34
0
def restart_broker_pods(service_name=SERVICE_NAME):
    """Restart each broker pod in turn and validate the CLI reply for each.

    For every index below DEFAULT_BROKER_COUNT, the pod is restarted through
    ``service_cli``, its task is checked as updated, and the broker count is
    checked as running. The reply is then asserted to have two entries, with
    the restarted task name first in its 'tasks' list.
    """
    for index in range(DEFAULT_BROKER_COUNT):
        task_name = '{}-{}-{}'.format(DEFAULT_POD_TYPE, index, DEFAULT_TASK_NAME)
        ids_before = sdk_tasks.get_task_ids(service_name, task_name)

        restart_info = service_cli(
            'pod restart {}-{}'.format(DEFAULT_POD_TYPE, index),
            service_name=service_name)

        sdk_tasks.check_tasks_updated(service_name, task_name, ids_before)
        sdk_tasks.check_running(service_name, DEFAULT_BROKER_COUNT)

        assert len(restart_info) == 2
        assert restart_info['tasks'][0] == task_name
Example #35
0
def test_bump_data_nodes():
    """Raise DATA_COUNT by one step and check the existing data tasks are left alone."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'data')
    sdk_utils.out('data ids: ' + str(ids_before))

    marathon.bump_task_count_config(PACKAGE_NAME, 'DATA_COUNT')

    # One more task than the default is expected after the bump.
    expected_count = DEFAULT_TASK_COUNT + 1
    check_healthy(expected_count)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', ids_before)
Example #36
0
def test_bump_journal_cpus():
    """Bump JOURNAL_CPUS and check the journal tasks are replaced."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    sdk_utils.out('journal ids: ' + str(ids_before))

    marathon.bump_cpu_count_config(PACKAGE_NAME, 'JOURNAL_CPUS')

    # The task-id check comes first; the health check follows it.
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', ids_before)
    check_healthy()
Example #37
0
def test_bump_data_nodes():
    """Raise DATA_COUNT by one step and check the existing data tasks are left alone.

    Relies on a module-level ``foldered_name`` for the service name.
    """
    ids_before = sdk_tasks.get_task_ids(foldered_name, "data")
    log.info("data ids: " + str(ids_before))

    sdk_marathon.bump_task_count_config(foldered_name, "DATA_COUNT")

    # One more task than the default is expected after the bump.
    expected_count = config.DEFAULT_TASK_COUNT + 1
    config.check_healthy(service_name=foldered_name, count=expected_count)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", ids_before)
def test_config_update_then_executor_killed():
    """Bump world CPUs, kill the world-0 executor, and expect new world tasks."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')

    config.bump_world_cpus()
    kill_target = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', kill_target)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill one world task, and expect all world tasks to be replaced."""
    service = config.SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'world')

    config.bump_world_cpus()
    # Only world-0 is targeted, i.e. 1 of the 2 world tasks.
    kill_target = 'world-0-server.{}.mesos'.format(service)
    sdk_cmd.kill_task_with_pattern('world', kill_target)

    sdk_tasks.check_tasks_updated(service, 'world', ids_before)
    config.check_running()
Example #40
0
def test_config_update_then_kill_task_in_node():
    """Bump world CPUs, kill one world task, and expect all world tasks to be replaced."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, 'world')

    bump_world_cpus()
    # Only world-0 is targeted, i.e. 1 of the 2 world tasks.
    kill_target = 'world-0-server.{}.mesos'.format(PACKAGE_NAME)
    tasks.kill_task_with_pattern('world', kill_target)

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', ids_before)
    check_running()
Example #41
0
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Kill the scheduler process and check the master tasks keep their ids."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    master_ids_before = sdk_tasks.get_task_ids(service, "master")

    scheduler_host = sdk_marathon.get_scheduler_host(service)
    shakedown.kill_process_on_host(scheduler_host, "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(service, "master", master_ids_before)
def test_pods_replace():
    """Replace pod 0 of the default pod type and check its task is updated."""
    task_name = '{}-0-{}'.format(DEFAULT_POD_TYPE, DEFAULT_TASK_NAME)
    ids_before = tasks.get_task_ids(SERVICE_NAME, task_name)

    service_cli('pods replace {}-0'.format(DEFAULT_POD_TYPE))

    tasks.check_tasks_updated(SERVICE_NAME, task_name, ids_before)
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
Example #43
0
def test_data_node_replace():
    """Replace the data-0 pod through the service CLI and check its task is updated."""
    # Resolve the foldered service name once instead of on every call, as
    # the sibling tests in this file do.
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace data-0')
    sdk_tasks.check_tasks_updated(foldered_name, 'data-0', data_ids)
Example #44
0
def test_kill_hello_node():
    """Kill the 'hello' process for hello-0 and check its task is replaced.

    The service is checked as running both before the kill and after the
    task-id check.
    """
    config.check_running()
    # Use one prefix for both the snapshot and the check, so the comparison
    # covers exactly the tasks whose ids were captured.
    task_prefix = 'hello-0'
    hello_ids = sdk_tasks.get_task_ids(config.PACKAGE_NAME, task_prefix)
    sdk_tasks.kill_task_with_pattern('hello',
                                     'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.PACKAGE_NAME, task_prefix, hello_ids)

    config.check_running()
Example #45
0
def test_kill_scheduler():
    """Kill the scheduler process; expect a new scheduler task and untouched service tasks.

    Relies on a module-level ``foldered_name`` for the service name. The
    scheduler task is looked up under the "marathon" service by its task
    prefix, and exactly one such task is expected before the kill.
    """
    service_ids_before = sdk_tasks.get_task_ids(foldered_name, "")
    scheduler_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids_before = sdk_tasks.get_task_ids("marathon", scheduler_prefix)
    assert len(scheduler_ids_before) == 1, "Expected to find one scheduler task"

    scheduler_host = sdk_marathon.get_scheduler_host(foldered_name)
    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs", "nobody", agent_host=scheduler_host)

    # The scheduler should be restarted, but service tasks should be left as-is.
    sdk_tasks.check_tasks_updated("marathon", scheduler_prefix, scheduler_ids_before)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", service_ids_before)
    config.check_healthy(service_name=foldered_name)
def test_config_update_then_kill_all_task_in_node():
    """Bump world CPUs, kill the 'world' pattern on every service host, expect new world tasks.

    Per the original note, this kills both world tasks.
    """
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    # The kills are pure side effects, so use a plain loop rather than
    # building a throwaway list with a comprehension.
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
Example #47
0
def test_master_node_replace():
    """Replace the master-0 pod and check its task is updated.

    Ideally the pod gets placed on a different agent. The test then verifies
    that the remaining two masters find the replaced master at its new IP
    address, which requires a reasonably low TTL for Java DNS lookups.
    """
    ids_before = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'master-0')

    replace_command = 'elastic --name={} pod replace master-0'.format(FOLDERED_SERVICE_NAME)
    cmd.run_cli(replace_command)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'master-0', ids_before)
Example #48
0
def test_config_update_then_kill_all_task_in_node():
    """Bump world CPUs, kill the 'world' pattern on every service host, expect new world tasks.

    Per the original note, this kills both world tasks. The host list is
    fetched after the config bump here.
    """
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    bump_world_cpus()
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    # The kills are pure side effects, so use a plain loop rather than
    # building a throwaway list with a comprehension.
    for host in hosts:
        tasks.kill_task_with_pattern('world', host)
    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
Example #49
0
def replace_name_node(index):
    """Replace the name node pod with the given index and check only it is replaced.

    Args:
        index: Pod index; the pod name is built as ``'name-' + str(index)``.

    After the replace and the recovery wait, the name node's tasks are
    checked as updated and the journal and data tasks as not updated.
    """
    check_healthy()

    service = FOLDERED_SERVICE_NAME
    pod_name = 'name-' + str(index)
    name_ids_before = sdk_tasks.get_task_ids(service, pod_name)
    journal_ids_before = sdk_tasks.get_task_ids(service, 'journal')
    data_ids_before = sdk_tasks.get_task_ids(service, 'data')

    replace_command = 'hdfs --name={} pod replace {}'.format(service, pod_name)
    cmd.run_cli(replace_command)

    expect_recovery()

    sdk_tasks.check_tasks_updated(service, pod_name, name_ids_before)
    sdk_tasks.check_tasks_not_updated(service, 'journal', journal_ids_before)
    sdk_tasks.check_tasks_not_updated(service, 'data', data_ids_before)
Example #50
0
def test_permanent_and_transient_namenode_failures_0_1():
    """Replace name-0 and restart name-1, then check only the name nodes were replaced."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=service)

    # Snapshot ids per task prefix before disturbing the name nodes.
    ids_before = {
        prefix: sdk_tasks.get_task_ids(service, prefix)
        for prefix in ('name-0', 'name-1', 'journal', 'data')
    }

    for pod_command in ('pod replace name-0', 'pod restart name-1'):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service, pod_command)

    config.expect_recovery(service_name=service)

    for replaced in ('name-0', 'name-1'):
        sdk_tasks.check_tasks_updated(service, replaced, ids_before[replaced])
    for untouched in ('journal', 'data'):
        sdk_tasks.check_tasks_not_updated(service, untouched, ids_before[untouched])
Example #51
0
def test_bump_data_nodes():
    """Raise DATA_COUNT by one step and check the existing data tasks are left alone."""
    ids_before = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(ids_before))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    # One more task than the default is expected after the bump.
    expected_count = DEFAULT_TASK_COUNT + 1
    check_healthy(count=expected_count)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', ids_before)
Example #52
0
def _upgrade_or_downgrade(package_name, to_package_version, service_name,
                          running_task_count, additional_options,
                          timeout_seconds, wait_for_deployment):
    """Move a running service to ``to_package_version`` and optionally wait for it.

    Two flows exist. When the DC/OS version is below 1.10 or
    ``shakedown.ee_version()`` is None, the marathon app is destroyed and the
    package is installed again at the target version. Otherwise the service
    CLI's ``update start`` is used, followed by a manual install of the
    package CLI at the same version.

    When ``wait_for_deployment`` is truthy, the service config is compared
    with the one captured at entry. Tasks are then checked as not updated if
    the configs are equal, or as updated if they differ, before waiting for
    the deployment plan to complete.
    """
    config_before = get_config(package_name, service_name)
    ids_before = sdk_tasks.get_task_ids(service_name, '')

    use_marathon_flow = (sdk_utils.dcos_version_less_than("1.10")
                         or shakedown.ee_version() is None)

    if use_marathon_flow:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        if not additional_options:
            update_command = 'update start --package-version={}'.format(to_package_version)
            sdk_cmd.svc_cli(package_name, service_name, update_command)
        else:
            with tempfile.NamedTemporaryFile() as options_file:
                serialized_options = json.dumps(additional_options).encode('utf-8')
                options_file.write(serialized_options)
                # Flush so the JSON content is on disk for the CLI to read below.
                options_file.flush()
                update_command = 'update start --package-version={} --options={}'.format(
                    to_package_version, options_file.name)
                sdk_cmd.svc_cli(package_name, service_name, update_command)
        # The package CLI must be upgraded manually because this flow does not
        # do it automatically (that would imply the package CLI replacing
        # itself via a call to the main CLI).
        cli_install_command = 'package install --yes --cli --package-version={} {}'.format(
            to_package_version, package_name)
        sdk_cmd.run_cli(cli_install_command)

    if not wait_for_deployment:
        return

    config_after = get_config(package_name, service_name)

    if config_after == config_before:
        log.info(
            'No config change detected. Tasks should not be restarted')
        sdk_tasks.check_tasks_not_updated(service_name, '', ids_before)
    else:
        log.info('Checking that all tasks have restarted')
        sdk_tasks.check_tasks_updated(service_name, '', ids_before)

    # This can take a while; the default is 15 minutes. With HDFS, for
    # example, the expected total task count can be reached via FINISHED
    # tasks without the deployment actually completing.
    log.info("Waiting for {}/{} to finish deployment plan...".format(
        package_name, service_name))
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
Example #53
0
def test_upgrade():
    """Install the Universe version, then reinstall from the test repo and check tasks restart.

    If fewer than two package repositories are configured the test is
    skipped, after asserting that the first listed one is 'Universe'.
    Otherwise every non-Universe repo is removed, the Universe version is
    installed, the test repo is added back, the marathon app is destroyed and
    the package is installed again. Broker tasks are then checked as updated
    and the broker endpoints reply is validated.
    """
    test_version = upgrade.get_pkg_version(PACKAGE_NAME)
    print('Found test version: {}'.format(test_version))

    repositories = json.loads(
        cmd.run_cli('package repo list --json'))['repositories']
    print("Repositories: " + str(repositories))

    if len(repositories) < 2:
        print(
            "There is only one version in the repository. Skipping upgrade test!"
        )
        # Index the repository list itself: the loop variable `repo` is not
        # bound yet at this point, so using it here raised UnboundLocalError.
        assert repositories[0]['name'] == 'Universe'
        return

    test_repo_name, test_repo_url = upgrade.get_test_repo_info()

    # Leave only Universe configured so the next install picks its version.
    for repo in repositories:
        if repo['name'] != 'Universe':
            shakedown.remove_package_repo(repo['name'])

    universe_version = upgrade.get_pkg_version(PACKAGE_NAME)
    print('Found Universe version: {}'.format(universe_version))

    print('Installing Universe version: {}'.format(universe_version))
    install.install(PACKAGE_NAME, DEFAULT_BROKER_COUNT)
    print('Installation complete for Universe version: {}'.format(
        universe_version))

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    broker_ids = tasks.get_task_ids(SERVICE_NAME, 'broker-')

    print('Adding test version to repository with name: {} and url: {}'.format(
        test_repo_name, test_repo_url))
    upgrade.add_repo(test_repo_name, test_repo_url, universe_version, 0,
                     PACKAGE_NAME)

    print('Upgrading to test version: {}'.format(test_version))
    marathon.destroy_app(SERVICE_NAME)

    print('Installing test version: {}'.format(test_version))

    # Installation will return with old tasks because they are still running.
    install.install(PACKAGE_NAME, DEFAULT_BROKER_COUNT)
    print('Installation complete for test version: {}'.format(test_version))

    # Wait until the tasks are restarted.
    tasks.check_tasks_updated(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE),
                              broker_ids)
    print('All task are restarted')
    # All tasks are running.
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    address = service_cli('endpoints {}'.format(DEFAULT_TASK_NAME))
    assert len(address) == 3
    assert len(address['dns']) == DEFAULT_BROKER_COUNT
    assert len(address['address']) == DEFAULT_BROKER_COUNT
Example #54
0
def test_modify_app_config_rollback():
    """Change an app config value, roll it back, and check which tasks were touched.

    The expiry setting in the marathon app env is incremented by one and PUT
    to marathon. Once the journal tasks are updated, the original config is
    PUT back. After the journal tasks update again, the env value is asserted
    to equal the original, and the zkfc and data tasks are checked as not
    updated.
    """
    check_healthy()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    # Fetch the config twice: one copy stays pristine for the rollback, the
    # other is mutated below.
    old_config = marathon.get_config(PACKAGE_NAME)
    marathon_config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    print('expiry ms: ' + str(expiry_ms))
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    # The response is not inspected; progress is observed via task ids instead.
    cmd.request('put',
                marathon.api_url('apps/' + PACKAGE_NAME),
                json=marathon_config)

    # Wait for journal nodes to be affected by the change.
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    print('old config: ')
    print(old_config)
    # Put the old config back (rollback).
    cmd.request('put',
                marathon.api_url('apps/' + PACKAGE_NAME),
                json=old_config)

    # Wait for the journal nodes to return to their old configuration.
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    marathon_config = marathon.get_config(PACKAGE_NAME)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # ZKFC and Data tasks should not have been affected.
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #55
0
def test_bump_data_nodes():
    """Raise DATA_COUNT by one step and check the existing data tasks are left alone."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    ids_before = sdk_tasks.get_task_ids(service, 'data')
    log.info('data ids: ' + str(ids_before))

    sdk_marathon.bump_task_count_config(service, 'DATA_COUNT')

    # One more task than the default is expected after the bump.
    expected_count = config.DEFAULT_TASK_COUNT + 1
    config.check_healthy(service_name=service, count=expected_count)
    sdk_tasks.check_tasks_not_updated(service, 'data', ids_before)
Example #56
0
def test_permanent_and_transient_namenode_failures_1_0():
    """Replace name-1 and restart name-0, then check only the name nodes were replaced."""
    check_healthy()

    service = FOLDERED_SERVICE_NAME
    # Snapshot ids per task prefix before disturbing the name nodes.
    ids_before = {
        prefix: tasks.get_task_ids(service, prefix)
        for prefix in ('name-0', 'name-1', 'journal', 'data')
    }

    pod_commands = (
        'hdfs --name={} pods replace name-1',
        'hdfs --name={} pods restart name-0',
    )
    for template in pod_commands:
        cmd.run_cli(template.format(service))

    check_healthy()
    for replaced in ('name-0', 'name-1'):
        tasks.check_tasks_updated(service, replaced, ids_before[replaced])
    for untouched in ('journal', 'data'):
        tasks.check_tasks_not_updated(service, untouched, ids_before[untouched])
Example #57
0
    def run_openssl_command() -> str:
        """Run a time-limited ``openssl s_client`` command inside a task and return its output.

        ``openssl_timeout``, ``cipher``, ``endpoint``, ``service_name`` and
        ``task_name`` come from the surrounding scope, which is not visible
        here. The command runs in the first task returned for
        ``service_name``/``task_name``.
        """
        argv = [
            'timeout', openssl_timeout,
            'openssl', 's_client',
            '-cipher', cipher,
            '-connect', endpoint,
        ]
        command = ' '.join(argv)

        matching_ids = sdk_tasks.get_task_ids(service_name, task_name)
        # Only the second element of the task_exec result is used.
        _, output = sdk_cmd.task_exec(matching_ids[0], command, True)
        return output
Example #58
0
def test_ingest_node_replace():
    """Replace the ingest-0 pod and check the service returns to its expected state.

    The running-task count and the expected-nodes wait are checked both
    before the replace and after the task-id check.
    """
    service = config.SERVICE_NAME
    expected_tasks = config.DEFAULT_TASK_COUNT

    sdk_tasks.check_running(service, expected_tasks)
    config.wait_for_expected_nodes_to_exist()

    ids_before = sdk_tasks.get_task_ids(service, 'ingest-0')
    cmd.svc_cli(config.PACKAGE_NAME, service, 'pod replace ingest-0')
    sdk_tasks.check_tasks_updated(service, 'ingest-0', ids_before)

    sdk_tasks.check_running(service, expected_tasks)
    config.wait_for_expected_nodes_to_exist()
Example #59
0
def test_modify_app_config():
    """Increment an app config value and check journal, name and data tasks are all replaced.

    The expiry setting in the marathon app env is incremented by one and the
    app is updated. Per the original note, every task group is expected to
    restart because hdfs-site.xml changes.
    """
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    # Capture the data ids too: the 'data' check below must compare against
    # them rather than against the journal ids.
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    marathon_config = marathon.get_config(PACKAGE_NAME)
    sdk_utils.out('marathon config: ')
    sdk_utils.out(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    marathon.update_app(PACKAGE_NAME, marathon_config)

    # All tasks should be updated because hdfs-site.xml has changed.
    check_healthy()
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)
Example #60
0
def test_bump_journal_cpus():
    """Bump JOURNAL_CPUS and check the journal tasks are replaced."""
    service = FOLDERED_SERVICE_NAME
    ids_before = sdk_tasks.get_task_ids(service, 'journal')
    log.info('journal ids: ' + str(ids_before))

    sdk_marathon.bump_cpu_count_config(service, 'JOURNAL_CPUS')

    # The task-id check comes first; the health check follows it.
    sdk_tasks.check_tasks_updated(service, 'journal', ids_before)
    check_healthy()