Example #1
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "describe", parse_json=True
    )

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        foldered_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "master", initial_master_task_ids)
    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", initial_data_task_ids)
Example #2
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    old_config = marathon.get_config(PACKAGE_NAME)
    config = marathon.get_config(PACKAGE_NAME)
    sdk_utils.out('marathon config: ')
    sdk_utils.out(config)
    expiry_ms = int(config['env'][app_config_field])
    sdk_utils.out('expiry ms: ' + str(expiry_ms))
    config['env'][app_config_field] = str(expiry_ms + 1)
    marathon.update_app(PACKAGE_NAME, config)

    # Wait for journal nodes to be affected by the change
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    sdk_utils.out('old config: ')
    sdk_utils.out(old_config)
    # Put the old config back (rollback)
    marathon.update_app(PACKAGE_NAME, old_config)

    # Wait for the journal nodes to return to their old configuration
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    config = marathon.get_config(PACKAGE_NAME)
    assert int(config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #3
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME),
        "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, "master",
                                      initial_task_ids)
Example #4
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)
    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)
    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
Example #5
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #6
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(foldered_name),
        "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(foldered_name, '', initial_task_ids)
Example #7
def test_modify_app_config_rollback():
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #8
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #9
def test_modify_app_config_rollback():
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #10
def _upgrade_or_downgrade(package_name, to_package_version, service_name,
                          running_task_count, additional_options,
                          timeout_seconds, wait_for_deployment):

    initial_config = get_config(package_name, service_name)
    task_ids = sdk_tasks.get_task_ids(service_name, '')

    if sdk_utils.dcos_version_less_than(
            "1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(package_name,
                            service_name,
                            running_task_count,
                            additional_options=additional_options,
                            package_version=to_package_version,
                            timeout_seconds=timeout_seconds,
                            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush()  # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name, service_name,
                    'update start --package-version={} --options={}'.format(
                        to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name, service_name,
                'update start --package-version={}'.format(to_package_version))
        # we must manually upgrade the package CLI because it's not done automatically in this flow
        # (and why should it? that'd imply the package CLI replacing itself via a call to the main CLI...)
        sdk_cmd.run_cli(
            'package install --yes --cli --package-version={} {}'.format(
                to_package_version, package_name))

    if wait_for_deployment:

        updated_config = get_config(package_name, service_name)

        if updated_config == initial_config:
            log.info(
                'No config change detected. Tasks should not be restarted')
            sdk_tasks.check_tasks_not_updated(service_name, '', task_ids)
        else:
            log.info('Checking that all tasks have restarted')
            sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via ONCE tasks, without actually completing deployment
        log.info(
            "Waiting for package={} service={} to finish deployment plan...".
            format(package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
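
The comments above describe two upgrade flows: a Marathon-based flow (destroy the scheduler app, then reinstall at the target version) for DC/OS versions below 1.10 or non-enterprise clusters, and a CLI-based `update start` flow plus a manual package CLI upgrade otherwise. A minimal sketch of driving the helper directly; the package name, versions, and counts are illustrative assumptions, not values from the original tests:

def upgrade_sketch():
    # Hypothetical values; only the call shape matches the helper defined above.
    _upgrade_or_downgrade(
        package_name='hello-world',
        to_package_version='2.0.0-1.0',
        service_name='hello-world',
        running_task_count=3,
        additional_options={'service': {'name': 'hello-world'}},
        timeout_seconds=25 * 60,
        # With wait_for_deployment=True, the helper also asserts whether tasks
        # restarted (config changed) or stayed put (config unchanged).
        wait_for_deployment=True,
    )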
Example #11
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    check_healthy(count=DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #12
def test_bump_data_nodes():
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    sdk_utils.out('data ids: ' + str(data_ids))

    marathon.bump_task_count_config(PACKAGE_NAME, 'DATA_COUNT')

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #13
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    log.info("data ids: " + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, "DATA_COUNT")

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #14
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    log.info("data ids: " + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, "DATA_COUNT")

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #15
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace on kafka-0 in a Kafka framework
    should result in ONLY the kafka-0-broker task being restarted. In this
    case, pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of the other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Collect task IDs per pod: every pod in `pods_to_update` (not just
        # `pod_name`) is expected to have its tasks replaced.
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
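
A usage sketch for check_permanent_recovery, mirroring the two scenarios from its docstring; the package/service names and the timeout value are illustrative assumptions:

# Kafka: replacing kafka-0 should restart ONLY that pod's broker task.
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)

# Cassandra: replacing the seed node triggers a rolling restart, so all three
# pods in the ring are expected to have updated tasks.
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)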
Example #16
def test_bump_data_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'DATA_COUNT')

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #17
def test_bump_data_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'DATA_COUNT')

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #18
def test_adding_data_node_only_restarts_masters() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(
        foldered_name, "coordinator")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME,
                                       foldered_name,
                                       "describe",
                                       parse_json=True)

    data_nodes_count = get_in(["data_nodes", "count"], svc_config)

    global current_expected_task_count

    # Increase the data nodes count by 1.
    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        foldered_name,
        {"data_nodes": {
            "count": data_nodes_count + 1
        }},
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    sdk_plan.wait_for_kicked_off_deployment(foldered_name)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    _, new_data_pod_info, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME,
        foldered_name,
        "pod info data-{}".format(data_nodes_count),
        parse_json=True,
    )

    # Get task ID for new data node task.
    new_data_task_id = get_in([0, "info", "taskId", "value"],
                              new_data_pod_info)

    # Should be running 1 task more.
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    # Master nodes should restart.
    sdk_tasks.check_tasks_updated(foldered_name, "master",
                                  initial_master_task_ids)
    # Data node tasks should be the initial ones plus the new one.
    sdk_tasks.check_tasks_not_updated(
        foldered_name, "data", initial_data_task_ids + [new_data_task_id])
    # Coordinator tasks should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator",
                                      initial_coordinator_task_ids)
Example #19
def test_kill_journal_node():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern('journalnode', sdk_hosts.system_host(FOLDERED_SERVICE_NAME, 'journal-0-node'))
    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #20
def test_kill_name_node():
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name-0')
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    tasks.kill_task_with_pattern('namenode', 'name-0-node.hdfs.mesos')
    check_healthy()
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #21
def test_bump_hello_nodes():
    config.check_running(FOLDERED_SERVICE_NAME)

    hello_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'hello')
    log.info('hello ids: ' + str(hello_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'HELLO_COUNT')

    config.check_running(FOLDERED_SERVICE_NAME)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'hello', hello_ids)
Example #22
def test_kill_name_node():
    name_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'name-0')
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern('namenode', sdk_hosts.system_host(config.FOLDERED_SERVICE_NAME, 'name-0-node'))
    config.expect_recovery(service_name=config.FOLDERED_SERVICE_NAME)
    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #23
def test_bump_hello_nodes():
    config.check_running(FOLDERED_SERVICE_NAME)

    hello_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'hello')
    log.info('hello ids: ' + str(hello_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'HELLO_COUNT')

    config.check_running(FOLDERED_SERVICE_NAME)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'hello', hello_ids)
Example #24
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    sdk_utils.out('hello ids: ' + str(hello_ids))

    marathon.bump_task_count_config(PACKAGE_NAME, 'HELLO_COUNT')

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
Example #25
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)

    config.expect_recovery(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
Example #26
def test_kill_data_node():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data-0')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')

    sdk_cmd.kill_task_with_pattern('datanode', sdk_hosts.system_host(foldered_name, 'data-0-node'))
    config.expect_recovery(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
Example #27
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    config.check_healthy(
        service_name=FOLDERED_SERVICE_NAME,
        count=config.DEFAULT_TASK_COUNT + 1
    )
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #28
def test_kill_all_journalnodes():
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    for host in shakedown.get_service_ips(PACKAGE_NAME):
        tasks.kill_task_with_pattern('journalnode', host)

    check_healthy()
    # name nodes fail and restart, so don't check those
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #29
def test_bump_hello_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    hello_ids = sdk_tasks.get_task_ids(foldered_name, 'hello')
    log.info('hello ids: ' + str(hello_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'HELLO_COUNT')

    config.check_running(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'hello', hello_ids)
Example #30
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
Example #31
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)

    config.expect_recovery(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
Example #32
def test_kill_journal_node():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_tasks.kill_task_with_pattern('journalnode', sdk_hosts.system_host(foldered_name, 'journal-0-node'))
    config.expect_recovery(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #33
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(config.FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'name', name_ids)
    config.check_healthy(service_name=config.FOLDERED_SERVICE_NAME)
Example #34
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for host in shakedown.get_service_ips(FOLDERED_SERVICE_NAME):
        sdk_tasks.kill_task_with_pattern('journalnode', host)

    expect_recovery()
    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal',
                                  journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #35
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'data')

    for journal_pod in config.get_pod_type_instances("journal", config.FOLDERED_SERVICE_NAME):
        sdk_cmd.run_cli('hdfs --name={} pod restart {}'.format(config.FOLDERED_SERVICE_NAME, journal_pod))

    config.expect_recovery(service_name=config.FOLDERED_SERVICE_NAME)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #36
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #37
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #38
def test_kill_data_node():
    data_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data-0')
    journal_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')

    tasks.kill_task_with_pattern(
        'datanode', hosts.system_host(FOLDERED_SERVICE_NAME, 'data-0-node'))
    check_healthy()
    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal',
                                  journal_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
Example #39
def test_kill_all_datanodes():
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    for host in shakedown.get_service_ips(PACKAGE_NAME):
        tasks.kill_task_with_pattern('datanode', host)

    check_healthy()
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'name', name_ids)
Example #40
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    log.info("journal ids: " + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(foldered_name, "JOURNAL_CPUS")

    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    # A journal node update should not cause any of the name nodes to crash.
    # If the name nodes crashed, it would imply the journal nodes were updated
    # in parallel when they should have been updated serially: the journal
    # nodes' deploy plan is parallel while their update plan is serial, so a
    # crash suggests the deploy plan was mistakenly used.
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
    config.check_healthy(service_name=foldered_name)
Example #41
def test_kill_all_journalnodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #42
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
Example #43
def test_kill_all_journalnodes(hdfs_server):
    service_name = hdfs_server["service"]["name"]
    journal_ids = sdk_tasks.get_task_ids(service_name, "journal")
    name_ids = sdk_tasks.get_task_ids(service_name, "name")
    data_ids = sdk_tasks.get_task_ids(service_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", service_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, "pod restart {}".format(journal_pod))
        config.expect_recovery(service_name=service_name)

    sdk_tasks.check_tasks_updated(service_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "name", name_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "data", data_ids)
Example #44
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
Example #45
def test_bump_data_nodes():
    check_healthy()

    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['DATA_COUNT']) + 1
    config['env']['DATA_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
Example #46
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(node_count)
    marathon.update_app(PACKAGE_NAME, config)

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
Example #47
def test_kill_all_journalnodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #48
def test_kill_all_namenodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for host in shakedown.get_service_ips(FOLDERED_SERVICE_NAME):
        sdk_tasks.kill_task_with_pattern('namenode', host)
    expect_recovery()

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal',
                                      journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #49
def test_kill_all_datanodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
Example #50
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')

    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    assert "Received cmd: refresh" in stdout

    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120*1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        output = sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            foldered_name,
            'debug state refresh_cache',
            return_stderr_in_stdout=True)
        return "failed: 409 Conflict" in output

    check_cache_refresh_fails_409conflict()

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120*1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')

    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
Example #51
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for journal_pod in config.get_pod_type_instances("journal", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
            'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #52
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for data_pod in config.get_pod_type_instances("data", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
            'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
Example #53
def test_permanent_and_transient_namenode_failures_1_0():
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
    name_0_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-0')
    name_1_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-1')
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod replace name-1')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart name-0')

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-0', name_0_ids)
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-1', name_1_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
Example #54
def replace_name_node(index):
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
    name_node_name = 'name-' + str(index)
    name_id = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, name_node_name)
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'pod replace {}'.format(name_node_name))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, name_node_name, name_id)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
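
A usage sketch for the replace_name_node helper above, covering both name nodes in the two-namenode HDFS layout assumed by the surrounding tests (the test names here are hypothetical):

def test_replace_name_node_0():
    replace_name_node(0)

def test_replace_name_node_1():
    replace_name_node(1)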
Example #55
def test_kill_scheduler():
    task_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "")
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find ONLY one scheduler task but found {}".format(scheduler_ids)

    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(config.SERVICE_NAME),
    )

    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(config.SERVICE_NAME)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "", task_ids)
Example #56
def test_permanent_and_transient_namenode_failures_1_0():
    config.check_healthy(service_name=foldered_name)
    name_0_ids = sdk_tasks.get_task_ids(foldered_name, "name-0")
    name_1_ids = sdk_tasks.get_task_ids(foldered_name, "name-1")
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod replace name-1")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart name-0")

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "name-0", name_0_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "name-1", name_1_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
Example #57
def test_permanent_and_transient_namenode_failures_0_1():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=foldered_name)
    name_0_ids = sdk_tasks.get_task_ids(foldered_name, 'name-0')
    name_1_ids = sdk_tasks.get_task_ids(foldered_name, 'name-1')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace name-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart name-1')

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'name-0', name_0_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name-1', name_1_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
Example #58
def test_adding_data_node_only_restarts_masters():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(foldered_name, "coordinator")

    # Increase the data node count by 1 via the scheduler's Marathon config.
    marathon_config = sdk_marathon.get_config(foldered_name)
    data_nodes = int(marathon_config['env']['DATA_NODE_COUNT'])
    marathon_config['env']['DATA_NODE_COUNT'] = str(data_nodes + 1)
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Should be running 1 task more.
    global current_expected_task_count
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    # Master nodes should restart; data and coordinator nodes should not.
    sdk_tasks.check_tasks_updated(foldered_name, "master", initial_master_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", initial_data_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator", initial_coordinator_task_ids)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #59
def test_kill_scheduler():
    task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find one scheduler task"

    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(foldered_name),
    )

    # scheduler should be restarted, but service tasks should be left as-is:
    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)
    config.check_healthy(service_name=foldered_name)