Code Example #1
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
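A note on imports: these snippets omit their module preamble. A minimal sketch of what they rely on (module names are taken from the calls in the examples; the logger setup and the config import path are assumptions, not shown in the original files):

import logging
import re

import sdk_cmd
import sdk_plan
import sdk_tasks
from tests import config  # assumed import path for the suite's config (SERVICE_NAME, PACKAGE_NAME, DEFAULT_TASK_COUNT)

log = logging.getLogger(__name__)  # the 'log' object referenced throughout the examples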
Code Example #2
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Code Example #3
def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^(hello|world)-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
Code Example #4
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
Code Example #5
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$"))
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [
        task for task in candidate_tasks if task.host == replace_hostname
    ]
    log.info("Tasks on host {} to be replaced after shutdown: {}".format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(
        replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info("Checking affected task has moved to a new agent:\n"
                 "old={}\nnew={}".format(replaced_task, new_task))
        assert replaced_task.agent_id != new_task.agent_id
Code Example #6
def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )

    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)
Code Example #7
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )

    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
Code Example #8
def get_task_host(task_name):
    _, out, _ = sdk_cmd.run_cli("task {} --json".format(task_name))
    tasks_json = json.loads(out)
    matching_tasks = list(filter(lambda t: t["name"] == task_name, tasks_json))
    assert len(matching_tasks) == 1, "Duplicate tasks found with same name : [{}]".format(tasks_json)
    task_info = matching_tasks.pop()

    host = None
    for label in task_info["labels"]:
        if label["key"] == "offer_hostname":
            host = label["value"]
            break

    if host is None:
        raise Exception("offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception(
                    "offer_hostname label [{}] doesn't match CLI output [{}]\nTask:\n{}".format(
                        host, task.host, task_info
                    )
                )

    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))
Code Example #9
def get_completed_task_id(task_name):
    task_ids = [
        t.id for t in sdk_tasks.get_summary(with_completed=True,
                                            task_name=task_name)
    ]
    # Mesos returns newest task first:
    return task_ids[0] if task_ids else None
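A minimal usage sketch for the helper above (the task name is illustrative, borrowed from the pod naming used elsewhere on this page):

completed_id = get_completed_task_id('hello-0-server')
if completed_id is None:
    log.info('No completed task found named hello-0-server')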
Code Example #10
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
Code Example #11
def get_task_host(task_name):
    out = sdk_cmd.run_cli('task {} --json'.format(task_name))
    task_info = json.loads(out)[0]

    host = None
    for label in task_info['labels']:
        if label['key'] == 'offer_hostname':
            host = label['value']
            break

    if host is None:
        raise Exception(
            "offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception(
                    "offer_hostname label [{}] doesn't match CLI output [{}]\nTask:\n{}"
                    .format(host, task.host, task_info))

    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))
Code Example #12
def handle_test_setup(item: pytest.Item):
    '''Does some initialization at the start of a test.

    This should be called in a pytest_runtest_setup() hook.
    See also handle_failed_test() which must be called from a pytest_runtest_makereport() hookimpl hook.'''

    # Check if we're entering a new test suite.
    global _testlogs_test_index
    global _testlogs_current_test_suite
    test_suite = get_test_suite_name(item)
    if test_suite != _testlogs_current_test_suite:
        # New test suite:
        # 1 Store all the task ids which already exist as of this point.
        _testlogs_current_test_suite = test_suite
        global _testlogs_ignored_task_ids
        _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union([
            task.id for task in sdk_tasks.get_summary(with_completed=True)])
        log.info('Entering new test suite {}: {} preexisting tasks will be ignored on test failure.'.format(
            test_suite, len(_testlogs_ignored_task_ids)))
        # 2 Reset the test index.
        _testlogs_test_index = 0
        # 3 Remove any prior logs for the test suite.
        test_log_dir = _test_suite_artifact_directory(item)
        if os.path.exists(test_log_dir):
            log.info('Deleting existing test suite logs: {}/'.format(test_log_dir))
            shutil.rmtree(test_log_dir)

    # Increment the test index (to 1, if this is a new suite)
    _testlogs_test_index += 1
Code Example #13
def get_task_host(task_name):
    out = sdk_cmd.run_cli('task {} --json'.format(task_name))
    task_info = json.loads(out)[0]

    host = None
    for label in task_info['labels']:
        if label['key'] == 'offer_hostname':
            host = label['value']
            break

    if host is None:
        raise Exception("offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception("offer_hostname label {} doesn't match CLI output!\nTask:\n{}".format(task_info))

    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))
Code Example #14
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == "node-2-server"
    ][0]
    log.info("avoid host for task {}".format(replace_task))

    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        marathon_config["env"][
            "PLACEMENT_CONSTRAINT"] = '[["hostname", "UNLIKE", "{}"]]'.format(
                replace_task.host)
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
Code Example #15
def get_task_host(task_name):
    _, out, _ = sdk_cmd.run_cli("task {} --json".format(task_name))
    tasks_json = json.loads(out)
    matching_tasks = list(filter(lambda t: t["name"] == task_name, tasks_json))
    assert len(matching_tasks) == 1, \
        "Duplicate tasks found with same name : [{}]".format(tasks_json)
    task_info = matching_tasks.pop()

    host = None
    for label in task_info["labels"]:
        if label["key"] == "offer_hostname":
            host = label["value"]
            break

    if host is None:
        raise Exception(
            "offer_hostname label is not present!: {}".format(task_info))

    # Validation: Check that label matches summary returned by CLI
    for task in sdk_tasks.get_summary():
        if task.name == task_name:
            if task.host == host:
                # OK!
                return host
            else:
                # CLI's hostname doesn't match the TaskInfo labels. Bug!
                raise Exception(
                    "offer_hostname label [{}] doesn't match CLI output [{}]\nTask:\n{}"
                    .format(host, task.host, task_info))

    # Unable to find desired task in CLI!
    raise Exception("Unable to find task named {} in CLI".format(task_name))
Code Example #16
def handle_test_setup(item: pytest.Item):
    '''Does some initialization at the start of a test.

    This should be called in a pytest_runtest_setup() hook.
    See also handle_failed_test() which must be called from a pytest_runtest_makereport() hookimpl hook.'''

    # Check if we're entering a new test suite.
    global _testlogs_test_index
    global _testlogs_current_test_suite
    test_suite = get_test_suite_name(item)
    if test_suite != _testlogs_current_test_suite:
        # New test suite:
        # 1 Store all the task ids which already exist as of this point.
        _testlogs_current_test_suite = test_suite
        global _testlogs_ignored_task_ids
        _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(
            [task.id for task in sdk_tasks.get_summary(with_completed=True)])
        log.info(
            'Entering new test suite {}: {} preexisting tasks will be ignored on test failure.'
            .format(test_suite, len(_testlogs_ignored_task_ids)))
        # 2 Reset the test index.
        _testlogs_test_index = 0
        # 3 Remove any prior logs for the test suite.
        test_log_dir = _test_suite_artifact_directory(item)
        if os.path.exists(test_log_dir):
            log.info(
                'Deleting existing test suite logs: {}/'.format(test_log_dir))
            shutil.rmtree(test_log_dir)

    # Increment the test index (to 1, if this is a new suite)
    _testlogs_test_index += 1
Code Example #17
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$"))

    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [
        task for task in candidate_tasks if task.agent_id == replace_agent_id
    ]
    log.info("Tasks on agent {} to be replaced after decommission: {}".format(
        replace_agent_id, replace_tasks))
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()

    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info("Checking affected task has moved to a new agent:\n"
                 "old={}\nnew={}".format(replaced_task, new_task))
        assert replaced_task.agent_id != new_task.agent_id
Code Example #18
File: sdk_diag.py  Project: thawp99/dcos-commons
def handle_test_report(item: pytest.Item, result):  # _pytest.runner.TestReport
    '''Collects information from the cluster following a failed test.

    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook.'''

    if not result.failed:
        return  # passed, nothing to do

    # Fetch all state from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = list(
        filter(
            lambda name: name != sdk_package_registry.PACKAGE_REGISTRY_SERVICE_NAME,
            sdk_install.get_installed_service_names()))
    if len(service_names) > 0:
        log.info(
            'Fetching plans for {} services that are currently installed: {}'.
            format(len(service_names), ', '.join(service_names)))
        for service_name in service_names:
            try:
                _dump_scheduler(item, service_name)
            except Exception:
                log.exception('Plan collection from service {} failed!'.format(
                    service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [
        task.id for task in sdk_tasks.get_summary(with_completed=True)
        if task.id not in _testlogs_ignored_task_ids
    ]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)
    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning(
            'Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'
            .format(len(new_task_ids), _testlogs_task_id_limit, new_task_ids))
        del new_task_ids[_testlogs_task_id_limit:]
    try:
        log.info(
            'Fetching logs for {} tasks launched in this suite since last failure: {}'
            .format(len(new_task_ids), ', '.join(new_task_ids)))
        _dump_task_logs(item, new_task_ids)
    except Exception:
        log.exception('Task log collection failed!')
    try:
        log.info('Fetching mesos state:')
        _dump_mesos_state(item)
    except Exception:
        log.exception('Mesos state collection failed!')
    try:
        log.info('Creating/fetching cluster diagnostics bundle:')
        _dump_diagnostics_bundle(item)
    except Exception:
        log.exception('Diagnostics bundle creation failed')
    log.info('Post-failure collection complete')
Code Example #19
def test_launch_task_with_multiple_ports():
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        0,
        additional_options={"service": {"yaml": "multiport"}},
    )
    assert sdk_tasks.get_summary(with_completed=True, task_name="multiport-0-server"), "Unable to find matching completed task"
Code Example #20
def test_launch_task_with_multiple_ports():
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        0,
        additional_options={"service": {
            "yaml": "multiport"
        }},
    )
    assert sdk_tasks.get_summary(with_completed=True,
                                 task_name="multiport-0-server"
                                 ), "Unable to find matching completed task"
Code Example #21
def handle_test_report(item: pytest.Item, result): # _pytest.runner.TestReport
    '''Collects information from the cluster following a failed test.

    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook.'''

    if not result.failed:
        return # passed, nothing to do

    # Fetch all plans from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = sdk_install.get_installed_service_names()
    if len(service_names) > 0:
        log.info('Fetching plans for {} services that are currently installed: {}'.format(
            len(service_names), ', '.join(service_names)))
        for service_name in service_names:
            try:
                _dump_plans(item, service_name)
            except:
                log.exception('Plan collection from service {} failed!'.format(service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [task.id for task in sdk_tasks.get_summary(with_completed=True)
                    if task.id not in _testlogs_ignored_task_ids]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)
    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning('Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'.format(
            len(new_task_ids), _testlogs_task_id_limit, new_task_ids))
        del new_task_ids[_testlogs_task_id_limit:]
    try:
        log.info('Fetching logs for {} tasks launched in this suite since last failure: {}'.format(
            len(new_task_ids), ', '.join(new_task_ids)))
        _dump_task_logs(item, new_task_ids)
    except:
        log.exception('Task log collection failed!')
    try:
        log.info('Fetching mesos state:')
        _dump_mesos_state(item)
    except:
        log.exception('Mesos state collection failed!')
    try:
        log.info('Creating/fetching cluster diagnostics bundle:')
        _dump_diagnostics_bundle(item)
    except:
        log.exception('Diagnostics bundle creation failed')
    log.info('Post-failure collection complete')
Code Example #22
File: conftest.py  Project: zencircle/dcos-commons
def pytest_runtest_makereport(item, call):
    '''Hook to run after every test, before any other post-test hooks.
    See also: https://docs.pytest.org/en/latest/example/simple.html\
    #making-test-result-information-available-in-fixtures
    '''
    # Execute all other hooks to obtain the report object, then a report attribute for each phase of
    # a call, which can be "setup", "call", "teardown".
    # Subsequent fixtures can get the reports off of the request object like: `request.rep_setup.failed`.
    outcome = yield
    rep = outcome.get_result()
    setattr(item, "rep_" + rep.when, rep)

    # Handle failures. Must be done here and not in a fixture in order to
    # properly handle post-yield fixture teardown failures.
    if rep.failed:
        # Fetch all logs from tasks created since the last failure, or since the start of the suite.
        global testlogs_ignored_task_ids
        new_task_ids = [task.id for task in sdk_tasks.get_summary(with_completed=True)
                        if task.id not in testlogs_ignored_task_ids]
        testlogs_ignored_task_ids = testlogs_ignored_task_ids.union(new_task_ids)
        # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
        if len(new_task_ids) > testlogs_task_id_limit:
            log.warning('Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}'.format(
                len(new_task_ids), testlogs_task_id_limit, new_task_ids))
            del new_task_ids[testlogs_task_id_limit:]
        log.info('Test {} failed in {} phase.'.format(item.name, rep.when))

        try:
            log.info('Fetching logs for {} tasks launched in this suite since last failure: {}'.format(
                len(new_task_ids), new_task_ids))
            dump_task_logs(item, new_task_ids)
        except Exception:
            log.exception('Task log collection failed!')
        try:
            log.info('Fetching mesos state')
            dump_mesos_state(item)
        except Exception:
            log.exception('Mesos state collection failed!')
        try:
            log.info('Creating/fetching cluster diagnostics bundle')
            get_diagnostics_bundle(item)
        except Exception:
            log.exception("Diagnostics bundle creation failed")
        log.info('Post-failure collection complete')
Code Example #23
File: conftest.py  Project: zencircle/dcos-commons
def pytest_runtest_setup(item):
    '''Hook to run before every test.'''
    # Inject header at start of test, following automatic "path/to/test_file.py::test_name":
    # Don't do this when running in teamcity, where it's redundant.
    if not teamcity.is_running_under_teamcity():
        print('''
==========
======= START: {}::{}
=========='''.format(sdk_utils.get_test_suite_name(item), item.name))

    # Check if we're entering a new test suite.
    global testlogs_test_index
    global testlogs_current_test_suite
    test_suite = sdk_utils.get_test_suite_name(item)
    if test_suite != testlogs_current_test_suite:
        # New test suite:
        # 1 Store all the task ids which already exist as of this point.
        testlogs_current_test_suite = test_suite
        global testlogs_ignored_task_ids
        testlogs_ignored_task_ids = testlogs_ignored_task_ids.union([
            task.id for task in sdk_tasks.get_summary(with_completed=True)])
        log.info('Entering new test suite {}: {} preexisting tasks will be ignored on test failure.'.format(
            test_suite, len(testlogs_ignored_task_ids)))
        # 2 Reset the test index.
        testlogs_test_index = 0
        # 3 Remove any prior logs for the test suite.
        test_log_dir = sdk_utils.get_test_suite_log_directory(item)
        if os.path.exists(test_log_dir):
            log.info('Deleting existing test suite logs: {}/'.format(test_log_dir))
            shutil.rmtree(test_log_dir)

    # Increment the test index (to 1, if this is a new suite), and pass the value to sdk_utils for use internally.
    testlogs_test_index += 1
    sdk_utils.set_test_index(testlogs_test_index)

    min_version_mark = item.get_marker('dcos_min_version')
    if min_version_mark:
        min_version = min_version_mark.args[0]
        message = 'Feature only supported in DC/OS {} and up'.format(min_version)
        if 'reason' in min_version_mark.kwargs:
            message += ': {}'.format(min_version_mark.kwargs['reason'])
        if sdk_utils.dcos_version_less_than(min_version):
            pytest.skip(message)
Code Example #24
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'
    ][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env'][
        'PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
Code Example #25
def get_scheduler_task_id(service_name: str) -> str:
    for task in sdk_tasks.get_summary():
        if task.name == service_name:
            return task.id
Code Example #27
def get_completed_task_id(task_name):
    task_ids = [t.id for t in sdk_tasks.get_summary(with_completed=True, task_name=task_name)]
    # Mesos returns newest task first:
    return task_ids[0] if task_ids else None
Code Example #28
def handle_test_report(item: pytest.Item, result: runner.TestReport) -> None:
    """Collects information from the cluster following a failed test.

    This should be called in a hookimpl fixture.
    See also handle_test_setup() which must be called in a pytest_runtest_setup() hook."""

    if not result.failed or os.environ.get('DISABLE_DIAG'):
        return  # passed, nothing to do, or diagnostics collection disabled

    # Fetch all state from all currently-installed services.
    # We do this retrieval first in order to be closer to the actual test failure.
    # Services may still be installed when e.g. we're still in the middle of a test suite.
    service_names = list(
        filter(
            lambda name: name != sdk_package_registry.PACKAGE_REGISTRY_SERVICE_NAME,
            sdk_install.get_installed_service_names().union(_whitelisted_service_names(item)),
        )
    )
    if len(service_names) > 0:
        log.info(
            "Fetching plans for {} services that are currently installed: {}".format(
                len(service_names), ", ".join(service_names)
            )
        )
        for service_name in service_names:
            try:
                # Skip thread retrieval if plan retrieval fails:
                _dump_plans(item, service_name)
                _dump_threads(item, service_name)
            except Exception:
                log.exception("Plan/thread collection from service {} failed!".format(service_name))

    # Fetch all logs from tasks created since the last failure, or since the start of the suite.
    global _testlogs_ignored_task_ids
    new_task_ids = [
        task.id
        for task in sdk_tasks.get_summary(with_completed=True)
        if task.id not in _testlogs_ignored_task_ids
    ]
    _testlogs_ignored_task_ids = _testlogs_ignored_task_ids.union(new_task_ids)
    # Enforce limit on how many tasks we will fetch logs from, to avoid unbounded log fetching.
    if len(new_task_ids) > _testlogs_task_id_limit:
        log.warning(
            "Truncating list of {} new tasks to size {} to avoid fetching logs forever: {}".format(
                len(new_task_ids), _testlogs_task_id_limit, new_task_ids
            )
        )
        del new_task_ids[_testlogs_task_id_limit:]
    try:
        log.info(
            "Fetching logs for {} tasks launched in this suite since last failure: {}".format(
                len(new_task_ids), ", ".join(new_task_ids)
            )
        )
        _dump_task_logs(item, new_task_ids)
    except Exception:
        log.exception("Task log collection failed!")
    try:
        log.info("Fetching mesos state:")
        _dump_mesos_state(item)
    except Exception:
        log.exception("Mesos state collection failed!")
    try:
        log.info("Creating/fetching cluster diagnostics bundle:")
        _dump_diagnostics_bundle(item)
    except Exception:
        log.exception("Diagnostics bundle creation failed")
    log.info("Post-failure collection complete")