Example #1
0
def test_container_checker(duthosts, enum_rand_one_per_hwsku_hostname,
                           enum_rand_one_asic_index, enum_dut_feature, tbinfo):
    """Verify the container checker alerts when a container is stopped.

    The service behind the selected feature is stopped on the selected ASIC,
    the test waits until the container is reported as stopped, and then a
    LogAnalyzer context is held open while Monit gets a chance to emit the
    alerting message that names the stopped container.

    Args:
        duthosts: list of DUTs.
        enum_rand_one_per_hwsku_hostname: Hostname (one per hwsku) used to
            pick the DuT out of `duthosts`.
        enum_rand_one_asic_index: Index of the ASIC selected on that DuT.
        enum_dut_feature: Name of the feature/service under test.
        tbinfo: Testbed information; `tbinfo["topo"]["type"]` is consulted.

    Returns:
        None.
    """
    feature = enum_dut_feature
    dut = duthosts[enum_rand_one_per_hwsku_hostname]
    asic_host = dut.asic_instance(enum_rand_one_asic_index)
    docker_name = asic_host.get_docker_name(feature)

    marker_prefix = "container_checker_{}".format(docker_name)
    log_checker = LogAnalyzer(ansible_host=dut, marker_prefix=marker_prefix)

    # Work on a copy so the helper's returned list is never mutated.
    excluded = get_disabled_container_list(dut)[:]

    # 'radv' is only exercised on T0 topologies.
    topo_type = tbinfo["topo"]["type"]
    if topo_type != "t0":
        excluded.append("radv")

    is_testable = feature not in excluded
    pytest_require(
        is_testable,
        "Container '{}' is skipped for testing.".format(docker_name))

    asic_host.stop_service(feature)
    logger.info(
        "Waiting until container '{}' is stopped...".format(docker_name))
    # Poll check_container_state(dut, docker_name, False) until it succeeds
    # or the stop threshold elapses.
    is_stopped = wait_until(
        CONTAINER_STOP_THRESHOLD_SECS,
        CONTAINER_CHECK_INTERVAL_SECS,
        0,
        check_container_state,
        dut,
        docker_name,
        False,
    )
    pytest_assert(is_stopped,
                  "Failed to stop container '{}'".format(docker_name))
    logger.info("Container '{}' on DuT '{}' was stopped".format(
        docker_name, dut.hostname))

    log_checker.expect_regex = get_expected_alerting_message(docker_name)
    # NOTE(review): the LogAnalyzer context presumably checks syslog for the
    # expected regex when the block exits - confirm against LogAnalyzer.
    with log_checker:
        # Give Monit a little over a minute (70 s) to write the alerting
        # message into syslog.
        logger.info("Sleep 1 minutes to wait for the alerting message...")
        time.sleep(70)
def test_container_checker(duthosts, enum_dut_feature_container,
                           rand_selected_dut, tbinfo):
    """Verify the container checker alerts when a container is stopped.

    The parametrized "<dut_name>|<container_name>" value is decoded; the test
    only proceeds when it targets the randomly selected DuT and names a
    container other than "unknown". The container is then stopped and a
    LogAnalyzer context is held open while Monit gets a chance to emit the
    alerting message that names the stopped container.

    Args:
        duthosts: list of DUTs.
        enum_dut_feature_container: String of the form
            "<dut_name>|<container_name>".
        rand_selected_dut: The randomly selected DuT.
        tbinfo: Testbed information; `tbinfo["topo"]["type"]` is consulted.

    Returns:
        None.
    """
    dut_name, container_name = decode_dut_and_container_name(
        enum_dut_feature_container)

    selected_hostname = rand_selected_dut.hostname
    targets_selected_dut = dut_name == selected_hostname
    is_known_container = container_name != "unknown"
    pytest_require(
        targets_selected_dut and is_known_container,
        "Skips testing container_checker of container '{}' on the DuT '{}' since another DuT '{}' was chosen."
        .format(container_name, dut_name, selected_hostname))

    dut = duthosts[dut_name]
    marker_prefix = "container_checker_{}".format(container_name)
    log_checker = LogAnalyzer(ansible_host=dut, marker_prefix=marker_prefix)

    # Start from a copy of the disabled containers and add the ones this
    # test never stops.
    excluded = get_disabled_container_list(dut)[:]
    excluded.extend(("gbsyncd", "database", "database-chassis"))

    # 'radv' is only exercised on T0 topologies.
    topo_type = tbinfo["topo"]["type"]
    if topo_type != "t0":
        excluded.append("radv")

    is_testable = container_name not in excluded
    pytest_require(
        is_testable,
        "Container '{}' is skipped for testing.".format(container_name))
    stop_container(dut, container_name)

    log_checker.expect_regex = get_expected_alerting_message(container_name)
    # NOTE(review): the LogAnalyzer context presumably checks syslog for the
    # expected regex when the block exits - confirm against LogAnalyzer.
    with log_checker:
        # Give Monit a little over a minute (70 s) to write the alerting
        # message into syslog.
        logger.info("Sleep 1 minutes to wait for the alerting message...")
        time.sleep(70)
Example #3
0
def test_container_checker(duthosts, rand_one_dut_hostname, tbinfo):
    """Verify the container checker alerts for every stopped container.

    Containers on the selected DuT are stopped via `stop_containers`
    (excluding disabled ones, 'gbsyncd', and 'radv' on non-T0 topologies).
    The test then waits for Monit and analyzes syslog for one alerting
    message per stopped container.

    Args:
        duthosts: list of DUTs.
        rand_one_dut_hostname: hostname of DUT.
        tbinfo: Testbed information; `tbinfo["topo"]["type"]` is consulted.

    Returns:
        None.
    """
    dut = duthosts[rand_one_dut_hostname]
    log_checker = LogAnalyzer(ansible_host=dut,
                              marker_prefix="container_checker")
    log_checker.expect_regex = []

    autorestart_states = dut.get_container_autorestart_states()

    # Start from a copy of the disabled containers and add 'gbsyncd'.
    excluded = get_disabled_container_list(dut)[:]
    excluded.append("gbsyncd")
    # 'radv' is only exercised on T0 topologies.
    topo_type = tbinfo["topo"]["type"]
    if topo_type != "t0":
        excluded.append("radv")

    stopped = stop_containers(dut, autorestart_states, excluded)
    pytest_assert(
        len(stopped) > 0, "None of containers was stopped!")

    # One expected alerting pattern per container that was actually stopped.
    log_checker.expect_regex.extend(get_expected_alerting_messages(stopped))
    start_marker = log_checker.init()

    # Give Monit a little over two minutes (130 s) to write the alerting
    # messages into syslog.
    logger.info("Sleep 2 minutes to wait for the alerting message...")
    time.sleep(130)

    logger.info("Checking the alerting messages from syslog...")
    log_checker.analyze(start_marker)
    logger.info("Found all the expected alerting messages from syslog!")
Example #4
0
def run_test_on_single_container(duthost, container_name, service_name,
                                 tbinfo):
    """Run the auto-restart checks against a single container.

    The feature name is derived from `container_name`; the test is skipped
    for disabled features, 'database', 'acms', and 'radv' on non-T0
    topologies. Auto-restart is temporarily enabled if it was disabled, the
    non-critical 'rsyslogd' program and then critical programs are handed to
    the verification helpers, the original auto-restart state is restored,
    and a post-check of critical processes and BGP sessions is performed.
    When the post-check fails, the configuration is reloaded and the test is
    failed with a diagnostic message.

    Args:
        duthost: The DuT host object.
        container_name: Name of the container under test (e.g. 'bgp0').
        service_name: Service name forwarded to the critical-process
            verification helper.
        tbinfo: Testbed information; `tbinfo["topo"]["type"]` is consulted.

    Returns:
        None.
    """
    autorestart_states = duthost.get_container_autorestart_states()

    # Copy the disabled list, then add the features this test never touches.
    excluded_features = get_disabled_container_list(duthost)[:]
    excluded_features.extend(("database", "acms"))
    if tbinfo["topo"]["type"] != "t0":
        excluded_features.append("radv")

    # Reduce the container name to its feature name,
    # e.g. 'bgp0' -> 'bgp' and 'bgp' -> 'bgp'.
    feature_name = re.match(CONTAINER_NAME_REGEX, container_name).group(1)

    # Skip database, acms, radv outside T0, and disabled containers/services.
    pytest_require(feature_name not in excluded_features,
                   "Skipping test for container {}".format(feature_name))

    container_up = is_container_running(duthost, container_name)
    pytest_assert(
        container_up,
        "Container '{}' is not running. Exiting...".format(container_name))

    # Snapshot of the established BGP neighbors, used by the post-check.
    up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established")

    logger.info("Start testing the container '{}'...".format(container_name))

    # Auto-restart must be enabled for the checks below; remember whether it
    # has to be switched back off afterwards.
    autorestart_was_disabled = False
    if autorestart_states[feature_name] == "disabled":
        logger.info(
            "Change auto-restart state of container '{}' to be 'enabled'".
            format(container_name))
        duthost.shell(
            "sudo config feature autorestart {} enabled".format(feature_name))
        autorestart_was_disabled = True

    # 'rsyslogd' is used as the non-critical process, on the assumption that
    # every container runs it and it is not considered critical.
    non_critical_program = "rsyslogd"
    status, pid = get_program_info(duthost, container_name,
                                   non_critical_program)
    verify_no_autorestart_with_non_critical_process(
        duthost, container_name, non_critical_program, status, pid)

    critical_groups, critical_processes, succeeded = \
        duthost.get_critical_group_and_process_lists(container_name)
    pytest_assert(
        succeeded,
        "Failed to get critical group and process lists of container '{}'".
        format(container_name))

    for process_name in critical_processes:
        # 'dsserve' in syncd was not managed by supervisord, so it is skipped.
        # TODO: drop this exception once the issue is solved in the image.
        unmanaged_dsserve = (feature_name == "syncd"
                             and process_name == "dsserve")
        if unmanaged_dsserve:
            continue

        status, pid = get_program_info(duthost, container_name, process_name)
        verify_autorestart_with_critical_process(
            duthost, container_name, service_name, process_name, status, pid)
        # Only one critical process is tested for now, hence the 'break'.
        # Once the "extended" mode is added, the 'break' goes away and a
        # 20-second sleep is to be added here so the processes can come back
        # up after the container restarts.
        break

    for group_name in critical_groups:
        programs = get_group_program_info(duthost, container_name, group_name)
        for program_name in programs:
            program_info = programs[program_name]
            verify_autorestart_with_critical_process(
                duthost, container_name, service_name, program_name,
                program_info[0], program_info[1])
            # Only one program per critical group is tested for now; the
            # 'break' is to be removed once the "extended" mode is added.
            break

    if autorestart_was_disabled:
        logger.info(
            "Restore auto-restart state of container '{}' to 'disabled'".
            format(container_name))
        duthost.shell(
            "sudo config feature autorestart {} disabled".format(feature_name))

    processes_ok, bgp_ok = postcheck_critical_processes_status(
        duthost, autorestart_states, up_bgp_neighbors)
    if not processes_ok or not bgp_ok:
        # Reload the configuration before reporting the failure.
        config_reload(duthost, safe_reload=True)

        # Prefix naming which of the two post-checks failed.
        failed_labels = []
        if not processes_ok:
            failed_labels.append("[Critical Process] ")
        if not bgp_ok:
            failed_labels.append("[BGP] ")
        failed_check = "".join(failed_labels)

        # Keep only entries whose status is False and that list at least one
        # exited critical process.
        exited_summary = []
        for name, info in duthost.all_critical_process_status().items():
            if info["status"] is False and len(
                    info["exited_critical_process"]) > 0:
                exited_summary.append({
                    name: {
                        "status": info["status"],
                        "exited_critical_process":
                            info["exited_critical_process"]
                    }
                })

        # BGP neighbors whose state is anything other than 'established'.
        not_established = [
            {neighbor: details['state']}
            for neighbor, details in duthost.get_bgp_neighbors().items()
            if details['state'] != 'established'
        ]

        pytest.fail(
            ("{}check failed, testing feature {}, \nBGP:{}, \nNeighbors:{}"
             "\nProcess status {}").format(
                 failed_check, container_name, not_established,
                 up_bgp_neighbors, exited_summary))

    logger.info("End of testing the container '{}'".format(container_name))