Ejemplo n.º 1
0
def is_node_down():
    """
    Report whether the scheduler considers the local node as down.

    True is returned when:
    - any exception is raised while gathering the node information (the error is logged)
    - neither the hostname nor its FQDN is present in the compute nodes info
    - the node state contains at least one of the SGE_ERROR_STATES

    False is returned only when the node is listed and none of the error states
    appears in its state.
    """
    try:
        short_name = check_command_output("hostname").strip()
        fqdn = socket.getfqdn(short_name)
        known_nodes = get_compute_nodes_info(hostname_filter=short_name)

        if short_name not in known_nodes and fqdn not in known_nodes:
            log.warning("Node is not attached to scheduler. Reporting as down")
            return True

        # Prefer the entry keyed by FQDN, falling back to the short hostname.
        short_name_entry = known_nodes.get(short_name)
        node = known_nodes.get(fqdn, short_name_entry)
        state = node.state
        log.info("Node is in state: '{0}'".format(state))

        # A single error state found in the node state is enough to report it as down.
        return any(error_state in state for error_state in SGE_ERROR_STATES)
    except Exception as e:
        log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)
        return True
def test_get_compute_nodes_info(qstat_mocked_response, expected_output, test_datadir, mocker):
    """
    Check that get_compute_nodes_info turns a canned qstat output into the expected nodes.

    The qstat output is read from the file named by qstat_mocked_response inside
    test_datadir and served through a patched check_sge_command_output.
    """
    canned_qstat_output = read_text(test_datadir / qstat_mocked_response)
    qstat_patch = mocker.patch(
        "common.schedulers.sge_commands.check_sge_command_output",
        return_value=canned_qstat_output,
        autospec=True,
    )

    parsed_nodes = get_compute_nodes_info()

    # The last call to the patched command must be the expected qstat command line.
    qstat_patch.assert_called_with("qstat -xml -g dt -u '*' -f")
    assert_that(parsed_nodes).is_equal_to(expected_output)
Ejemplo n.º 3
0
def get_busy_nodes():
    """
    Return the number of compute nodes that count as busy.

    A node counts as busy when its state contains one of the SGE_BUSY_STATES, or
    when it has used or reserved slots. Nodes that match but whose state also
    contains SGE_ORPHANED_STATE are logged and left out of the count.
    """
    busy_count = 0
    for host in get_compute_nodes_info().values():
        in_busy_state = any(flag in host.state for flag in SGE_BUSY_STATES)
        if not (in_busy_state or int(host.slots_used) > 0 or int(host.slots_reserved) > 0):
            # Nothing running or reserved and no busy state: not a busy node.
            continue

        if SGE_ORPHANED_STATE in host.state:
            logging.info(
                "Skipping host %s since in orphaned state, hence not in ASG. "
                "Host will disappear when assigned jobs are deleted.",
                host.name,
            )
            continue

        busy_count += 1

    return busy_count