Beispiel #1
0
def test_hostlist_create_new():
    """Hostlist: Create new hostlist

    Walks a single hostlist object through create, find, push, push_host,
    shift, uniq and destroy, checking the reported count or ranged string
    after each step.
    """
    host_list = pyslurm.hostlist()

    # Build the list from one plain host name plus a bracketed range.
    assert_true(host_list.create("c1, c[2-3]"))
    assert_equals(host_list.count(), 3)

    # Look a host up by name.
    assert_equals(host_list.find("c3"), 2)

    # Append a range, then a single host.
    assert_equals(host_list.push("c[4-5]"), 2)
    assert_equals(host_list.count(), 5)
    assert_equals(host_list.push_host("c6"), 1)
    assert_equals(host_list.ranged_string(), "c[1-6]")

    # Take the first host off the list.
    assert_equals(host_list.shift(), "c1")
    assert_equals(host_list.count(), 5)

    # Pushing "c6" again leaves a duplicate until uniq() is called.
    assert_equals(host_list.push_host("c6"), 1)
    assert_equals(host_list.ranged_string(), "c[2-6,6]")
    host_list.uniq()
    assert_equals(host_list.ranged_string(), "c[2-6]")

    # Once destroyed, count() is expected to report -1.
    host_list.destroy()
    assert_equals(host_list.count(), -1)
Beispiel #2
0
def test_hostlist_create_new():
    """Hostlist: Create new hostlist

    Exercises create, find, push, push_host, shift, uniq and destroy on
    one hostlist object and verifies the count or ranged string reported
    after every operation.
    """
    nodes = pyslurm.hostlist()

    # Creation from a host name and a bracketed range yields three hosts.
    assert_true(nodes.create("c1, c[2-3]"))
    assert_equals(nodes.count(), 3)

    # Lookup by host name.
    assert_equals(nodes.find("c3"), 2)

    # Grow the list with a range and then with a single host.
    assert_equals(nodes.push("c[4-5]"), 2)
    assert_equals(nodes.count(), 5)
    assert_equals(nodes.push_host("c6"), 1)
    assert_equals(nodes.ranged_string(), "c[1-6]")

    # Drop the head of the list.
    assert_equals(nodes.shift(), "c1")
    assert_equals(nodes.count(), 5)

    # A second push of "c6" shows up as a duplicate until uniq() runs.
    assert_equals(nodes.push_host("c6"), 1)
    assert_equals(nodes.ranged_string(), "c[2-6,6]")
    nodes.uniq()
    assert_equals(nodes.ranged_string(), "c[2-6]")

    # count() is expected to be -1 after destroy().
    nodes.destroy()
    assert_equals(nodes.count(), -1)
Beispiel #3
0
def test_hostlist_create_new():
    """Hostlist: Create new hostlist

    Drives one hostlist object through create, find, push, push_host,
    shift, uniq and destroy, asserting on the count or ranged string
    after each call.
    """
    host_list = pyslurm.hostlist()

    # Build the list from one plain host name plus a bracketed range.
    assert host_list.create("c1, c[2-3]")
    assert host_list.count() == 3

    # Look a host up by name.
    assert host_list.find("c3") == 2

    # Append a range, then a single host.
    assert host_list.push("c[4-5]") == 2
    assert host_list.count() == 5
    assert host_list.push_host("c6") == 1
    assert host_list.ranged_string() == "c[1-6]"

    # Take the first host off the list.
    assert host_list.shift() == "c1"
    assert host_list.count() == 5

    # Pushing "c6" again leaves a duplicate until uniq() is called.
    assert host_list.push_host("c6") == 1
    assert host_list.ranged_string() == "c[2-6,6]"
    host_list.uniq()
    assert host_list.ranged_string() == "c[2-6]"

    # Once destroyed, count() is expected to report -1.
    host_list.destroy()
    assert host_list.count() == -1
Beispiel #4
0
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist.

    Creating without arguments must give a list with zero hosts, and
    count() must report -1 once the list has been destroyed.
    """
    empty_list = pyslurm.hostlist()
    empty_list.create()

    # Nothing was passed to create(), so no hosts are expected.
    assert_equals(empty_list.count(), 0)

    empty_list.destroy()

    # -1 is the expected count for a destroyed list.
    assert_equals(empty_list.count(), -1)
Beispiel #5
0
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist.

    Creating without arguments must give a list with zero hosts, and
    count() must report -1 once the list has been destroyed.
    """
    empty_list = pyslurm.hostlist()
    empty_list.create()

    # Nothing was passed to create(), so no hosts are expected.
    assert empty_list.count() == 0

    empty_list.destroy()

    # -1 is the expected count for a destroyed list.
    assert empty_list.count() == -1
Beispiel #6
0
 def __init__(self):
     """Create the helper objects, run ``SearchNodes`` and cache the node names.

     ``SearchNodes`` is called only after ``noFarm``, ``nodeInformations``
     and the private hostlist/node helpers exist; ``nodes`` is then the
     sorted list of keys found in ``nodeInformations``.
     """
     self.noFarm = ['slurmweb', 'huematrix']
     self.nodeInformations = {}

     # Private helper objects (name-mangled by the enclosing class).
     self.__hosts = hostlist()
     self.__node = node()

     self.SearchNodes()

     # Iterating a dict yields its keys, so this sorts the node names.
     self.nodes = sorted(self.nodeInformations)

     # Both start out as empty strings.
     self.state = self.node = ''
Beispiel #7
0
def test_hostlist_create_empty():
    """Hostlist: Test create empty hostlist.

    An argument-less create() must produce a list holding no hosts, and
    after destroy() the count must be reported as -1.
    """
    blank = pyslurm.hostlist()
    blank.create()

    # No hosts were supplied, so the list should be empty.
    assert_equals(blank.count(), 0)

    blank.destroy()

    # A destroyed list is expected to report a count of -1.
    assert_equals(blank.count(), -1)
Beispiel #8
0
#!/usr/bin/env python
"""
Retrieve Slurm hosts
"""
from __future__ import print_function

import pyslurm

def _report_position(host_list, name):
    """Print where *name* sits in *host_list* (or that it is missing).

    Returns the value given by ``host_list.find(name)``.
    """
    index = host_list.find(name)
    if index != -1:
        print("\tHost {0} found at position {1}".format(name, index))
    else:
        print("Failed to find {0} in list".format(name))
    return index


b = pyslurm.hostlist()

# The list deliberately contains "dummy1" twice so uniq() has work to do.
hosts = "dummy0,dummy1,dummy1,dummy3,dummy4"
print("Creating hostlist ...... with {0}".format(hosts))
if b.create(hosts):
    print()
    print("\tHost list count is {0}".format(b.count()))

    # Position of the host before duplicates are removed.
    node = "dummy3"
    pos = _report_position(b, node)

    print("\tCalling uniq on current host list")
    b.uniq()

    print("\tNew host list is {0}".format(b.get()))
    print("\tNew host list count is {0}".format(b.count()))

    # Position of the same host after uniq().
    pos = _report_position(b, node)
Beispiel #9
0
#!/usr/bin/env python

from __future__ import print_function

import pyslurm

def _show_host_index(host_list, name):
    """Print the index of *name* in *host_list*, or a failure message.

    Returns the value given by ``host_list.find(name)``.
    """
    found_at = host_list.find(name)
    if found_at != -1:
        print("\tHost {0} found at position {1}".format(name, found_at))
    else:
        print("Failed to find {0} in list".format(name))
    return found_at


b = pyslurm.hostlist()

# "dummy1" appears twice on purpose; uniq() below is run on this list.
hosts = "dummy0,dummy1,dummy1,dummy3,dummy4"
print("Creating hostlist ...... with {0}".format(hosts))
if b.create(hosts):
    print()
    print("\tHost list count is {0}".format(b.count()))

    # Where the host sits while the duplicate is still present.
    node = "dummy3"
    pos = _show_host_index(b, node)

    print("\tCalling uniq on current host list")
    b.uniq()

    print("\tNew host list is {0}".format(b.get()))
    print("\tNew host list count is {0}".format(b.count()))

    # Where the same host sits after uniq().
    pos = _show_host_index(b, node)
Beispiel #10
0
def get_core_usage(data):
    """Accumulate per-partition core/node/GPU usage and free-core bins.

    The function works in three passes over ``data``:

    1. Jobs.  For every RUNNING job, each node in ``job["layout"]`` has its
       ``cpuLayout`` (or ``gpuLayout`` when ``job["nGpus"] > 0``) extended
       with the job's core list and the matching ``cpuJobCnt`` /
       ``gpuJobCnt`` incremented.  For every PENDING job with a non-zero
       ``startTime`` and a non-None ``schedNodes``, the job id is appended
       to ``futureJobs`` on each node of the expanded host list.
    2. Nodes.  ``nCpuCores`` and ``nGpuCores`` are stored on each node as
       the lengths of the two layout lists.
    3. Partitions.  For each ``part, prefixes`` pair in ``PARTITIONS``, every
       node whose hostname contains one of the prefixes contributes to a
       ``usage`` dict (counters for "cores", "nodes" and "gpus") and, when
       it has schedulable idle cores, to a ``bins()`` object.

    Besides ``pyslurm`` this relies on names defined outside the function:
    ``debug``, ``PARTITIONS``, ``partitionResGpuCores``, ``bins`` and
    ``minFutureJobTime``.

    Args:
        data: dict providing "jobs", "nodes" and "res" entries.  The node
            dicts under ``data["nodes"]`` are modified in place (see
            passes 1 and 2 above).

    Returns:
        A tuple ``(u, bcu)`` of two dicts keyed by partition name: ``u``
        holds the ``usage`` dict and ``bcu`` the ``bins()`` object built
        for that partition.
    """
    # split gpu cores from cpu cores
    for id, job in data["jobs"].items():
        if job["state"] == "RUNNING":
            for n, l in job["layout"].items():
                # data['nodes'][n]['gpuJobsOnNode'].append(id)
                # data['nodes'][n]['gpuJobsCoresUsed'] += job['nCpus']
                k = "cpuLayout"
                r = "cpuJobCnt"
                if job["nGpus"] > 0:
                    k = "gpuLayout"
                    r = "gpuJobCnt"
                data["nodes"][n][k].extend(l)
                data["nodes"][n][r] += 1

        elif job["state"] == "PENDING":
            # store future jobs with the nodes they're going to run on
            if job["startTime"] != 0 and job["schedNodes"] is not None:
                b = pyslurm.hostlist()
                b.create(job["schedNodes"])
                for n in b.get_list():
                    # str(n, "utf-8") decodes, so entries are expected to be bytes-like
                    n = str(n, "utf-8")
                    # print(n,id)
                    data["nodes"][n]["futureJobs"].append(id)

    for hostname, node in data["nodes"].items():
        node["nCpuCores"] = len(node["cpuLayout"])
        node["nGpuCores"] = len(node["gpuLayout"])
        if debug:
            if hostname in [
                    "john6",
                    "john99",
                    "gstar102",
                    "sstar011",
                    "sstar107",
                    "sstar301",
                    "gstar201",
                    "bryan1",
                    "gstar040",
                    "gina3",
                    "john32",
                    "john34",
            ]:
                print(hostname, node)

    # bins of cores free
    bcu = {}
    u = {}
    for part, prefixes in PARTITIONS.items():
        # running/alloc, idle/avail, offline, blocked/unschedulable/inaccessible, total - for cores and nodes and gpus
        usage = {
            "cores": {
                "a": 0,
                "i": 0,
                "o": 0,
                "b": 0,
                "t": 0
            },
            "nodes": {
                "a": 0,
                "i": 0,
                "o": 0,
                "b": 0,
                "t": 0
            },
            "gpus": {
                "a": 0,
                "i": 0,
                "o": 0,
                "b": 0,
                "t": 0
            },
        }
        bc = bins()

        for hostname, node in data["nodes"].items():
            # NOTE(review): there is no break after a match, so a hostname
            # containing more than one of the prefixes is counted once per
            # matching prefix - confirm the prefixes never overlap.
            for prefix in prefixes:
                if prefix in hostname:
                    # gpu jobs get 4 cores (in skylake-gpu), but can use more
                    gpuCoresMax = max(partitionResGpuCores[part],
                                      node["nGpuCores"])
                    # cpu jobs get the rest. ie. max 32 on skylake
                    cpuCoresMax = node["nCpus"] - gpuCoresMax

                    nCpuIdleCores = cpuCoresMax - node["nCpuCores"]
                    nGpuIdleCores = gpuCoresMax - node["nGpuCores"]

                    # which type of cores we are considering
                    ty = "nCpuCores"
                    if part == "skylake-gpu":
                        ty = "nGpuCores"

                    # unavail nodes do not contribute to free counts
                    if node["avail"]:
                        # can't schedule jobs on these cores for a variety of reasons
                        unsched = 0

                        # find idle and idle per-socket layouts
                        if part == "skylake":
                            # ideally gpus would have 2 cores reserved on each socket but that's not actually
                            # the case - they get >= 4 cores overall.

                            # assume 0-17 on socket0, 18-35 on socket1
                            s0 = 0
                            s1 = 0
                            g0 = 0
                            g1 = 0
                            for i in node["cpuLayout"]:
                                if i < 18:
                                    s0 += 1
                                else:
                                    s1 += 1
                            for i in node["gpuLayout"]:
                                if i < 18:
                                    g0 += 1
                                else:
                                    g1 += 1
                            if debug:
                                if s0 + s1 != node["nCpuCores"]:
                                    print(
                                        hostname,
                                        "err: cpu: ",
                                        s0,
                                        "+",
                                        s1,
                                        "!=",
                                        node["nCpuCores"],
                                        "node",
                                        node,
                                    )
                                if g0 + g1 != node["nGpuCores"]:
                                    print(
                                        hostname,
                                        "err: gpu: ",
                                        g0,
                                        "+",
                                        g1,
                                        "!=",
                                        node["nGpuCores"],
                                        "node",
                                        node,
                                    )

                            # g0+g1  = 0 - 4 cores held idle (no gpu jobs running)
                            # g0+g1 >= 4 - 0 cores held idle
                            # but we could be a weird situation where previous gpu jobs (
                            # eg. 4 cores, 1 gpu) have skewed the cpus used
                            # on each socket, so even though if there are cores avail we don't
                            # know where they'll be.
                            cpuCoresIdle0 = 18 - (s0 + g0)
                            cpuCoresIdle1 = 18 - (s1 + g1)

                            # also need to take account of inaccessible.
                            # eg. all gpus used, but not all 4 gpu cores used. so some cores unschedulable
                            gpuRes = g0 + g1
                            if (node["nGpusUsed"] == node["nGpus"]
                                ):  # both gpus being used
                                if gpuRes < 4:
                                    unsched += 4 - gpuRes

                            # just round-robin subtract
                            # (the range is empty once gpuRes >= 4, so nothing is subtracted then)
                            for i in range(0, 4 - gpuRes):
                                if i % 2:
                                    if cpuCoresIdle0 == 0:
                                        cpuCoresIdle1 -= 1
                                    else:
                                        cpuCoresIdle0 -= 1
                                else:
                                    if cpuCoresIdle1 == 0:
                                        cpuCoresIdle0 -= 1
                                    else:
                                        cpuCoresIdle1 -= 1

                            if debug:
                                if cpuCoresIdle0 < 0 or cpuCoresIdle1 < 0:
                                    print(
                                        "nope, you stuffed up idle",
                                        cpuCoresIdle0,
                                        cpuCoresIdle1,
                                        "node",
                                        node,
                                    )
                                if cpuCoresIdle0 + cpuCoresIdle1 != nCpuIdleCores:
                                    print(
                                        hostname,
                                        "err",
                                        cpuCoresIdle0,
                                        "+",
                                        cpuCoresIdle1,
                                        "+",
                                        unsched,
                                        "!=",
                                        nCpuIdleCores,
                                        "node",
                                        node,
                                    )

                            # also need to take account of inaccessible.
                            #     1-4 cores slots for cpu jobs on non-smalljobs, non-largemem nodes will never be used
                            if (
                                    cpuCoresIdle0 + cpuCoresIdle1 <= 16
                            ):  # small jobs could run and would have socket affinity set
                                if not ("smalljobs" in node["features"]
                                        or "largemem" in node["features"]):
                                    if cpuCoresIdle0 != 0 and cpuCoresIdle0 <= 4:
                                        unsched += cpuCoresIdle0
                                        cpuCoresIdle0 = 0
                                        if debug:
                                            print(
                                                "not smalljobs",
                                                "cpuCoresIdle0",
                                                cpuCoresIdle0,
                                                "cpuCoresIdle1",
                                                cpuCoresIdle1,
                                                "unsched",
                                                unsched,
                                                hostname,
                                                node,
                                            )
                                    if cpuCoresIdle1 != 0 and cpuCoresIdle1 <= 4:
                                        unsched += cpuCoresIdle1
                                        cpuCoresIdle1 = 0
                                        if debug:
                                            print(
                                                "not smalljobs",
                                                "cpuCoresIdle1",
                                                cpuCoresIdle1,
                                                "cpuCoresIdle0",
                                                cpuCoresIdle0,
                                                "unsched",
                                                unsched,
                                                hostname,
                                                node,
                                            )

                            if debug:
                                if (cpuCoresIdle0 + cpuCoresIdle1 + unsched <
                                        nCpuIdleCores):
                                    print(
                                        hostname,
                                        "err2",
                                        cpuCoresIdle0,
                                        "+",
                                        cpuCoresIdle1,
                                        "+",
                                        unsched,
                                        "<",
                                        nCpuIdleCores,
                                        "node",
                                        node,
                                    )

                            if debug:
                                if part == "skylake" and unsched > 0:
                                    print("blocked core", unsched, hostname,
                                          node)

                            if debug:
                                if node["nCpus"] - node[
                                        "nCpusUsed"] - unsched < 0:
                                    print("err3", unsched, hostname, node)

                            if (debug and node["nCpus"] - node["nCpusUsed"] -
                                (cpuCoresIdle0 + cpuCoresIdle1 + unsched) !=
                                    0):
                                print(
                                    "unsched",
                                    "free - (c0+c1+un)",
                                    node["nCpus"] - node["nCpusUsed"] -
                                    (cpuCoresIdle0 + cpuCoresIdle1 + unsched),
                                    "cpuCoresIdle0",
                                    cpuCoresIdle0,
                                    "cpuCoresIdle1",
                                    cpuCoresIdle1,
                                    "unsched",
                                    unsched,
                                    hostname,
                                    node,
                                )

                            # affinity kicks in here:
                            #   16,32 core jobs need whole (16 core) sockets
                            #   <16 core jobs need to be on 1 socket
                            # eg.
                            # 2,4 free cores means max 1x4 and 1x2 core jobs can run, but not a 6
                            # 2,14 free cores means max 1x14 and 1x2 core jobs can run, but not a 16
                            # 8,8 free cores means max 2x8 core jobs can run, but not a 16
                            # 10,6 free cores means 1x10 and 1x6 can run but not a 16
                            # 10,8 free cores means 1x18 job (no affinity set)
                            # 16,0 free cores means 1x16 job
                            # 17,1 free cores means 1x18 job (no affinity set)
                            # 18,0 free cores means 1x18 job (no affinity set)
                            # 16,8 free cores means 1x24 job (no affinity set)
                            # 15,15 free cores means 1x30 job (no affinity set)
                            # it's never eg. 18,0 free 'cos of gpu reserved cores
                            #
                            # so:
                            #  if >16 total free then that's the job size
                            #  else it's the per-socket numbers
                            c = cpuCoresIdle0 + cpuCoresIdle1
                            if c > 0:
                                # free memory / disk per idle core
                                m = (node["realMem"] - node["allocMem"]) / c
                                d = (node["realDisk"] - node["allocDisk"]) / c

                                # cores/nodes can be unavail because of being out of ram or disk too
                                if d <= 100 or m <= 100:  # 100 MB per core
                                    unsched += cpuCoresIdle0 + cpuCoresIdle1
                                    cpuCoresIdle0 = 0
                                    cpuCoresIdle1 = 0
                                    if debug:
                                        print("mem/disk blocked:", hostname, m,
                                              d)
                                else:
                                    mint = minFutureJobTime(
                                        node, data["jobs"], hostname,
                                        data["res"])
                                    if c > 16:
                                        bc.add(c, m, d, mint, hostname)
                                    else:
                                        if cpuCoresIdle0 > 0:
                                            bc.add(cpuCoresIdle0, m, d, mint,
                                                   hostname)
                                        if cpuCoresIdle1 > 0:
                                            bc.add(cpuCoresIdle1, m, d, mint,
                                                   hostname)
                        else:  # not skylake
                            if part != "skylake-gpu":
                                # sstar, gstar, knl avail
                                if nCpuIdleCores > 0:
                                    # free memory / disk per idle core
                                    m = (node["realMem"] -
                                         node["allocMem"]) / nCpuIdleCores
                                    d = (node["realDisk"] -
                                         node["allocDisk"]) / nCpuIdleCores

                                    # cores/nodes can be unavail because of being out of ram or disk too
                                    if d <= 100 or m <= 100:  # 100 MB per core
                                        unsched += nCpuIdleCores
                                        nCpuIdleCores = 0
                                        if debug:
                                            print("mem/disk blocked:",
                                                  hostname, m, d)
                                    else:
                                        mint = minFutureJobTime(
                                            node, data["jobs"], hostname,
                                            data["res"])
                                        bc.add(nCpuIdleCores, m, d, mint,
                                               hostname)

                        # cores
                        if part == "skylake-gpu":
                            # unsched is still 0 here: neither branch above touches it for skylake-gpu
                            c = nGpuIdleCores - unsched
                            if c > 0:
                                m = (node["realMem"] - node["allocMem"]) / c
                                d = (node["realDisk"] - node["allocDisk"]) / c

                                # cores/nodes can be unavail because of being out of ram or disk too
                                if d <= 100 or m <= 100:  # 100 MB per core
                                    unsched = nGpuIdleCores

                            usage["cores"]["i"] += nGpuIdleCores - unsched
                            usage["cores"]["t"] += gpuCoresMax
                            usage["cores"]["b"] += unsched
                        elif part == "skylake":
                            usage["cores"][
                                "i"] += cpuCoresIdle0 + cpuCoresIdle1
                            usage["cores"]["t"] += cpuCoresMax
                            usage["cores"]["b"] += unsched
                        else:
                            usage["cores"]["i"] += nCpuIdleCores
                            usage["cores"]["t"] += cpuCoresMax
                            usage["cores"]["b"] += unsched
                        usage["cores"]["a"] += node[ty]

                        # nodes
                        if node[ty] == 0:
                            usage["nodes"]["i"] += 1
                        else:
                            usage["nodes"]["a"] += 1
                        usage["nodes"]["t"] += 1

                        # gpus
                        if part != "skylake":
                            # if there are any cores free then gpu jobs can run
                            if node["nCpus"] - node["nCpusUsed"] > 0:
                                usage["gpus"][
                                    "i"] += node["nGpus"] - node["nGpusUsed"]
                            else:
                                if debug:
                                    if (part == "skylake-gpu"
                                            and node["nGpus"] -
                                            node["nGpusUsed"] > 0):
                                        print(
                                            "blocked gpu",
                                            node["nGpus"] - node["nGpusUsed"],
                                            node,
                                        )
                                usage["gpus"]["b"] += (node["nGpus"] -
                                                       node["nGpusUsed"]
                                                       )  # blocked/unavail
                            # if part == 'skylake-gpu' and node['nGpus'] - node['nGpusUsed'] != 0:
                            #     print(hostname, node['nGpus'], node['nGpusUsed'], node['nGpus'] - node['nGpusUsed'])
                            usage["gpus"]["a"] += node["nGpusUsed"]
                            usage["gpus"]["t"] += node["nGpus"]

                    else:
                        # non-avail nodes - drained, draining, down, reserved (& unused)

                        # cores
                        if part == "skylake-gpu":
                            usage["cores"]["o"] += nGpuIdleCores
                        else:
                            usage["cores"]["o"] += nCpuIdleCores
                        usage["cores"]["a"] += node[ty]
                        # NOTE(review): the total only grows by the cores in use
                        # here, whereas the avail branch adds the full
                        # cpuCoresMax/gpuCoresMax - confirm this is intended.
                        usage["cores"]["t"] += node[ty]

                        # nodes
                        if node["nCpusUsed"] > 0:
                            usage["nodes"]["a"] += 1
                        usage["nodes"]["o"] += 1
                        usage["nodes"]["t"] += 1

                        # gpus
                        if part != "skylake":
                            usage["gpus"][
                                "o"] += node["nGpus"] - node["nGpusUsed"]
                            usage["gpus"]["a"] += node["nGpusUsed"]
                            # NOTE(review): adds nGpusUsed to the total, while the
                            # avail branch adds nGpus - confirm this is intended.
                            usage["gpus"]["t"] += node["nGpusUsed"]

                    if debug:
                        # NOTE(review): the message talks about "0 cores used" but
                        # the test is on nCpus, not nCpusUsed - possibly a typo;
                        # verify before relying on this diagnostic.
                        if (node["nCpus"] == 0 and node["state"] != "IDLE"
                                and node["avail"]):
                            print(
                                "state not idle or unavail, yet 0 cores used",
                                hostname,
                                node,
                            )
                        if (node["nGpusUsed"] > 0 and node["nGpuCores"] == 0
                                or node["nGpusUsed"] == 0
                                and node["nGpuCores"] > 0):
                            print("gpu count mismatch", hostname, node)
                        # pretty often be transiently wrong
                        if node["nCpusUsed"] != (node["nGpuCores"] +
                                                 node["nCpuCores"]):
                            print("cpu count mismatch", hostname, node)

        # if debug:
        #    print(part, usage)

        # for a, b in usage.items():
        #    s = 0
        #    for i, j in b.items():
        #        if i == 't' or i == 'o':
        #            continue
        #        s += j
        #    if s != b['t'] - b['o']:
        #        print('usage total wrong', 'sum', s, part, a, b)

        # one usage dict and one bins object per partition
        u[part] = usage
        bcu[part] = bc

        if debug:
            print(bc.data, "sum", bc.sum())
        # print(bc.data, 'sum', bc.sum())

    return u, bcu