コード例 #1
0
def get_required_nodes(instance_properties, max_size):
    """Return the node count needed for the jobs Torque currently has queued.

    Runs ``qstat -at``, gathers the requested node (NDS) and task (TSK) counts
    of every job whose state column is ``Q`` and passes them to
    ``get_optimal_nodes`` together with ``instance_properties``.

    ``max_size`` is accepted but is not used by this function.
    """
    # Sample ``qstat -at`` output (columns are indexed from 0 after splitting
    # on whitespace: NDS is 5, TSK is 6, S is 9):
    #                                                                                   Req'd       Req'd       Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK   Memory      Time    S   Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ------ --------- --------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5     30       --   01:00:00 Q  00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3      6       --   01:00:00 R  00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2      4       --   01:00:00 R  00:08:27
    queued_states = ["Q"]
    raw_listing = check_command_output("/opt/torque/bin/qstat -at")

    slots_requested = []
    nodes_requested = []
    # The first five lines of output are dropped before looking for job rows
    # (presumably banner/header lines -- TODO confirm against real output).
    for row in raw_listing.split("\n")[5:]:
        fields = row.split()
        # Rows that are too short or not in a queued state are ignored.
        if len(fields) < 10 or fields[9] not in queued_states:
            continue
        slots_requested.append(int(fields[6]))
        nodes_requested.append(int(fields[5]))

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
コード例 #2
0
ファイル: slurm.py プロジェクト: awslabs/cfncluster-node
def get_required_nodes(instance_properties):
    """Compute the nodes needed for Slurm jobs that are pending on resources.

    Runs ``squeue`` and, for every job in the ``PD`` state whose pending reason
    is listed in ``PENDING_RESOURCES_REASONS``, collects the requested node and
    CPU counts. Pending jobs with any other reason are logged and ignored.

    :param instance_properties: passed through unchanged to ``get_optimal_nodes``.
    :return: the value returned by ``get_optimal_nodes`` for the collected requests.
    :raises ValueError: if a node or CPU field of a matching job is not an integer.
    """
    log.info("Computing number of required nodes for submitted jobs")
    command = "/opt/slurm/bin/squeue -r -h -o '%i-%t-%D-%C-%r'"
    # Example output of squeue
    # 1-PD-1-24-Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions
    # 2-PD-1-24-Licenses
    # 3-PD-1-24-PartitionNodeLimit
    # 4-R-1-24-
    output = check_command_output(command, log)
    slots_requested = []
    nodes_requested = []
    for line in output.split("\n"):
        # Split on the first four separators only: the trailing reason field is
        # free text and may itself contain "-" (e.g. a host name). Splitting on
        # every "-" would yield more than five fields and silently drop the job.
        line_arr = line.split("-", 4)
        if len(line_arr) != 5 or line_arr[1] != 'PD':
            continue
        if line_arr[4] in PENDING_RESOURCES_REASONS:
            slots_requested.append(int(line_arr[3]))
            nodes_requested.append(int(line_arr[2]))
        else:
            log.info("Skipping pending job %s due to pending reason: %s", line_arr[0], line_arr[4])

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
コード例 #3
0
ファイル: torque.py プロジェクト: awslabs/cfncluster-node
def get_required_nodes(instance_properties):
    """Return the node count needed for the jobs Torque currently has queued.

    The node (NDS) and task (TSK) requests of every job in state ``Q`` reported
    by ``qstat -at`` are collected and handed to ``get_optimal_nodes`` along
    with ``instance_properties``.
    """
    # Sample ``qstat -at`` output (after whitespace splitting, NDS is field 5,
    # TSK is field 6 and the state S is field 9):
    #                                                                                   Req'd       Req'd       Elap
    # Job ID                  Username    Queue    Jobname          SessID  NDS   TSK   Memory      Time    S   Time
    # ----------------------- ----------- -------- ---------------- ------ ----- ------ --------- --------- - ---------
    # 0.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5343     5     30       --   01:00:00 Q  00:04:58
    # 1.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5340     3      6       --   01:00:00 R  00:08:14
    # 2.ip-172-31-11-1.ec2.i  centos      batch    job.sh             5387     2      4       --   01:00:00 R  00:08:27
    wanted_states = ['Q']
    qstat_text = check_command_output("/opt/torque/bin/qstat -at", log)
    # Only lines from the sixth onward are inspected; the five before it are
    # discarded (presumably header lines -- TODO confirm).
    job_rows = qstat_text.split("\n")[5:]

    slots_requested = []
    nodes_requested = []
    for job_row in job_rows:
        columns = job_row.split()
        is_queued_job = len(columns) >= 10 and columns[9] in wanted_states
        if not is_queued_job:
            continue
        slots_requested.append(int(columns[6]))
        nodes_requested.append(int(columns[5]))

    return get_optimal_nodes(nodes_requested, slots_requested, instance_properties)
コード例 #4
0
def get_required_nodes(instance_properties):
    """Compute the nodes needed for Slurm jobs that are pending on resources.

    Runs ``squeue`` and, for every job in the ``PD`` state whose pending reason
    is listed in ``PENDING_RESOURCES_REASONS``, collects the requested node and
    CPU counts. Pending jobs with any other reason are logged and ignored.

    :param instance_properties: passed through unchanged to ``get_optimal_nodes``.
    :return: the value returned by ``get_optimal_nodes`` for the collected requests.
    :raises ValueError: if a node or CPU field of a matching job is not an integer.
    """
    log.info("Computing number of required nodes for submitted jobs")
    command = "/opt/slurm/bin/squeue -r -h -o '%i-%t-%D-%C-%r'"
    # Example output of squeue
    # 1-PD-1-24-Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions
    # 2-PD-1-24-Licenses
    # 3-PD-1-24-PartitionNodeLimit
    # 4-R-1-24-
    output = check_command_output(command, log)
    slots_requested = []
    nodes_requested = []
    for line in output.split("\n"):
        # Limit the split to four separators so that a "-" inside the free-text
        # reason (the last field) does not change the field count; otherwise
        # such a job would fail the length check and vanish without a log line.
        line_arr = line.split("-", 4)
        if len(line_arr) != 5 or line_arr[1] != 'PD':
            continue
        if line_arr[4] in PENDING_RESOURCES_REASONS:
            slots_requested.append(int(line_arr[3]))
            nodes_requested.append(int(line_arr[2]))
        else:
            log.info("Skipping pending job %s due to pending reason: %s",
                     line_arr[0], line_arr[4])

    return get_optimal_nodes(nodes_requested, slots_requested,
                             instance_properties)
コード例 #5
0
 def test_empty_lists(self):
     # With no node requests and no slot requests, zero nodes are expected.
     expected = 0
     nodes = utils.get_optimal_nodes([], [], instance_properties)
     failure_message = "test_empty_lists failed. Got %s; Expected: %s" % (nodes, expected)
     self.assertEqual(nodes, expected, failure_message)
コード例 #6
0
 def test_each_node_partial_capacity(self):
     # Four requests, given as parallel lists of node counts and slot counts.
     requested_nodes = [1, 5, 3, 2]
     requested_slots = [6, 35, 1, 1]
     expected = 6
     nodes = utils.get_optimal_nodes(requested_nodes, requested_slots, instance_properties)
     failure_message = "test_each_node_partial_capacity failed: Got %s; Expected: %s" % (nodes, expected)
     self.assertEqual(nodes, expected, failure_message)
コード例 #7
0
 def test_each_node_one_vcpu_except_max(self):
     # Three requests; only the 5-node request asks for more than one slot.
     requested_nodes = [1, 5, 3]
     requested_slots = [1, 40, 1]
     expected = 8
     nodes = utils.get_optimal_nodes(requested_nodes, requested_slots, instance_properties)
     failure_message = "test_each_node_one_vcpu_except_max failed: Got %s; Expected: %s" % (nodes, expected)
     self.assertEqual(nodes, expected, failure_message)
コード例 #8
0
 def test_each_node_half_capacity(self):
     # Three requests given as parallel lists of node counts and slot counts;
     # five nodes are expected for them.
     nodes = utils.get_optimal_nodes([1, 5, 3], [4, 20, 12], instance_properties)
     expected = 5
     # The failure message names this test (it previously said "test_exact_fit",
     # which pointed at the wrong test when the assertion failed).
     self.assertEqual(nodes, expected, "test_each_node_half_capacity failed: Got %s; Expected: %s" % (nodes, expected))
コード例 #9
0
 def test_only_vcpus(self):
     # A single request for one node but 27 slots; four nodes are expected.
     nodes = utils.get_optimal_nodes([1], [27], instance_properties)
     expected = 4
     # The failure message names this test (it previously said "test_exact_fit",
     # which pointed at the wrong test when the assertion failed).
     self.assertEqual(nodes, expected, "test_only_vcpus failed. Got %s; Expected: %s" % (nodes, expected))
コード例 #10
0
 def test_each_node_at_capacity(self):
     # Three requests given as parallel lists of node counts and slot counts;
     # nine nodes (the sum of the requested node counts) are expected.
     nodes = utils.get_optimal_nodes([1, 5, 3], [8, 40, 24], instance_properties)
     expected = 9
     # The failure message names this test (it previously said "test_exact_fit",
     # which pointed at the wrong test when the assertion failed).
     self.assertEqual(nodes, expected, "test_each_node_at_capacity failed. Got %s; Expected: %s" % (nodes, expected))