Beispiel #1
0
def fetch_hardware():
    '''Autodetect which system we are trying to contact and fetch its
    hardware information.

    Refreshes the module-level ``partition_table`` and ``node_state``
    globals from the ``system`` component.  On Cray systems ``indexes`` is
    also rebuilt, mapping the key of every node whose status is 'busy' to
    that node's name.  If ``system_type`` is still None it is discovered
    from the component and cached in the global.

    Returns the system type that was used for the fetch.

    Raises RuntimeError when the system type is not in bg_types,
    cluster_types or cray_types.
    '''
    global partition_table
    global node_state
    global indexes
    global system_type
    # Generate a list of all possible nodecards in all possible partitions
    system = ComponentProxy('system', defer=True)
    if system_type is None:
        # We are self-discovering.  Can save this step if we
        # already know the system type.
        system_type = system.get_implementation()
    if system_type in bg_types:
        partition_table = dict((part['name'], part['node_card_names'])
            for part in system.get_partitions([{'name': '*',
                                                'node_card_names': '*'}]))
        node_state = {}
    elif system_type in cluster_types:
        partition_table = {}
        node_state = dict((node[0], node[1]) for node in system.get_node_status())
    elif system_type in cray_types:
        partition_table = {}
        # Using JSON for speed and avoiding the XML-RPC marshaller.
        stst = json.loads(system.get_nodes(True, None, alps_system_query_fields, True))
        indexes = {}
        for idx in stst:
            r = stst[idx]
            if r['status'] == 'busy':
                indexes[idx] = r['name']
        # Plain key iteration instead of iteritems() so this works on both
        # Python 2 and Python 3 mappings.
        node_state = dict((idx, stst[idx]['status']) for idx in stst)
    else:
        # Interpolate the offending type; the message previously carried a
        # literal, unformatted '%s'.
        raise RuntimeError('The %s system implementation is not supported by cweb'
                           % system_type)
    return system_type
Beispiel #2
0
        opts, args  = opt_parser.parse_args() 
    except optparse.OptParseError, msg:
        print msg
        print helpmsg
        raise SystemExit, 1

    try:
        system = ComponentProxy("system", defer=False)
    except ComponentLookupError:
        print "Failed to connect to system component"
        raise SystemExit, 1

    whoami = pwd.getpwuid(os.getuid())[0]

    if opts.recursive:
        partdata = system.get_partitions([{'tag':'partition', 'name':name, 'children_list':'*'} for name in args])
        parts = args

        for part in partdata:
            for child in part['children']:
                if child not in parts:
                    parts.append(child)
    else:
        parts = args

    if opts.add:
        args = ([{'tag':'partition', 'name':partname, 'size':"*", 'functional':False,
            'scheduled':False, 'queue':'default', 'deps':[]} for partname in parts])
        parts = system.add_partitions(args, whoami)
    elif opts.delete:
        args = ([{'tag':'partition', 'name':partname} for partname in parts], whoami)
Beispiel #3
0
    try:
        system = ComponentProxy("system", defer=False)
    except ComponentLookupError:
        print "Failed to connect to system"
        raise SystemExit, 1

    try:
        scheduler = ComponentProxy("scheduler", defer=False)
    except ComponentLookupError:
        print "Failed to connect to scheduler"
        raise SystemExit, 1

    spec = [{'tag':'partition', 'name':'*', 'queue':'*', 'state':'*', 'size':'*',
             'functional':'*', 'scheduled':'*', 'children':'*', 'backfill_time':"*", 'draining':"*"}]
    try:
        parts = system.get_partitions(spec)
    except xmlrpclib.Fault, flt:
        if flt.faultCode == NotSupportedError.fault_code:
            print "incompatible with cluster support:  try nodelist"
            raise SystemExit, 1
        else:
            raise

    reservations = scheduler.get_reservations([{'queue':"*", 'partitions':"*", 'active':True}])

    expanded_parts = {}
    for res in reservations:
        for res_part in res['partitions'].split(":"):
            for p in parts:
                if p['name'] == res_part:
                    if expanded_parts.has_key(res['queue']):
Beispiel #4
0
    except getopt.GetoptError, msg:
        print msg
        print helpmsg
        raise SystemExit, 1
    try:
        system = ComponentProxy("system", defer=False)
    except ComponentLookupError:
        print "Failed to connect to system component"
        raise SystemExit, 1

    whoami = getpass.getuser()

    if '-r' in sys.argv:
        partdata = system.get_partitions([{
            'tag': 'partition',
            'name': name,
            'children': '*'
        } for name in args])
        parts = args

        for part in partdata:
            for child in part['children']:
                if child not in parts:
                    parts.append(child)
    else:
        parts = args
    if '-a' in sys.argv:
        func = system.add_partitions
        args = ([{
            'tag': 'partition',
            'name': partname,
Beispiel #5
0
                                     ['dump', 'free', 'load=', 'enable', 'disable', 'activate', 'deactivate',
                                      'queue=', 'deps=', 'xml', 'diag=', 'fail', 'unfail', 'savestate'])
    except getopt.GetoptError, msg:
        print msg
        print helpmsg
        raise SystemExit, 1
    try:
        system = ComponentProxy("system", defer=False)
    except ComponentLookupError:
        print "Failed to connect to system component"
        raise SystemExit, 1

    whoami = getpass.getuser()
    
    if '-r' in sys.argv:
        partdata = system.get_partitions([{'tag':'partition', 'name':name, 'children':'*'} for name in args])
        parts = args
        
        for part in partdata:
            for child in part['children']:
                if child not in parts:
                    parts.append(child)
    else:
        parts = args
    if '-a' in sys.argv:
        func = system.add_partitions
        args = ([{'tag':'partition', 'name':partname, 'size':"*", 'functional':False,
                  'scheduled':False, 'queue':'default', 'deps':[]} for partname in parts], whoami)
    elif '-d' in sys.argv:
        func = system.del_partitions
        args = ([{'tag':'partition', 'name':partname} for partname in parts], whoami)
Beispiel #6
0
    def test_something(self):
        """Exercise the queue-manager and system components end to end.

        Steps, in order:
          * add a "default" queue and check that adding a job to an unknown
            queue raises xmlrpclib.Fault;
          * add every partition named in ``self.system._partitions`` to the
            system component and mark it functional and scheduled;
          * run a /bin/ls job and wait for it to leave the active state;
          * run a second job, delete it while it is active and wait for it
            to disappear.

        Failures are reported by raising AssertionError with a message so
        that they still fire (and the polling loops still terminate) when
        Python runs with -O, which strips ``assert`` statements.

        NOTE(review): both components are reached through ComponentProxy
        and the test sleeps 20 seconds per job, so it needs running
        services and is slow -- confirm it is only run as an integration
        test.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            raise AssertionError("failed to connect to queue manager")

        # add a queue
        queues = cqm.add_queues([{"tag": "queue", "name": "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            jobs = cqm.add_jobs([{"tag": "job", "queue": "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            raise AssertionError("Adding job to non-existent queue should raise xmlrpclib.Fault")

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            raise AssertionError("failed to connect to simulator")

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{"tag": "partition", "name": part_name, "queue": "default"}])
            assert len(partitions) == 1
            partitions = simulator.set_partitions(
                [{"tag": "partition", "name": part_name}], {"functional": True, "scheduled": True}
            )
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{"name": "*", "size": "*", "queue": "*"}])
        assert len(partitions) > 0

        # now run a real job
        #
        # 1. add the job to the default queue
        # 2. obtain a partition for it to run on
        # 3. start running it on that partition
        # 4. check that it started running
        # 5. sleep for a bit, and then check that it's still running
        # 6. sleep some more and then check to see if it actually finished running

        nodes = partitions[0]["size"]
        jobs = cqm.add_jobs(
            [
                {
                    "queue": "default",
                    "mode": "co",
                    "command": "/bin/ls",
                    "outputdir": os.getcwd(),
                    "walltime": 4,
                    "nodes": nodes,
                    "procs": nodes,
                    "args": [],
                    "user": "******",
                    "jobid": "*",
                }
            ]
        )
        assert len(jobs) == 1

        job = jobs[0]
        jobid = job["jobid"]
        job_location_args = [
            {
                "jobid": jobid,
                "nodes": job["nodes"],
                "queue": job["queue"],
                "utility_score": 1,
                "threshold": 1,
                "walltime": job["walltime"],
                "attrs": {},
            }
        ]
        locations = simulator.find_job_location(job_location_args, [])
        # "in" instead of the deprecated dict.has_key()
        assert jobid in locations

        location = locations[jobid]
        cqm.run_jobs([{"jobid": jobid}], location)

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if not r:
            raise AssertionError("the job didn't start")

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        # poll every 5 seconds, giving up after 240 seconds
        start_time = time.time()
        while True:
            r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
            if r:
                if time.time() - start_time > 240:
                    raise AssertionError("the job seems to have run overtime")
                else:
                    time.sleep(5)
            else:
                break

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        nodes = partitions[0]["size"]
        jobs = cqm.add_jobs(
            [
                {
                    "queue": "default",
                    "mode": "co",
                    "command": "/bin/ls",
                    "outputdir": os.getcwd(),
                    "walltime": 4,
                    "nodes": nodes,
                    "procs": nodes,
                    "args": [],
                    "user": "******",
                    "jobid": "*",
                }
            ]
        )
        assert len(jobs) == 1

        job = jobs[0]
        jobid = job["jobid"]
        job_location_args = [
            {
                "jobid": jobid,
                "nodes": job["nodes"],
                "queue": job["queue"],
                "utility_score": 1,
                "threshold": 1,
                "walltime": job["walltime"],
                "attrs": {},
            }
        ]
        locations = simulator.find_job_location(job_location_args, [])
        assert jobid in locations

        location = locations[jobid]
        cqm.run_jobs([{"jobid": jobid}], location)

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if not r:
            raise AssertionError("the job didn't start")

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "is_active": True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        cqm.del_jobs([{"jobid": jobid}])

        # poll every second, giving up after 30 seconds
        start_time = time.time()
        while True:
            r = cqm.get_jobs([{"jobid": jobid, "is_active": True, "state": "*"}])
            if r:
                if time.time() - start_time > 30:
                    raise AssertionError("the job didn't die when asked to")
                else:
                    time.sleep(1)
            else:
                break
    def test_something(self):
        """Exercise the queue-manager and system components end to end.

        Adds a "default" queue, checks that adding a job to an unknown
        queue raises xmlrpclib.Fault, adds every partition named in
        ``self.system._partitions`` to the system component, then runs one
        /bin/ls job until it is no longer active and deletes a second one
        while it is active.

        The submit/place/start sequence and the "wait until the job is no
        longer active" loop are shared by both jobs and live in local
        helpers.  Failures are raised as AssertionError with a message
        instead of ``assert not "..."`` so they survive ``python -O`` and
        carry the message.

        NOTE(review): both components are reached through ComponentProxy
        and the test sleeps 20 seconds per job -- confirm it is only run
        against running services as an integration test.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            raise AssertionError("failed to connect to queue manager")

        # add a queue
        queues = cqm.add_queues([{'tag': "queue", 'name': "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            cqm.add_jobs([{'tag': "job", 'queue': "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            raise AssertionError("Adding job to non-existent queue should raise xmlrpclib.Fault")

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            raise AssertionError("failed to connect to simulator")

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{
                'tag': "partition",
                'name': part_name,
                'queue': "default"
            }])
            assert len(partitions) == 1
            partitions = simulator.set_partitions([{
                'tag': "partition",
                'name': part_name
            }], {
                'functional': True,
                'scheduled': True
            })
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{
            'name': "*",
            'size': "*",
            'queue': "*"
        }])
        assert len(partitions) > 0

        def start_job():
            # Add a /bin/ls job sized to the first partition, find a
            # location for it, start it and return its jobid.
            nodes = partitions[0]['size']
            jobs = cqm.add_jobs([{
                'queue': "default",
                'mode': "co",
                'command': "/bin/ls",
                'outputdir': os.getcwd(),
                'walltime': 4,
                'nodes': nodes,
                'procs': nodes,
                'args': [],
                'user': "******",
                'jobid': "*"
            }])
            assert len(jobs) == 1

            job = jobs[0]
            jobid = job['jobid']
            locations = simulator.find_job_location([{
                'jobid': jobid,
                'nodes': job['nodes'],
                'queue': job['queue'],
                'utility_score': 1,
                'threshold': 1,
                'walltime': job['walltime'],
                'attrs': {}
            }], [])
            # "in" instead of the deprecated dict.has_key()
            assert jobid in locations

            cqm.run_jobs([{'jobid': jobid}], locations[jobid])

            if not cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}]):
                raise AssertionError("the job didn't start")
            return jobid

        def wait_until_inactive(jobid, timeout, poll_interval, failure):
            # Poll until the queue manager no longer reports the job as
            # active; raise AssertionError(failure) once timeout seconds
            # have elapsed.
            start_time = time.time()
            while cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}]):
                if time.time() - start_time > timeout:
                    raise AssertionError(failure)
                time.sleep(poll_interval)

        # now run a real job
        #
        # 1. add the job to the default queue
        # 2. obtain a partition for it to run on
        # 3. start running it on that partition
        # 4. check that it started running
        # 5. sleep for a bit, and then check that it's still running
        # 6. sleep some more and then check to see if it actually finished running
        jobid = start_job()

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        wait_until_inactive(jobid, 240, 5, "the job seems to have run overtime")

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        jobid = start_job()

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'is_active': True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        cqm.del_jobs([{'jobid': jobid}])

        wait_until_inactive(jobid, 30, 1, "the job didn't die when asked to")
Beispiel #8
0
        print msg
        print helpmsg
        raise SystemExit, 1

    try:
        system = ComponentProxy("system", defer=False)
    except ComponentLookupError:
        print "Failed to connect to system component"
        raise SystemExit, 1

    whoami = pwd.getpwuid(os.getuid())[0]

    if opts.recursive:
        partdata = system.get_partitions([{
            'tag': 'partition',
            'name': name,
            'children_list': '*'
        } for name in args])
        parts = args

        for part in partdata:
            for child in part['children']:
                if child not in parts:
                    parts.append(child)
    else:
        parts = args

    if opts.add:
        args = ([{
            'tag': 'partition',
            'name': partname,