Ejemplo n.º 1
0
Archivo: cweb.py Proyecto: ido/cobalt
def cobalt_query(state):
    """Query Cobalt for reservations or for jobs in a given state.

    Parameters:
        state: one of 'running', 'queued' or 'reservation'.

    Returns:
        * ``None`` when ``state`` is anything else (this includes 'starting',
          which the validation below has always rejected);
        * the result of ``scheduler.get_reservations`` for 'reservation';
        * the result of ``cqm.get_jobs`` for 'running' and 'queued'.

    The state is validated before any component proxy or query template is
    built, so an unsupported state never touches the Cobalt components.
    """
    if state not in ('running', 'queued', 'reservation'):
        return None

    # Templates for queries to Cobalt: every known field is requested with
    # the '*' wildcard.
    if state == 'reservation':
        scheduler = ComponentProxy('scheduler', defer=True)
        query_res = dict.fromkeys(reservation_query_fields, '*')
        return scheduler.get_reservations([query_res])

    cqm = ComponentProxy('queue-manager', defer=True)
    query_job = dict.fromkeys(job_query_fields, '*')
    # Only 'running' and 'queued' can reach this point.
    query_job['state'] = state
    if state == 'running':
        query_job['location'] = '*'
    else:
        query_job['score'] = '*'
    return cqm.get_jobs([query_job])
Ejemplo n.º 2
0
        cqm = ComponentProxy("queue-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to queue manager"
        sys.exit(1)

    for i in range(len(args)):
        if args[i] == '*':
            continue
        try:
            args[i] = int(args[i])
        except:
            logger.error("jobid must be an integer")
            raise SystemExit, 1

    if opts['start']:
        query = [{
            'tag': 'job',
            'jobid': jid,
            'is_active': False,
            'has_completed': False
        } for jid in args]
    else:
        query = [{'tag': 'job', 'jobid': jid} for jid in args]

    while True:
        response = cqm.get_jobs(query)
        if len(response) == 0:
            raise SystemExit, 0
        else:
            time.sleep(2)
Ejemplo n.º 3
0
                    query.append({'tag':'job', 'jobid':int(n), 'queue':'*'})
        except ValueError:
            print "%s is not a valid jobid or queue name" % n
            sys.exit(2)
        for q in query:
            for h in long_header:
                if h == 'JobName':
                    q.update({'outputpath':'*'})
                elif h not in ['JobID', 'Queue']:
                    q.update({h.lower():'*'})
                if h in query_dependencies.keys():
                    for x in query_dependencies[h]:
                        if x not in header:
                            q.update({x.lower():'*'})
            q["user"] = user_name
        response = cqm.get_jobs(query)
        
    if len(args) and not response:
        sys.exit(1)

    if opts['Q']:
        for q in response:
            if q['maxtime'] is not None:
                q['maxtime'] = "%02d:%02d:00" % (divmod(int(q['maxtime']), 60))
            if q['mintime'] is not None:
                q['mintime'] = "%02d:%02d:00" % (divmod(int(q['mintime']), 60))
        output = [[q[x] for x in [y.lower() for y in header]] for q in response]
    else:
        if response:
            maxjoblen = max([len(str(item.get('jobid'))) for item in response])
            jobidfmt = "%%%ss" % maxjoblen
Ejemplo n.º 4
0
    try:
        pm = ComponentProxy("process-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to process manager"
        sys.exit(1)

    r = pm.add_jobs([{
        'tag': 'process-group',
        'user': user,
        'args': [],
        'env': {},
        'executable': '/tmp/testscript',
        'size': 700,
        'cwd': '/tmp',
        'location': ['ANLR00'],
        'outputfile': '/tmp/test1-output',
        'errorfile': '/tmp/test1-error',
        'id': '*'
    }])
    print "jobs : " + ` len(r) `
    pgid = r[0]['id']
    while True:
        r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
        state = r[0]['state']
        if state == 'running':
            time.sleep(5)
            continue
        else:
            break
    print "process group %s has completed" % (pgid)
Ejemplo n.º 5
0
        try:
            args[i] = int(args[i])
            all_jobs.add(args[i])
        except:
            logger.error("jobid must be an integer")
            raise SystemExit, 1

    check_specs = [{
        'tag': 'job',
        'user': user,
        'jobid': jobid,
        'user_hold': '*'
    } for jobid in args]

    try:
        check_response = cqm.get_jobs(check_specs)
    except xmlrpclib.Fault, flt:
        print flt.faultString
        raise SystemExit, 1

    jobs_existed = [j.get('jobid') for j in check_response]
    all_jobs = all_jobs.union(set(jobs_existed))
    update_specs = [{
        'tag': 'job',
        'user': user,
        'jobid': jobid,
        'user_hold': "*",
        'is_active': "*"
    } for jobid in jobs_existed]
    updates = {'user_hold': True}
Ejemplo n.º 6
0
        try:
            args[i] = int(args[i])
        except:
            logger.error("jobid must be an integer")
            raise SystemExit, 1
        
    spec = [{'tag':'job', 'user':user, 'jobid':jobid, 'project':'*', 'notify':'*',
             'walltime':'*', 'queue':'*', 'procs':'*', 'nodes':'*'} for jobid in args]

    try:
        filters = CP.get('cqm', 'filters').split(':')
    except ConfigParser.NoOptionError:
        filters = []

    try:
        jobdata = cqm.get_jobs(spec)
    except xmlrpclib.Fault, flt:
        print flt.faultString
        raise SystemExit, 1

    if not jobdata:
        print "Failed to match any jobs"
        sys.exit(1)

    response = []
    for jobinfo in jobdata:
        original_spec = jobinfo.copy()
        jobinfo.update({'queue': queue})
        for filt in filters:
            Cobalt.Util.processfilter(filt, jobinfo)
        try:
Ejemplo n.º 7
0
from Cobalt.Exceptions import ComponentLookupError
import Cobalt.Util

if __name__ == '__main__':
    # Manual smoke test: submit one process group to the process manager and
    # poll until it is no longer reported as 'running'.

    # 20 by default, 10 when -d is given (these match the stdlib logging
    # INFO/DEBUG values; presumably setup_logging interprets them that way).
    level = 20
    if '-d' in sys.argv:
        level = 10
    # Pass the computed level; a literal 0 used to be passed here, which made
    # the -d flag a no-op.
    Cobalt.Logging.setup_logging('cmd', to_syslog=False, level=level)

    user = pwd.getpwuid(os.getuid())[0]
    try:
        pm = ComponentProxy("process-manager", defer=False)
    except ComponentLookupError:
        sys.stderr.write("Failed to connect to process manager\n")
        sys.exit(1)

    r = pm.add_jobs([{
        'tag': 'process-group',
        'user': user,
        'args': [],
        'env': {},
        'executable': '/tmp/testscript',
        'size': 700,
        'cwd': '/tmp',
        'location': ['ANLR00'],
        'outputfile': '/tmp/test1-output',
        'errorfile': '/tmp/test1-error',
        'id': '*',
    }])
    print("jobs : " + repr(len(r)))
    if not r:
        # Nothing was created, so there is no process group id to poll.
        sys.stderr.write("process manager did not create a process group\n")
        sys.exit(1)

    pgid = r[0]['id']
    while True:
        r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
        state = r[0]['state']
        if state != 'running':
            break
        Cobalt.Util.sleep(5)
    print("process group %s has completed" % (pgid))
        
Ejemplo n.º 8
0
    if opts['held']:
        query['state'] = opts['held']
    else:
        query['state'] = '*'

    if opts['queue']:
        query['queue'] = opts['queue']
    else:
        query['queue'] = '*'

    try:
        cqm = ComponentProxy("queue-manager", defer=False)

        query['tag'] = 'job'
        query['jobid'] = '*'
        response = cqm.get_jobs([query])

    except ComponentLookupError:
        logger.error("Can't connect to the queue manager")
        sys.exit(1)
    #except:
    #$logger.error("Error querying jobs")
    #sys.exit(1)
    # log jobid to stdout

    if not response:
        Cobalt.Logging.logging.error("Failed to match any jobs")
    else:
        Cobalt.Logging.logging.debug(response)
        print "   The following jobs matched your query:"
        for job in response:
Ejemplo n.º 9
0
            logger.error("node count out of realistic range")
            sys.exit(1)
        updates['nodes'] = opts['nodecount']
    # ensure time is actually in minutes
    if opts['time']:
        if opts['time'][0] in ['+', '-']:
            try:
                minutes = Cobalt.Util.get_time(opts['time'][1:])
            except Cobalt.Exceptions.TimeFormatError, e:
                print "invalid time specification: %s" % e.args[0]
                sys.exit(1)

            jobdata = None
            try:
                cqm = ComponentProxy("queue-manager", defer=False)
                jobdata = cqm.get_jobs(spec)
            except ComponentLookupError:
                print >> sys.stderr, "Failed to connect to queue manager"
                sys.exit(1)
            if not jobdata:
                print "Failed to match any jobs"
                sys.exit(1)

            if opts['time'][0] == '-':
                new_time = float(jobdata[0]['walltime']) - minutes
                if new_time <= 0:
                    print >> sys.stderr, "invalid wall time: ", new_time
                else:
                    updates['walltime'] = str(
                        float(jobdata[0]['walltime']) - minutes)
            elif opts['time'][0] == '+':
Ejemplo n.º 10
0
    if opts["held"]:
        query["state"] = opts["held"]
    else:
        query["state"] = "*"

    if opts["queue"]:
        query["queue"] = opts["queue"]
    else:
        query["queue"] = "*"

    try:
        cqm = ComponentProxy("queue-manager", defer=False)

        query["tag"] = "job"
        query["jobid"] = "*"
        response = cqm.get_jobs([query])

    except ComponentLookupError:
        logger.error("Can't connect to the queue manager")
        sys.exit(1)
    # except:
    # $logger.error("Error querying jobs")
    # sys.exit(1)
    # log jobid to stdout

    if not response:
        Cobalt.Logging.logging.error("Failed to match any jobs")
    else:
        Cobalt.Logging.logging.debug(response)
        print "   The following jobs matched your query:"
        for job in response:
Ejemplo n.º 11
0
    def test_something(self):
        """Drive the queue manager end to end against the "system" component.

        The test:
          1. creates the "default" queue;
          2. checks that submitting to an unknown queue raises xmlrpclib.Fault;
          3. registers every partition named in ``self.system._partitions``
             and marks it functional and scheduled;
          4. runs one job and waits for it to stop being active;
          5. runs a second job and deletes it while it is still active.

        Failures that used to be written as ``assert not "message"`` are
        raised as ``AssertionError("message")`` so the message is reported
        and the check is not stripped when Python runs with -O.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            raise AssertionError("failed to connect to queue manager")

        # add a queue
        queues = cqm.add_queues([{"tag": "queue", "name": "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            cqm.add_jobs([{"tag": "job", "queue": "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            raise AssertionError("Adding job to non-existent queue should raise xmlrpclib.Fault")

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            raise AssertionError("failed to connect to simulator")

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{"tag": "partition", "name": part_name, "queue": "default"}])
            assert len(partitions) == 1
            partitions = simulator.set_partitions(
                [{"tag": "partition", "name": part_name}], {"functional": True, "scheduled": True}
            )
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{"name": "*", "size": "*", "queue": "*"}])
        assert len(partitions) > 0

        def submit_and_start():
            """Queue a /bin/ls job sized to the first partition, start it, return its jobid."""
            nodes = partitions[0]["size"]
            jobs = cqm.add_jobs(
                [
                    {
                        "queue": "default",
                        "mode": "co",
                        "command": "/bin/ls",
                        "outputdir": os.getcwd(),
                        "walltime": 4,
                        "nodes": nodes,
                        "procs": nodes,
                        "args": [],
                        "user": "******",
                        "jobid": "*",
                    }
                ]
            )
            assert len(jobs) == 1

            job = jobs[0]
            jobid = job["jobid"]
            job_location_args = [
                {
                    "jobid": jobid,
                    "nodes": job["nodes"],
                    "queue": job["queue"],
                    "utility_score": 1,
                    "threshold": 1,
                    "walltime": job["walltime"],
                    "attrs": {},
                }
            ]
            locations = simulator.find_job_location(job_location_args, [])
            assert jobid in locations

            cqm.run_jobs([{"jobid": jobid}], locations[jobid])

            if not cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}]):
                raise AssertionError("the job didn't start")
            return jobid

        def wait_until_inactive(jobid, timeout, poll_interval, failure_message):
            """Poll until the job is no longer active; fail once ``timeout`` seconds have passed."""
            start_time = time.time()
            while cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}]):
                if time.time() - start_time > timeout:
                    raise AssertionError(failure_message)
                time.sleep(poll_interval)

        # now run a real job
        #
        # 1. add the job to the default queue
        # 2. obtain a partition for it to run on
        # 3. start running it on that partition
        # 4. check that it started running
        # 5. sleep for a bit, and then check that it's still running
        # 6. sleep some more and then check to see if it actually finished running
        jobid = submit_and_start()

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        wait_until_inactive(jobid, 240, 5, "the job seems to have run overtime")

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        jobid = submit_and_start()

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "is_active": True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        cqm.del_jobs([{"jobid": jobid}])

        wait_until_inactive(jobid, 30, 1, "the job didn't die when asked to")
Ejemplo n.º 12
0
    def test_something(self):
        """Drive the queue manager end to end against the "system" component.

        The test:
          1. creates the "default" queue;
          2. checks that submitting to an unknown queue raises xmlrpclib.Fault;
          3. registers every partition named in ``self.system._partitions``
             and marks it functional and scheduled;
          4. runs one job and waits for it to stop being active;
          5. runs a second job and deletes it while it is still active.

        Failures that used to be written as ``assert not "message"`` are
        raised as ``AssertionError("message")`` so the message is reported
        and the check is not stripped when Python runs with -O.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            raise AssertionError("failed to connect to queue manager")

        # add a queue
        queues = cqm.add_queues([{'tag': "queue", 'name': "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            cqm.add_jobs([{'tag': "job", 'queue': "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            raise AssertionError("Adding job to non-existent queue should raise xmlrpclib.Fault")

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            raise AssertionError("failed to connect to simulator")

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{
                'tag': "partition",
                'name': part_name,
                'queue': "default"
            }])
            assert len(partitions) == 1
            partitions = simulator.set_partitions([{
                'tag': "partition",
                'name': part_name
            }], {
                'functional': True,
                'scheduled': True
            })
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{
            'name': "*",
            'size': "*",
            'queue': "*"
        }])
        assert len(partitions) > 0

        def submit_and_start():
            """Queue a /bin/ls job sized to the first partition, start it, return its jobid."""
            nodes = partitions[0]['size']
            jobs = cqm.add_jobs([{
                'queue': "default",
                'mode': "co",
                'command': "/bin/ls",
                'outputdir': os.getcwd(),
                'walltime': 4,
                'nodes': nodes,
                'procs': nodes,
                'args': [],
                'user': "******",
                'jobid': "*"
            }])
            assert len(jobs) == 1

            job = jobs[0]
            jobid = job['jobid']
            job_location_args = [{
                'jobid': jobid,
                'nodes': job['nodes'],
                'queue': job['queue'],
                'utility_score': 1,
                'threshold': 1,
                'walltime': job['walltime'],
                'attrs': {}
            }]
            locations = simulator.find_job_location(job_location_args, [])
            assert jobid in locations

            cqm.run_jobs([{'jobid': jobid}], locations[jobid])

            if not cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}]):
                raise AssertionError("the job didn't start")
            return jobid

        def wait_until_inactive(jobid, timeout, poll_interval, failure_message):
            """Poll until the job is no longer active; fail once ``timeout`` seconds have passed."""
            start_time = time.time()
            while cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}]):
                if time.time() - start_time > timeout:
                    raise AssertionError(failure_message)
                time.sleep(poll_interval)

        # now run a real job
        #
        # 1. add the job to the default queue
        # 2. obtain a partition for it to run on
        # 3. start running it on that partition
        # 4. check that it started running
        # 5. sleep for a bit, and then check that it's still running
        # 6. sleep some more and then check to see if it actually finished running
        jobid = submit_and_start()

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        wait_until_inactive(jobid, 240, 5, "the job seems to have run overtime")

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        jobid = submit_and_start()

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'is_active': True}])
        if len(r) != 1:
            raise AssertionError("the job has stopped running prematurely")

        cqm.del_jobs([{'jobid': jobid}])

        wait_until_inactive(jobid, 30, 1, "the job didn't die when asked to")
Ejemplo n.º 13
0
    Cobalt.Logging.setup_logging('cobalt-mpirun', to_syslog=False, level=level)
    logger = logging.getLogger('cobalt-mpirun')

    try:
        os.environ["COBALT_JOBID"] = os.environ["COBALT_JOBID"]
    except KeyError:
        logger.error("cobalt-mpirun must be invoked by a script submitted to cobalt.")
        raise SystemExit, 1

    try:
        cqm = ComponentProxy("queue-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to queue manager"
        sys.exit(1)
        
    response = cqm.get_jobs([{'tag':'job', 'jobid':int(os.environ["COBALT_JOBID"]), 'state':'*', 'procs':'*', 'location':'*', 'walltime':'*', 'outputdir':'*'}])
    if len(response) == 0:
        logger.error("Error: cqm did not find a job with id " + os.environ["COBALT_JOBID"])
        raise SystemExit, 1
    if len(response) > 1:
        logger.error("Error: cqm did not find a unique job with id " + os.environ["COBALT_JOBID"])
        raise SystemExit, 1
         
    j = response[0]
    if j['location'] is None:
        logger.error("Error: cobalt-mpirun's parent is in state '%s' and has not specified a partition." % j['state'])
        raise SystemExit, 1
#    j['location'] = "ANLR00"
    
    arglist = ['-partition', j['location'][0]] + arglist
Ejemplo n.º 14
0
    all_jobs = set()
    for i in range(len(args)):
        if args[i] == '*':
            continue
        try:
            args[i] = int(args[i])
            all_jobs.add(args[i])
        except:
            logger.error("jobid must be an integer")
            sys.exit(1)
        
    check_specs = [{'tag':'job', 'user':user, 'jobid':jobid, 'user_hold':'*'} for jobid in args]

    try:
        check_response = cqm.get_jobs(check_specs)
    except xmlrpclib.Fault, flt:
        print flt.faultString
        raise SystemExit, 1

    jobs_existed = [j.get('jobid') for j in check_response]
    all_jobs = all_jobs.union(set(jobs_existed))
    update_specs = [{'tag':'job', 'user':user, 'jobid':jobid, 'user_hold':"*", 'is_active':"*"} for jobid in jobs_existed]
    
    if opt.deps:
        updates = {'all_dependencies': []}
    else:
        updates = {'user_hold':False}

    try:
        update_response = cqm.set_jobs(update_specs, updates, user)
Ejemplo n.º 15
0
    Cobalt.Logging.setup_logging('cobalt-mpirun', to_syslog=False, level=level)
    logger = logging.getLogger('cobalt-mpirun')

    try:
        os.environ["COBALT_JOBID"] = os.environ["COBALT_JOBID"]
    except KeyError:
        logger.error("cobalt-mpirun must be invoked by a script submitted to cobalt.")
        raise SystemExit, 1

    try:
        cqm = ComponentProxy("queue-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to queue manager"
        sys.exit(1)
        
    response = cqm.get_jobs([{'tag':'job', 'jobid':int(os.environ["COBALT_JOBID"]), 'state':'*', 'procs':'*', 'location':'*', 'walltime':'*', 'outputdir':'*'}])
    if len(response) == 0:
        logger.error("Error: cqm did not find a job with id " + os.environ["COBALT_JOBID"])
        raise SystemExit, 1
    if len(response) > 1:
        logger.error("Error: cqm did not find a unique job with id " + os.environ["COBALT_JOBID"])
        raise SystemExit, 1
         
    j = response[0]
    if j['location'] is None:
        logger.error("Error: cobalt-mpirun's parent is in state '%s' and has not specified a partition." % j['state'])
        raise SystemExit, 1
#    j['location'] = "ANLR00"
    
    arglist = ['-partition', j['location'][0]] + arglist