Exemple #1
0
    def __init__(self, port):
        """! @brief set up the hq-exec-server

        Initialises the base server with the exec-server handler and request
        processor, looks up the database ids of this host and of the current
        user and registers the periodic status loop.

        @param port port which is handed to the base-class constructor

        Writes a message to stderr and exits with status -1 if no Host entry
        with full_name == self.host can be fetched from the database.
        """
        handler = hQExecServerHandler
        processor = hQExecServerRequestProcessor()

        self.user = USER

        super(hQExecServer, self).__init__(port, handler, processor)

        # connect to database
        dbconnection = hQDBConnection()

        # get database id of host
        # NOTE(review): self.host is presumably set by the base-class constructor -- confirm
        try:
            self.host_id = dbconnection.query(db.Host.id).filter(db.Host.full_name == self.host).one()[0]
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt/SystemExit are not swallowed
            sys.stderr.write("Host is not in cluster!")
            sys.exit(-1)

        # set interval for loop of calling loop functions
        self.loops = {
            "print_status": {
                "fct": self.print_status,
                "kwargs": {"short": True, "remove_connection": True},
                "interval": 5,
                "description": "print periodically status of server",
            }
        }

        # flags which indicate running processes
        self.printing_status = threading.Event()

        # one() yields a result row; take its only column so that user_id is the plain id,
        # in the same way as host_id above
        self.user_id = dbconnection.query(db.User.id).filter(db.User.name == self.user).one()[0]
Exemple #2
0
    def process_findjobs( self, request, match_str ):
        """ ! @brief process 'findjobs' command

        Selects all jobs whose command, info text or group matches
        '%<match_str>%' via ilike and sends one line per job to the requester.
        If nothing matches, "no jobs found" is sent instead.

        @param request object whose send() receives the response
        @param match_str string which is searched for
        """

        session = hQDBConnection()

        # the same pattern is applied to all three columns
        pattern = '%{s}%'.format(s=match_str)
        criteria = [ column.ilike( pattern ) for column in ( db.Job.command,
                                                             db.Job.info_text,
                                                             db.Job.group ) ]

        found = session.query( db.Job ).filter( or_( *criteria ) ).all()

        if not found:
            request.send("no jobs found")
            return

        lineTemplate = "{i:3d} - [jobid:{id}] [user:{user}] [status:{status}] [group:{group}] [info:{info}] [command:{command}{dots}]"

        lines = [ "Matching jobs", "-------------" ]
        for position,entry in enumerate(found):
            # commands are cut after 30 characters; dots mark the cut
            if len(entry.command)>30:
                dots = "..."
            else:
                dots = ""

            lines.append( lineTemplate.format( i=position,
                                               id=entry.id,
                                               user=entry.user.name,
                                               status=entry.job_details.job_status.name,
                                               group=entry.group,
                                               info=entry.info_text,
                                               command=entry.command[:30],
                                               dots=dots ) )

        request.send( '\n'.join( lines ) )
Exemple #3
0
    def init_database_ids( self ):
        """cache the ids of all job statuses in the dictionary self.database_ids.

        The dictionary maps db.JobStatus.name to db.JobStatus.id.
        """

        session = hQDBConnection()

        # every row is a (name, id) pair
        statusRows = session.query( db.JobStatus.name, db.JobStatus.id ).all()

        self.database_ids = { name: statusID for name, statusID in statusRows }
Exemple #4
0
    def get_status(self, remove_connection=True):
        """! @brief get status of server from database

        Counts the jobs of user self.user_id grouped by job status and sums up the
        slots of all hosts whose host summary is flagged active.

        @param remove_connection (bool) if True the database connection is removed before
               returning (also when a query fails); if False it is left untouched

        @return (dict) with the keys "hosts", "oSlots", "tSlots", "wJobs", "pJobs",
                "rJobs" and "fJobs"
        """
        dbconnection = hQDBConnection()

        try:
            self.logger.write("print status: request database about status", logCategory="debug")

            # get all number of jobs for each status type; an empty result simply gives {}
            counts = dict(
                dbconnection.query(db.JobStatus.name, func.count("*"))
                .join(db.JobDetails)
                .join(db.Job)
                .filter(db.Job.user_id == self.user_id)
                .group_by(db.JobStatus.name)
                .all()
            )

            self.logger.write("print status: get slot info", logCategory="debug")
            numHosts, totalSlots, occupiedSlots = (
                dbconnection.query(
                    func.count("*"),
                    func.sum(db.Host.max_number_occupied_slots),
                    func.sum(db.HostSummary.number_occupied_slots),
                )
                .select_from(db.Host)
                .join(db.HostSummary, db.HostSummary.host_id == db.Host.id)
                .filter(db.HostSummary.active == True)
                .one()
            )
        finally:
            if remove_connection:
                # connection has to be removed. otherwise calling hQDBConnection returns (in the
                # same thread) the same connection which doesn't see recent updates
                dbconnection.remove()

        if not numHosts:
            # no active host: report zeros instead of the empty sums
            numHosts, totalSlots, occupiedSlots = 0, 0, 0

        return {
            "hosts": numHosts,
            "oSlots": occupiedSlots or 0,
            "tSlots": totalSlots or 0,
            "wJobs": counts.get("waiting", 0),
            "pJobs": counts.get("pending", 0),
            "rJobs": counts.get("running", 0),
            "fJobs": counts.get("finished", 0),
        }
Exemple #5
0
    def process_invokeservers( self, request):
        """ ! @brief ask the server for a hq-exec-server on every usable host

        A host is used if its host summary is flagged available, reachable and
        active. For every host on which self.server.get_exec_server returns
        nothing an error is logged. "done" is sent to the requester at the end.

        @param request object whose send() receives the response
        """
        session = hQDBConnection()

        usable = and_( db.HostSummary.available==True,
                       db.HostSummary.reachable==True,
                       db.HostSummary.active==True )

        hostQuery = session.query( db.Host ).join( db.HostSummary ).filter( usable )

        for entry in hostQuery.all():
            entryID = entry.id
            entryName = entry.full_name

            if self.server.get_exec_server( entryID, entryName ):
                continue

            self.writeLog( "... could not start a hq-exec-server on {h}!\n".format(h=entryName),
                           logCategory='error')

        request.send("done")
Exemple #6
0
    def process_lsgroups( self, request ):
        """ ! @brief process 'lsgroups' command

        Sends, for every group of user self.TMS.userID which has at least one
        waiting, pending, running or finished job, the number of jobs per status
        and the fraction of finished jobs. If there is nothing to report,
        "no groups found" is sent.

        @param request object whose send() receives the response
        """

        session = hQDBConnection()

        groupRows = session.query( db.Job.group )\
                    .filter( db.Job.user_id==self.TMS.userID )\
                    .distinct()\
                    .all()

        row = "{s:>20} : {value}\n"
        parts = []

        for ( name, ) in groupRows:
            # number of jobs for each status type within this group
            statusQuery = session.query( db.JobStatus.name, func.count('*') )\
                          .join( db.JobDetails, db.JobDetails.job_status_id==db.JobStatus.id )\
                          .join( db.Job, db.Job.id==db.JobDetails.job_id )\
                          .filter( and_(db.Job.user_id==self.TMS.userID, db.Job.group==name) )\
                          .group_by( db.JobStatus.name )
            perStatus = dict( statusQuery.all() )

            waiting = perStatus.get('waiting',0)
            pending = perStatus.get('pending',0)
            running = perStatus.get('running',0)
            finished = perStatus.get('finished',0)

            total = waiting + pending + running + finished
            if total == 0:
                # group without any job in one of the four states
                continue

            parts.append( row.format(s="group", value=name ) )
            parts.append( row.format(s="waiting jobs", value=waiting ) )
            parts.append( row.format(s="pending jobs", value=pending ) )
            parts.append( row.format(s="running jobs", value=running ) )
            parts.append( row.format(s="finished jobs", value=finished ) )
            parts.append( "{s:>20} : {value:.2%}\n".format(s="progress", value=1.0 * finished/total ) )
            parts.append( "\n" )

        if parts:
            request.send( "".join( parts ) )
        else:
            request.send( "no groups found" )
Exemple #7
0
    def process_lajob( self, request, job_id ):
        """ ! @brief process 'lajob' command

        Sends all stored attributes of a job, its status history and its
        execution details (host, pid, return code) to the requester.
        "unkown job." is sent if job_id is not an integer or if no job with
        this id exists.

        @param request object whose send() receives the response
        @param job_id id of the job (anything int() accepts)
        """

        # validate the id before touching the database, so that a malformed
        # request is answered instead of raising
        try:
            job_id = int( job_id )
        except (TypeError, ValueError):
            request.send("unkown job.")
            return

        # connect to database
        dbconnection = hQDBConnection()

        job = dbconnection.query( db.Job ).get( job_id )

        if not job:
            request.send("unkown job.")
            return

        row = "{s:>20} : {value}\n"

        lines = [ row.format(s=label, value=value)
                  for label, value in ( ( "job id", job.id ),
                                        ( "command", job.command ),
                                        ( "info text", job.info_text ),
                                        ( "group", job.group ),
                                        ( "stdout", job.stdout ),
                                        ( "stderr", job.stderr ),
                                        ( "logfile", job.logfile ),
                                        ( "excludedHosts", job.excluded_hosts ),
                                        ( "slots", job.slots ) ) ]

        # only the first history line carries the label
        for idx,hist in enumerate(job.job_history):
            label = "status" if idx==0 else ""
            lines.append( "{s:>20} : [{t}] {status}\n".format(s=label, t=str(hist.datetime), status=hist.job_status.name ) )

        details = job.job_details

        # a job without an assigned host has no host object to read the name from
        try:
            hostName = details.host.short_name
        except AttributeError:
            hostName = "None"

        lines.append( row.format(s="host", value=hostName ) )
        lines.append( row.format(s="pid", value=details.pid ) )
        lines.append( row.format(s="return code", value=details.return_code ) )

        request.send( "".join( lines ) )
Exemple #8
0
    def __init__(self, port):
        """! @brief set up the hq-user-server

        Initialises the base server with the user-server handler and request
        processor, registers this server via self.register_server() and sets up
        the periodic status loop and the bookkeeping of exec servers.

        @param port port which is handed to the base-class constructor

        Writes a message to stderr and exits with status -1 if the registration
        raises or if it returns a false value.
        """
        handler = hQUserServerHandler
        processor = hQUserServerRequestProcessor()

        self.user=USER

        super( hQUserServer, self ).__init__( port, handler, processor )

        # connect to hq-server and register hq-user-server
        try:
            allowed = self.register_server()
        except Exception:
            sys.stderr.write( "hq server ist not running\nPlease contact your hq administrator." )
            sys.exit( -1 )

        # checked outside of the try block: the SystemExit raised here must not be
        # mistaken for a failed registration
        if not allowed:
            sys.stderr.write( "Your are not alowed to use the hq pacakge.\nPlease contact your hq administrator." )
            sys.exit( -1 )

        # connect to database
        # NOTE(review): the connection is not used in this method; the call is kept in case
        # creating it has side effects -- confirm
        hQDBConnection()

        # set interval for loop of calling loop functions
        self.loops = { 'print_status': { 'fct': self.print_status,
                                         'kwargs': {'short': True, 'remove_connection': True},
                                         'interval': 5,
                                         'description': "print periodically status of server" } }

        self.exec_servers = {}

        # flags which indicate running processes
        self.printing_status = threading.Event()
        self.not_invoking_exec_server = {}
Exemple #9
0
    def process_lss( self, request ):
        """ ! @brief process 'lss' command

        Sends one line (index, host, port, status) for every host in the database
        for which self.server.get_exec_server( ..., do_not_invoke=True ) returns an
        exec server. "no servers known." is sent if there is none.

        @param request object whose send() receives the response
        """

        # connect to database
        dbconnection = hQDBConnection()

        hosts = dict( dbconnection.query( db.Host.id, db.Host.full_name ).all() )

        lines = []
        # idx counts all hosts, also those without an exec server
        for idx,(hostID, host_fullname) in enumerate( hosts.items() ):
            execServer = self.server.get_exec_server( hostID, host_fullname, do_not_invoke=True )

            if not execServer:
                continue

            # only hosts with an exec server are listed, so port and status are always known
            lines.append( "{idx} - [host:{host}] [port:{port}] [status:{status}]\n".format( idx=idx,
                                                                                          host=host_fullname,
                                                                                          port=execServer.port,
                                                                                          status="running" ) )

        if lines:
            request.send( "".join( lines ) )
        else:
            request.send("no servers known.")
Exemple #10
0
    def process_run(self, request, job_id):
        """ ! @brief process 'run' command: execute a job and record the outcome

        The command of the job is executed in a subprocess (with the job's shell, in the
        home directory of the user). Before waiting for the subprocess the job is set to
        running in the database (status, host id, pid, history entry); after the subprocess
        has finished the job is set to finished (status, return code, history entry,
        FinishedJob entry). stdout and stderr of the command are collected in temporary
        files, which are copied to job.stdout / job.stderr and written to job.logfile where
        these are set.

        Every error is printed as traceback and is not propagated. The temporary files are
        closed in any case.

        @param request not used in this method
        @param job_id Job.id of job which will be executed here

        """

        fOut = fErr = None

        try:
            job_id = int(job_id)

            # connect to database
            dbconnection = hQDBConnection()

            # get job instance
            job = dbconnection.query(db.Job).get(job_id)

            command = job.command
            shell = job.shell

            # create temporary file object for stdout and stderr of executing command
            fOut = tempfile.NamedTemporaryFile(prefix="hq-es.", bufsize=0, delete=True)
            fErr = tempfile.NamedTemporaryFile(prefix="hq-es.", bufsize=0, delete=True)

            startTime = datetime.now()

            self.writeLog(
                "job ({j}) has been started at {t}".format(j=job_id, t=str(startTime)), logCategory="request_processing"
            )

            ###############################
            # execute job in a subprocess #
            ###############################
            # NOTE(review): the command string taken from the database is passed to a shell as is
            sp = subprocess.Popen(
                command, shell=True, cwd=os.path.expanduser("~"), executable=shell, stdout=fOut, stderr=fErr
            )

            # store info about running job in database

            # set job as running
            dbconnection.query(db.JobDetails.job_id).filter(db.JobDetails.job_id == job_id).update(
                {db.JobDetails.job_status_id: self.server.database_ids["running"]}
            )

            job.job_details.host_id = self.server.host_id
            job.job_details.pid = sp.pid

            # set history
            jobHistory = db.JobHistory(job=job, job_status_id=self.server.database_ids["running"])

            dbconnection.introduce(jobHistory)
            dbconnection.commit()
            dbconnection.remove()

            ###################################
            # wait until process has finished #
            sp.wait()
            ###################################

            endTime = datetime.now()

            self.writeLog(
                "job ({j}) has been finished at {t}".format(j=job_id, t=str(endTime)), logCategory="request_processing"
            )

            # connect to database
            dbconnection = hQDBConnection()

            # get job instance (again, since we use here another connection)
            job = dbconnection.query(db.Job).get(job_id)

            ##################################################
            # write command, stdout, and stderr to a files   #

            # copy temporary files for stdout and stderr
            for captured, target in ((fOut, job.stdout), (fErr, job.stderr)):
                if not target:
                    continue

                try:
                    shutil.copyfile(captured.name, target)
                except Exception:
                    # best effort: a failing copy must not prevent the job from being
                    # set to finished, but it should be visible
                    traceback.print_exc()

            if job.logfile:
                try:
                    self._write_job_logfile(os.path.expanduser(job.logfile), command, startTime, endTime, fOut, fErr)
                except Exception:
                    # best effort as well
                    traceback.print_exc()

            # set job as finished
            dbconnection.query(db.JobDetails.job_id).filter(db.JobDetails.job_id == job_id).update(
                {db.JobDetails.job_status_id: self.server.database_ids["finished"]}
            )

            job.job_details.return_code = sp.returncode

            # set history
            jobHistory = db.JobHistory(job=job, job_status_id=self.server.database_ids["finished"])

            dbconnection.introduce(jobHistory)

            finishedJob = db.FinishedJob(job=job)

            dbconnection.introduce(finishedJob)
            dbconnection.commit()
            dbconnection.remove()

        except Exception:
            # something went wrong.
            traceback.print_exc()

        finally:
            # closing removes the temporary files (delete=True)
            for tmpFile in (fOut, fErr):
                if tmpFile is not None:
                    tmpFile.close()

    def _write_job_logfile(self, path, command, startTime, endTime, fOut, fErr):
        """ ! @brief write command, run info and collected stdout/stderr of a job to path

        @param path path of the logfile (is overwritten)
        @param command command which has been executed
        @param startTime (datetime) time at which the job has been started
        @param endTime (datetime) time at which the job has finished
        @param fOut file object with the collected stdout (is rewound)
        @param fErr file object with the collected stderr (is rewound)
        """
        rule = "-----------------------\n"

        with open(path, "w") as f:
            f.write(rule + "--------command--------\n" + rule)

            f.write("%s\n" % command)
            f.write("\n")

            f.write(rule + "----------info---------\n" + rule)

            f.write("host: {0}\n".format(os.uname()[1]))
            f.write("started: %s\n" % (startTime))
            f.write("finished: %s\n" % (endTime))
            f.write("running time: %s\n" % (endTime - startTime))
            f.write("\n")

            f.write(rule + "------BEGIN stdout-----\n" + rule)

            fOut.seek(0)
            f.writelines(fOut)

            f.write(rule + "------END stdout-------\n" + rule)

            f.write("\n")

            f.write(rule + "------BEGIN stderr-----\n" + rule)

            fErr.seek(0)
            f.writelines(fErr)

            f.write(rule + "------END stderr-------\n" + rule)
Exemple #11
0
                        )
    
    parser.add_argument('-v', '--verbose-mode',
                        nargs = 0,
                        dest = 'verboseMode',
                        action = ValidateVerboseMode,
                        default = False,
                        help = 'Activate verbose mode.'
                        )
    
    args = parser.parse_args()

    logger.info( "Welcome to {p}!".format(p=PROGNAME) )

    if args.showReservations:
        con = hQDBConnection()

        reservations = con.query( db.Reservation ).all()

        for reservation in reservations:
            print_reservation( reservation, con )

        if not reservations:
            print "there are no reservations."
                    
    elif args.addReservation:
        #questions = [ { 'question': 'Number of slots? ',
        #                'answer': "" } ]
        #
        #for q in questions:
        #    q['answer']  = raw_input( q['question'] )
    def next( self, numJobs=1, excludedJobIDs=None, returnInstances=False, logFct=None ):
        """! @brief get next jobs which will be send to cluster

        Takes up to numJobs waiting jobs of enabled users, highest priorityValue first, and
        tries to find a vacant host for each of them. Jobs for which no host is found are
        left out.

        @param numJobs (int) maximal number of jobs which will be returned
        @param excludedJobIDs (set) set of jobIDs which should not be considered
        @param returnInstances (bool) if True return instances otherwise return ids
        @param logFct (callable) called as logFct( message, logCategory=... ); is stored in
               self.logFct. If not given nothing is logged.

        @return (list) of tuples (user, job, host) if returnInstances is True, otherwise of
                tuples (user id, job id, host id)

        @todo think about something more sophisticated than just taking the next in queue
        """

        dbconnection = hQDBConnection()

        # use a log function which does nothing if no logFct is given
        if logFct:
            self.logFct = logFct
        else:
            def dummyLog( *args, **kargs ):
                return

            self.logFct = dummyLog

        # list of tuples which is returned
        nextJobs = []
        # newly occupied slots grouped by host id
        newlyOccupiedSlots = defaultdict( int )

        # get next waiting jobs in queue
        self.logFct( "   get {n} waiting jobs ...".format( n=numJobs ),
                     logCategory="job_scheduler" )

        query = dbconnection.query( db.WaitingJob ).join( db.User )
        criteria = [ db.User.enabled==True ]

        if excludedJobIDs:
            # db.Job is only needed for filtering by job id
            query = query.join( db.Job )
            criteria.append( not_( db.Job.id.in_( excludedJobIDs ) ) )

        # same ordering (highest priority value first) with and without excluded jobs
        jobs = query.filter( and_( *criteria ) )\
               .order_by( db.WaitingJob.priorityValue.desc() )\
               .limit( numJobs )\
               .all()

        self.logFct( "   ... found {n} jobs".format(n=len(jobs)),
                     logCategory="job_scheduler" )

        # iterate over all jobs
        for idx,wJob in enumerate(jobs):
            # get owner of current job
            job = wJob.job
            user = job.user

            self.logFct( "     {idx}/{N} get vacant host for job {i} of user {u}".format( idx=idx+1,
                                                                                          N=len(jobs),
                                                                                          i=job.id,
                                                                                          u=user.name ),
                         logCategory="sendingjobs" )

            # get excluded hosts (stored as json); an empty entry means no exclusions
            excludedHosts = json.loads( job.excluded_hosts ) if job.excluded_hosts else []

            # get vacant host which has the required number of free slots. jobs which have been
            # processed here but have not been started are also considered
            vacantHost = self.get_vacant_host( job.slots,
                                               newlyOccupiedSlots,
                                               excludedHosts=set( excludedHosts ) )

            if not vacantHost:
                continue

            if returnInstances:
                nextJobs.append( (user, job, vacantHost) )
            else:
                nextJobs.append( (user.id, job.id, vacantHost.id) )

            newlyOccupiedSlots[ vacantHost.id ] += job.slots

        return nextJobs
    def get_vacant_host( self, slots, slotDict, excludedHosts=None ):
        """! @brief get vacant host which is not in excludedHosts and has at least slots unused slots

        Candidates are hosts whose host summary is flagged available, reachable and active
        and whose occupied slots plus slots do not exceed max_number_occupied_slots. They are
        tried in random order. A candidate is skipped if it has no load entry or if its latest
        1-minute load plus the slots already assigned to it (slotDict) plus slots exceeds
        1.10 * total_number_slots.

        @param slots (int) minimum number of free slots on vacant host
        @param slotDict (dict) newly occupied slots grouped by host id
        @param excludedHosts (set) set of full names of host which should be excluded. The set
               is not modified.

        @return (@c Host|None)
        """

        # self.logFct is assigned in next(); do not fail if this method is called on its own
        log = getattr( self, 'logFct', None ) or ( lambda *args, **kargs: None )

        dbconnection = hQDBConnection()

        log( "   find vacant host ...",
             logCategory="job_scheduler" )

        criteria = [ db.HostSummary.available==True,
                     db.HostSummary.reachable==True,
                     db.HostSummary.active==True,
                     db.Host.max_number_occupied_slots >= db.HostSummary.number_occupied_slots+slots ]

        if excludedHosts:
            criteria.append( not_( db.Host.full_name.in_( excludedHosts ) ) )

        candidates = dbconnection.query( db.Host )\
                     .join( db.HostSummary )\
                     .filter( and_( *criteria ) )\
                     .all()

        while candidates:
            # pick randomly a host from list; a rejected host is not considered again
            host = choice( candidates )
            candidates.remove( host )

            loads = host.host_load
            if not loads:
                # no load is given
                log( "   ... host {h} has no load in db. skip.".format(h=host.full_name),
                     logCategory="job_scheduler" )
                continue

            # get latest load
            hostLoad = max( loads, key=attrgetter( 'datetime' ) )

            expectedNewLoad = hostLoad.loadavg_1min + slotDict.get( host.id, 0 ) + slots
            if expectedNewLoad <= 1.10 * host.total_number_slots:
                log( "   ... {h} is vacant. load is {l}. ok.".format(h=host.full_name,l=hostLoad.loadavg_1min),
                     logCategory="job_scheduler" )
                return host

            # load is too high
            log( "   ... {h} is vacant. load is {l}. too high. skip".format(h=host.full_name,l=hostLoad.loadavg_1min),
                 logCategory="job_scheduler" )

        log( "   ... no vacant host found.",
             logCategory="sendingjobs" )

        return None
Exemple #14
0
    def _render_job_list( self, num, job_type ):
        """ ! @brief helper function for lswjobs, lspjobs, ...

        Renders the jobs of user self.server.user_id which currently have the status
        job_type, ordered by the time of the matching history entry (latest first).

        @param num maximal number of jobs: anything int() accepts, or 'all' for no limit.
               A false value (None, 0, "") means 10.
        @param job_type one of "waiting", "pending", "running" and "finished"

        @return (str) two header lines followed by one line per job, joined by newlines
        """

        if not num:
            # default
            num=10

        # connect to database
        dbconnection = hQDBConnection()

        statusID = self.server.database_ids[job_type]

        query = dbconnection.query( db.Job )\
                .join( db.JobDetails )\
                .join( db.JobHistory )\
                .filter( db.JobDetails.job_status_id==statusID )\
                .filter( db.JobHistory.job_status_id==statusID )\
                .filter( db.Job.user_id==self.server.user_id )\
                .order_by( db.JobHistory.datetime.desc() )

        if num!='all':
            query = query.limit( int(num) )

        jobs = query.all()

        # title and status phrase for each job type
        layouts = { "waiting": ( "Waiting jobs", "waiting since {t}" ),
                    "pending": ( "Pending jobs", "pending on {host} since {t}" ),
                    "running": ( "Running jobs", "running on {host} since {t}" ),
                    "finished": ( "Finished jobs", "finished since {t}" ) }

        title, statusPhrase = layouts[job_type]

        header = [ title, "------------" ]

        jobString = "{i:3d} - [jobid:{id}] [user:{user}] [status:" + statusPhrase + "] [group:{group}] [info:{info}] [command:{command}{dots}]"

        # the host is only part of the status phrase of pending and running jobs
        showHost = "{host}" in statusPhrase

        response = []
        for idx,job in enumerate(jobs):
            hostName = "None"
            if showHost:
                try:
                    hostName = job.job_details.host.short_name
                except AttributeError:
                    # no host assigned
                    pass

            response.append( jobString.format( i=idx,
                                               id=job.id,
                                               user=job.user.name,
                                               host=hostName,
                                               t=str(job.job_history[-1].datetime),
                                               group=job.group,
                                               info=job.info_text,
                                               command=job.command[:30],
                                               dots="..." if len(job.command)>30 else "" ) )

        return "\n".join( header + response )