def __init__(self, port):
    """! @brief initialize the exec server

    Sets up the base server, looks up this host's database id and the
    database id of the current user, and configures the periodic loops.

    @param port (int) port the server listens on
    """
    handler = hQExecServerHandler
    processor = hQExecServerRequestProcessor()
    self.user = USER

    super(hQExecServer, self).__init__(port, handler, processor)

    # connect to database
    dbconnection = hQDBConnection()

    # get database id of host; abort if this host is not registered in the cluster
    try:
        self.host_id = dbconnection.query(db.Host.id).filter(db.Host.full_name == self.host).one()[0]
    except Exception:
        sys.stderr.write("Host is not in cluster!")
        sys.exit(-1)

    # set interval for loop of calling loop functions
    self.loops = {
        "print_status": {
            "fct": self.print_status,
            "kwargs": {"short": True, "remove_connection": True},
            "interval": 5,
            "description": "print periodically status of server",
        }
    }

    # flags which indicate running processes
    self.printing_status = threading.Event()

    # unpack the scalar id from the result row (consistent with host_id above);
    # the original stored the whole row tuple, which breaks later comparisons
    # such as db.Job.user_id == self.user_id
    self.user_id = dbconnection.query(db.User.id).filter(db.User.name == self.user).one()[0]
def process_findjobs(self, request, match_str):
    """! @brief process 'findjobs' command

    Sends all jobs whose command, info text or group contains match_str
    (case-insensitive) back over the request connection.
    """
    # connect to database
    dbconnection = hQDBConnection()

    pattern = '%{s}%'.format(s=match_str)

    # search command, info text and group for the given substring
    matching = dbconnection.query(db.Job).filter(
        or_(
            db.Job.command.ilike(pattern),
            db.Job.info_text.ilike(pattern),
            db.Job.group.ilike(pattern),
        )
    ).all()

    if not matching:
        request.send("no jobs found")
        return

    lineFmt = "{i:3d} - [jobid:{id}] [user:{user}] [status:{status}] [group:{group}] [info:{info}] [command:{command}{dots}]"

    lines = ["Matching jobs", "-------------"]
    for idx, job in enumerate(matching):
        lines.append(
            lineFmt.format(
                i=idx,
                id=job.id,
                user=job.user.name,
                status=job.job_details.job_status.name,
                group=job.group,
                info=job.info_text,
                command=job.command[:30],
                dots="..." if len(job.command) > 30 else "",
            )
        )

    request.send('\n'.join(lines))
def init_database_ids(self):
    """Cache job-status database ids in the dictionary self.database_ids for faster access."""
    # establish database connection
    dbconnection = hQDBConnection()

    # map job status name -> database id
    statusPairs = dbconnection.query(db.JobStatus.name, db.JobStatus.id).all()
    self.database_ids = dict(statusPairs)
def get_status(self, remove_connection=True):
    """! @brief get status of server from database

    @param remove_connection (bool) if True remove the database connection
           afterwards; otherwise a later hQDBConnection() in the same thread
           returns the same connection, which does not see recent updates

    @return (dict) host/slot counts and per-status job counts
    """
    dbconnection = hQDBConnection()

    self.logger.write("print status: request database about status", logCategory="debug")

    # number of jobs of this user grouped by status name
    query = (
        dbconnection.query(db.JobStatus.name, func.count("*"))
        .join(db.JobDetails)
        .join(db.Job)
        .filter(db.Job.user_id == self.user_id)
        .group_by(db.JobStatus.name)
    )
    # empty dict if there are no jobs in the database yet
    counts = dict(query.all())

    self.logger.write("print status: get slot info", logCategory="debug")

    # number of active hosts, total slots and occupied slots
    slotInfo = (
        dbconnection.query(
            func.count("*"),
            func.sum(db.Host.max_number_occupied_slots),
            func.sum(db.HostSummary.number_occupied_slots),
        )
        .select_from(db.Host)
        .join(db.HostSummary, db.HostSummary.host_id == db.Host.id)
        .filter(db.HostSummary.active == True)
        .one()
    )

    if slotInfo[0] == 0:
        # no active host: the SQL sums are NULL, normalize to zeros
        slotInfo = (0, 0, 0)

    countsDict = {
        "hosts": slotInfo[0],
        "oSlots": slotInfo[2],
        "tSlots": slotInfo[1],
        "wJobs": counts.get("waiting", 0),
        "pJobs": counts.get("pending", 0),
        "rJobs": counts.get("running", 0),
        "fJobs": counts.get("finished", 0),
    }

    # NOTE: the original also removed the connection unconditionally right after
    # the slot query, which made remove_connection=False impossible and removed
    # the connection twice; remove it only here, as the parameter promises.
    if remove_connection:
        # connection has to be removed. otherwise calling hQBDSession returns (in the same thread)
        # the same connection which doesn't see recent updates
        dbconnection.remove()

    return countsDict
def process_invokeservers(self, request):
    """! @brief start a hq-exec-server on every available, reachable and active host."""
    # connect to database
    dbconnection = hQDBConnection()

    activeHosts = dbconnection.query(db.Host) \
        .join(db.HostSummary) \
        .filter(and_(db.HostSummary.available == True,
                     db.HostSummary.reachable == True,
                     db.HostSummary.active == True)) \
        .all()

    for currentHost in activeHosts:
        # invoke (or fetch the already running) exec server for this host
        execServer = self.server.get_exec_server(currentHost.id, currentHost.full_name)

        if not execServer:
            self.writeLog(
                "... could not start a hq-exec-server on {h}!\n".format(h=currentHost.full_name),
                logCategory='error')

    request.send("done")
def process_lsgroups(self, request):
    """! @brief process 'lsgroups' command

    Sends, for each job group of the current user, the number of jobs per
    status and the completion progress.
    """
    # connect to database
    dbconnection = hQDBConnection()

    groupNames = dbconnection.query( db.Job.group )\
                 .filter( db.Job.user_id==self.TMS.userID )\
                 .distinct()\
                 .all()

    response = ""
    for groupName, in groupNames:
        # get all number of jobs for each status type for user and group
        counts = dict( dbconnection.query( db.JobStatus.name, func.count('*') ).\
                       join( db.JobDetails, db.JobDetails.job_status_id==db.JobStatus.id ).\
                       join( db.Job, db.Job.id==db.JobDetails.job_id ).\
                       filter( and_(db.Job.user_id==self.TMS.userID, db.Job.group==groupName) ).\
                       group_by( db.JobStatus.name ).\
                       all() )

        finished = counts.get('finished', 0)
        # 'total' instead of the original 'all', which shadowed the builtin
        total = counts.get('waiting',0) + counts.get('pending',0) + counts.get('running',0) + finished

        # guard against division by zero; the original left 'progress' undefined
        # (NameError on the first empty group, stale value afterwards)
        progress = 1.0 * finished / total if total != 0 else 0.0

        response += "{s:>20} : {value}\n".format(s="group", value=groupName )
        response += "{s:>20} : {value}\n".format(s="waiting jobs", value=counts.get('waiting',0) )
        response += "{s:>20} : {value}\n".format(s="pending jobs", value=counts.get('pending',0) )
        response += "{s:>20} : {value}\n".format(s="running jobs", value=counts.get('running',0) )
        response += "{s:>20} : {value}\n".format(s="finished jobs", value=counts.get('finished',0) )
        response += "{s:>20} : {value:.2%}\n".format(s="progress", value=progress )
        response += "\n"

    if response:
        request.send( response )
    else:
        request.send( "no groups found" )
def process_lajob(self, request, job_id):
    """! @brief process 'lajob' command

    Sends a detailed listing (attributes, status history, host, pid,
    return code) for a single job.

    @param job_id id of the job to show
    """
    # connect to database
    dbconnection = hQDBConnection()

    job = dbconnection.query( db.Job ).get( int(job_id) )

    if job:
        response = ""
        response += "{s:>20} : {value}\n".format(s="job id", value=job.id )
        response += "{s:>20} : {value}\n".format(s="command", value=job.command )
        response += "{s:>20} : {value}\n".format(s="info text", value=job.info_text )
        response += "{s:>20} : {value}\n".format(s="group", value=job.group )
        response += "{s:>20} : {value}\n".format(s="stdout", value=job.stdout )
        response += "{s:>20} : {value}\n".format(s="stderr", value=job.stderr )
        response += "{s:>20} : {value}\n".format(s="logfile", value=job.logfile )
        response += "{s:>20} : {value}\n".format(s="excludedHosts", value=job.excluded_hosts )
        response += "{s:>20} : {value}\n".format(s="slots", value=job.slots )

        for idx, hist in enumerate(job.job_history):
            # label only the first history line
            s = "status" if idx == 0 else ""
            response += "{s:>20} : [{t}] {status}\n".format(s=s, t=str(hist.datetime), status=hist.job_status.name )

        # the job may not (yet) be assigned to a host
        try:
            response += "{s:>20} : {value}\n".format(s="host", value=job.job_details.host.short_name )
        except Exception:
            response += "{s:>20} : {value}\n".format(s="host", value="None" )

        response += "{s:>20} : {value}\n".format(s="pid", value=job.job_details.pid )
        response += "{s:>20} : {value}\n".format(s="return code", value=job.job_details.return_code )

        request.send( response )
    else:
        # fixed typo in the original message ("unkown job.")
        request.send("unknown job.")
def __init__(self, port):
    """! @brief initialize the user server

    Sets up the base server, registers this hq-user-server at the hq-server
    (aborting with a message if that fails or is not allowed) and configures
    the periodic loops.

    @param port (int) port the server listens on
    """
    handler = hQUserServerHandler
    processor = hQUserServerRequestProcessor()
    self.user = USER

    super( hQUserServer, self ).__init__( port, handler, processor )

    # connect to hq-server and register hq-user-server.
    # the not-allowed check lives OUTSIDE the try block: the original's bare
    # 'except:' also caught the SystemExit raised by sys.exit(-1) in the
    # not-allowed branch and printed both error messages.
    try:
        allowed = self.register_server()
    except Exception:
        sys.stderr.write( "hq server is not running\nPlease contact your hq administrator." )
        sys.exit( -1 )

    if not allowed:
        sys.stderr.write( "You are not allowed to use the hq package.\nPlease contact your hq administrator." )
        sys.exit( -1 )

    # connect to database
    dbconnection = hQDBConnection()

    # set interval for loop of calling loop functions
    self.loops = { 'print_status': { 'fct': self.print_status,
                                     'kwargs': {'short': True, 'remove_connection': True},
                                     'interval': 5,
                                     'description': "print periodically status of server" }
                   }

    # known exec servers by host id
    self.exec_servers = {}

    # flags which indicate running processes
    self.printing_status = threading.Event()
    self.not_invoking_exec_server = {}
def process_lss(self, request):
    """! @brief process 'lss' command

    Lists the known running exec servers (host, port, status); servers are
    looked up only, never started here.
    """
    # connect to database
    dbconnection = hQDBConnection()

    hostNames = dict(dbconnection.query(db.Host.id, db.Host.full_name).all())

    response = ""
    for idx, hostID in enumerate(hostNames):
        fullName = hostNames[hostID]

        # only look up a running exec server, do not invoke one
        execServer = self.server.get_exec_server(hostID, fullName, do_not_invoke=True)

        if execServer:
            response += "{idx} - [host:{host}] [port:{port}] [status:{status}]\n".format(
                idx=idx,
                host=fullName,
                port=execServer.port,
                status="running")

    if response:
        request.send(response)
    else:
        request.send("no servers known.")
def process_run(self, request, job_id):
    """! @brief process 'run' command: execute the given job on this host

    Runs the job's command in a subprocess, records the running/finished
    status plus history in the database, and writes the job's stdout,
    stderr and combined logfile if requested.

    @param job_id Job.id of job which will be executed here
    """
    try:
        job_id = int(job_id)

        # connect to database
        dbconnection = hQDBConnection()

        # get job instance
        job = dbconnection.query(db.Job).get(job_id)

        command = job.command
        shell = job.shell

        # temporary, unbuffered files for stdout and stderr of the executing command
        fOut = tempfile.NamedTemporaryFile(prefix="hq-es.", bufsize=0, delete=True)
        fErr = tempfile.NamedTemporaryFile(prefix="hq-es.", bufsize=0, delete=True)

        startTime = datetime.now()

        self.writeLog(
            "job ({j}) has been started at {t}".format(j=job_id, t=str(startTime)),
            logCategory="request_processing"
        )

        ###############################
        # execute job in a subprocess #
        ###############################
        sp = subprocess.Popen(
            command, shell=True, cwd=os.path.expanduser("~"), executable=shell, stdout=fOut, stderr=fErr
        )

        # store info about running job in database:
        # set job as running
        dbconnection.query(db.JobDetails.job_id).filter(db.JobDetails.job_id == job_id).update(
            {db.JobDetails.job_status_id: self.server.database_ids["running"]}
        )

        job.job_details.host_id = self.server.host_id
        job.job_details.pid = sp.pid

        # set history
        jobHistory = db.JobHistory(job=job, job_status_id=self.server.database_ids["running"])
        dbconnection.introduce(jobHistory)

        dbconnection.commit()
        # remove connection so the reconnect below sees recent updates
        dbconnection.remove()

        ###################################
        # wait until process has finished #
        ###################################
        sp.wait()

        endTime = datetime.now()

        self.writeLog(
            "job ({j}) has been finished at {t}".format(j=job_id, t=str(endTime)),
            logCategory="request_processing"
        )

        # connect to database again (the previous connection has been removed)
        dbconnection = hQDBConnection()

        # get job instance (again, since we use here another connection)
        job = dbconnection.query(db.Job).get(job_id)

        ################################################
        # write command, stdout, and stderr to files   #
        ################################################
        if job.stdout:
            # copy temporary file for stdout (best effort: log, do not fail the job)
            try:
                shutil.copyfile(fOut.name, job.stdout)
            except (IOError, OSError, shutil.Error):
                self.writeLog(
                    "could not write stdout of job ({j}) to {f}".format(j=job_id, f=job.stdout),
                    logCategory="error"
                )

        if job.stderr:
            # copy temporary file for stderr (best effort: log, do not fail the job)
            try:
                shutil.copyfile(fErr.name, job.stderr)
            except (IOError, OSError, shutil.Error):
                self.writeLog(
                    "could not write stderr of job ({j}) to {f}".format(j=job_id, f=job.stderr),
                    logCategory="error"
                )

        if job.logfile:
            try:
                # write a combined logfile with command, run info, stdout and stderr
                p = os.path.expanduser(job.logfile)
                with open(p, "w") as f:
                    f.write("-----------------------\n")
                    f.write("--------command--------\n")
                    f.write("-----------------------\n")
                    f.write("%s\n" % command)
                    f.write("\n")
                    f.write("-----------------------\n")
                    f.write("----------info---------\n")
                    f.write("-----------------------\n")
                    f.write("host: {0}\n".format(os.uname()[1]))
                    f.write("started: %s\n" % (startTime))
                    f.write("finished: %s\n" % (endTime))
                    f.write("running time: %s\n" % (endTime - startTime))
                    f.write("\n")
                    f.write("-----------------------\n")
                    f.write("------BEGIN stdout-----\n")
                    f.write("-----------------------\n")
                    fOut.seek(0)
                    for line in fOut:
                        f.write("%s" % line)
                    f.write("-----------------------\n")
                    f.write("------END stdout-------\n")
                    f.write("-----------------------\n")
                    f.write("\n")
                    f.write("-----------------------\n")
                    f.write("------BEGIN stderr-----\n")
                    f.write("-----------------------\n")
                    fErr.seek(0)
                    for line in fErr:
                        f.write("%s" % line)
                    f.write("-----------------------\n")
                    f.write("------END stderr-------\n")
                    f.write("-----------------------\n")
            except Exception:
                # error while opening or writing the logfile: report it, keep going
                traceback.print_exc()

        fOut.close()
        fErr.close()

        # set job as finished
        dbconnection.query(db.JobDetails.job_id).filter(db.JobDetails.job_id == job_id).update(
            {db.JobDetails.job_status_id: self.server.database_ids["finished"]}
        )

        job.job_details.return_code = sp.returncode

        # set history
        jobHistory = db.JobHistory(job=job, job_status_id=self.server.database_ids["finished"])
        dbconnection.introduce(jobHistory)

        finishedJob = db.FinishedJob(job=job)
        dbconnection.introduce(finishedJob)

        dbconnection.commit()
        dbconnection.remove()
    except Exception:
        # something went wrong; the original 'print traceback.print_exc()'
        # printed the traceback plus a spurious 'None' and swallowed everything
        traceback.print_exc()
) parser.add_argument('-v', '--verbose-mode', nargs = 0, dest = 'verboseMode', action = ValidateVerboseMode, default = False, help = 'Activate verbose mode.' ) args = parser.parse_args() logger.info( "Welcome to {p}!".format(p=PROGNAME) ) if args.showReservations: con = hQDBConnection() reservations = con.query( db.Reservation ).all() for reservation in reservations: print_reservation( reservation, con ) if not reservations: print "there are no reservations." elif args.addReservation: #questions = [ { 'question': 'Number of slots? ', # 'answer': "" } ] # #for q in questions: # q['answer'] = raw_input( q['question'] )
def next( self, numJobs=1, excludedJobIDs=None, returnInstances=False, logFct=None ):
    """! @brief get next jobs which will be send to cluster

    @param numJobs (int) maximal number of jobs which will be returned
    @param excludedJobIDs (set) set of jobIDs which should not be considered
    @param returnInstances (bool) if True return db.Job instances otherwise return job ids
    @param logFct (callable) optional logging function; a no-op is used if not given

    @return (list) tuples (user, job, vacantHost) as instances or as ids

    @todo think about something more sophisticated than just taking the next in queue
    """
    # use a fresh set instead of the original shared mutable default argument
    if excludedJobIDs is None:
        excludedJobIDs = set()

    dbconnection = hQDBConnection()

    # assign the given log function or a dummy no-op to self.logFct
    if logFct:
        self.logFct = logFct
    else:
        def dummyLog( *args, **kargs ):
            return
        self.logFct = dummyLog

    # list of tuples (<user>,<job>,<host>) (or the respective ids)
    nextJobs = []

    # newly occupied slots grouped by host id
    newlyOccupiedSlots = defaultdict( int )

    # get next waiting jobs in queue
    self.logFct( " get {n} waiting jobs ...".format( n=numJobs ), logCategory="job_scheduler" )

    if excludedJobIDs:
        # NOTE: the original ordered ascending in this branch only; use the same
        # descending priority order as the branch below
        jobs = dbconnection.query( db.WaitingJob )\
               .join( db.User )\
               .join( db.Job )\
               .filter( db.User.enabled==True )\
               .filter( not_(db.Job.id.in_(excludedJobIDs) ) )\
               .order_by( db.WaitingJob.priorityValue.desc() )\
               .limit( numJobs )\
               .all()
    else:
        jobs = dbconnection.query( db.WaitingJob )\
               .join( db.User )\
               .filter( db.User.enabled==True )\
               .order_by( db.WaitingJob.priorityValue.desc() )\
               .limit( numJobs )\
               .all()

    self.logFct( " ... found {n} jobs".format(n=len(jobs)), logCategory="job_scheduler" )

    # iterate over all jobs
    for idx,wJob in enumerate(jobs):
        # get owner of current job
        job = wJob.job
        user = job.user

        self.logFct( " {idx}/{N} get vacant host for job {i} of user {u}".format( idx=idx+1,
                                                                                  N=len(jobs),
                                                                                  i=job.id,
                                                                                  u=user.name ),
                     logCategory="sendingjobs" )

        # get excluded hosts of this job (stored as a JSON list)
        excludedHosts = json.loads( job.excluded_hosts )

        # get vacant host which has the required number of free slots. jobs which have been
        # processed here but have not been started are also considered
        vacantHost = self.get_vacant_host( job.slots,
                                           newlyOccupiedSlots,
                                           excludedHosts=set( excludedHosts ) )

        if vacantHost:
            if returnInstances:
                nextJobs.append( (user, job, vacantHost) )
            else:
                nextJobs.append( (user.id, job.id, vacantHost.id) )

            newlyOccupiedSlots[ vacantHost.id ] += job.slots

    return nextJobs
def get_vacant_host( self, slots, slotDict, excludedHosts=None ):
    """! @brief get vacant host which is not in excludedHosts and has at least slots unused slots

    @param slots (int) minimum number of free slots on vacant host
    @param slotDict (dict) newly occupied slots grouped by host
    @param excludedHosts (set) set of full names of host which should be excluded

    @return (@c Host|None)
    """
    # the original used a mutable default (set([])) AND mutated it below before
    # recursing, so excluded hosts leaked across unrelated calls; use None and
    # create a fresh set per call instead
    if excludedHosts is None:
        excludedHosts = set()

    dbconnection = hQDBConnection()

    self.logFct( " find vacant host ...", logCategory="job_scheduler" )

    # candidate hosts: available, reachable, active, with enough free slots
    if excludedHosts:
        hosts = dbconnection.query( db.Host )\
                .join( db.HostSummary )\
                .filter( and_(db.HostSummary.available==True,
                              db.HostSummary.reachable==True,
                              db.HostSummary.active==True,
                              not_(db.Host.full_name.in_( excludedHosts ) ),
                              db.Host.max_number_occupied_slots >= db.HostSummary.number_occupied_slots+slots ) )\
                .all()
    else:
        hosts = dbconnection.query( db.Host )\
                .join( db.HostSummary )\
                .filter( and_(db.HostSummary.available==True,
                              db.HostSummary.reachable==True,
                              db.HostSummary.active==True,
                              db.Host.max_number_occupied_slots >= db.HostSummary.number_occupied_slots+slots ) )\
                .all()

    if not hosts:
        self.logFct( " ... no vacant host found.", logCategory="sendingjobs" )
        return None

    # pick randomly a host from list
    host = choice( hosts )

    # get latest load
    try:
        hostLoad = sorted( host.host_load, key=attrgetter( 'datetime' ) )[-1]
    except IndexError:
        # no load is given for this host: exclude it and try another one
        self.logFct( " ... host {h} has no load in db. skip.".format(h=host.full_name),
                     logCategory="job_scheduler" )
        excludedHosts.add( host.full_name )
        return self.get_vacant_host( slots, slotDict, excludedHosts=excludedHosts )

    # estimate the load after putting the job on this host
    expectedNewLoad = hostLoad.loadavg_1min + slotDict[ host.id ] + slots

    if expectedNewLoad <= 1.10 * host.total_number_slots:
        self.logFct( " ... {h} is vacant. load is {l}. ok.".format(h=host.full_name, l=hostLoad.loadavg_1min),
                     logCategory="job_scheduler" )
        return host
    else:
        # load is too high: exclude host and try another one
        self.logFct( " ... {h} is vacant. load is {l}. too high. skip".format(h=host.full_name, l=hostLoad.loadavg_1min),
                     logCategory="job_scheduler" )
        excludedHosts.add( host.full_name )
        return self.get_vacant_host( slots, slotDict, excludedHosts=excludedHosts )
def _render_job_list( self, num, job_type ):
    """! @brief helper function for lswjobs, lspjobs, ...

    @param num number of jobs to list; 'all' for no limit; falsy means default 10
    @param job_type one of 'waiting', 'pending', 'running', 'finished'

    @return (str) rendered job list
    """
    if not num:
        # default
        num = 10

    # connect to database
    dbconnection = hQDBConnection()

    query = dbconnection.query( db.Job )\
            .join( db.JobDetails )\
            .join( db.JobHistory )\
            .filter( db.JobDetails.job_status_id==self.server.database_ids[job_type] )\
            .filter( db.JobHistory.job_status_id==self.server.database_ids[job_type] )\
            .filter( db.Job.user_id==self.server.user_id )\
            .order_by( db.JobHistory.datetime.desc() )

    if num!='all':
        query = query.limit( int(num) )

    jobs = query.all()

    header = []
    response = []
    if job_type=="waiting":
        header.append( "Waiting jobs" )
        header.append( "------------" )
        jobString = "{i:3d} - [jobid:{id}] [user:{user}] [status:waiting since {t}] [group:{group}] [info:{info}] [command:{command}{dots}]"
    elif job_type=="pending":
        header.append( "Pending jobs" )
        header.append( "------------" )
        # NOTE: the original format string had a stray trailing '\n' here,
        # producing blank lines between entries after the join below
        jobString = "{i:3d} - [jobid:{id}] [user:{user}] [status:pending on {host} since {t}] [group:{group}] [info:{info}] [command:{command}{dots}]"
    elif job_type=="running":
        header.append( "Running jobs" )
        header.append( "------------" )
        jobString = "{i:3d} - [jobid:{id}] [user:{user}] [status:running on {host} since {t}] [group:{group}] [info:{info}] [command:{command}{dots}]"
    elif job_type=="finished":
        header.append( "Finished jobs" )
        header.append( "------------" )
        jobString = "{i:3d} - [jobid:{id}] [user:{user}] [status:finished since {t}] [group:{group}] [info:{info}] [command:{command}{dots}]"

    for idx,job in enumerate(jobs):
        # the pending/running format strings reference {host}, which the original
        # never supplied to format() -> KeyError; the host may also be unset
        try:
            hostName = job.job_details.host.short_name
        except Exception:
            hostName = "None"

        # extra keyword arguments are ignored by format strings without {host}
        response.append( jobString.format( i=idx,
                                           id=job.id,
                                           user=job.user.name,
                                           host=hostName,
                                           t=str(job.job_history[-1].datetime),
                                           group=job.group,
                                           info=job.info_text,
                                           command=job.command[:30],
                                           dots="..." if len(job.command)>30 else "" ) )

    return "\n".join( header + response )