def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs

    ``qstat`` is invoked once for all non-empty job IDs and its standard
    output is parsed line by line.  A line is attributed to a job when the
    first column, cut at the first '.', equals the job number (the requested
    job ID cut at the first '.').  The fifth column is read as the Torque
    state letter:

      * 'E', 'C'                -> 'Done'
      * 'R'                     -> 'Running'
      * 'S', 'W', 'Q', 'H', 'T' -> 'Waiting'

    Jobs that are not reported, or that are reported on a line containing
    the word 'Unknown', get the status 'Unknown'.

    :param jobIDList: list of job ID strings; empty entries are ignored
    :return: S_OK( { jobID : status } ), or the unsuccessful systemCall
             result unchanged
    """
    # job number (text before the first '.') -> full job ID as given by the caller
    jobDict = {}
    for job in jobIDList:
        if not job:
            continue
        jobDict[job.split('.')[0]] = job

    if not jobDict:
        # Nothing to query: do not run a bare "qstat"
        return S_OK({})

    # Empty IDs are excluded from the map above, so keep them out of the command too
    cmd = ['qstat'] + [job for job in jobIDList if job]
    result = systemCall(10, cmd)
    if not result['OK']:
        return result

    torqueStates = {'E': 'Done', 'C': 'Done',
                    'R': 'Running',
                    'S': 'Waiting', 'W': 'Waiting', 'Q': 'Waiting',
                    'H': 'Waiting', 'T': 'Waiting'}

    resultDict = dict.fromkeys(jobDict.values(), 'Unknown')
    output = result['Value'][1].replace('\r', '')
    for line in output.split('\n'):
        fields = line.split()
        if not fields:
            continue

        if 'Unknown' in line:
            # Presumably a message such as "qstat: Unknown Job Id <id>", where the
            # job ID is not in the first column: look for it in every token.
            for token in fields:
                jobNumber = token.split('.')[0]
                if jobNumber in jobDict:
                    resultDict[jobDict[jobNumber]] = 'Unknown'
            continue

        # Exact comparison of the job number: a substring search would let job
        # "123" pick up the line of job "1234".
        jobNumber = fields[0].split('.')[0]
        if jobNumber not in jobDict or len(fields) < 5:
            continue
        status = torqueStates.get(fields[4])
        if status:
            resultDict[jobDict[jobNumber]] = status

    return S_OK(resultDict)
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs

    ``qstat`` is invoked once, with every non-empty job ID passed as its own
    command line argument, and its standard output is parsed line by line.
    A line is attributed to a job when the first column, cut at the first
    ".", equals the job number (the requested job ID cut at the first ".").
    The fifth column is read as the Torque state letter:

      * "E", "C"                -> "Done"
      * "R"                     -> "Running"
      * "S", "W", "Q", "H", "T" -> "Waiting"

    Jobs that are not reported, or that are reported on a line containing
    the word "Unknown", get the status "Unknown".

    :param jobIDList: list of job ID strings; empty entries are ignored
    :return: S_OK( { jobID : status } ), or the unsuccessful systemCall
             result unchanged
    """
    # job number (text before the first ".") -> full job ID as given by the caller.
    # Empty IDs are skipped: an empty job number would match every output line.
    jobDict = {}
    for job in jobIDList:
        if job:
            jobDict[job.split(".")[0]] = job

    if not jobDict:
        # Nothing to query: do not run qstat at all
        return S_OK({})

    # One argument per job ID.  Joining the IDs with spaces into a single list
    # element would hand qstat one bogus argument "id1 id2 ...".
    cmd = ["qstat"] + [job for job in jobIDList if job]
    result = systemCall(10, cmd)
    if not result["OK"]:
        return result

    torqueStates = {
        "E": "Done",
        "C": "Done",
        "R": "Running",
        "S": "Waiting",
        "W": "Waiting",
        "Q": "Waiting",
        "H": "Waiting",
        "T": "Waiting",
    }

    resultDict = dict.fromkeys(jobDict.values(), "Unknown")
    output = result["Value"][1].replace("\r", "")
    for line in output.split("\n"):
        fields = line.split()
        if not fields:
            continue

        if "Unknown" in line:
            # Presumably a message such as "qstat: Unknown Job Id <id>", where the
            # job ID is not in the first column: look for it in every token.
            for token in fields:
                jobNumber = token.split(".")[0]
                if jobNumber in jobDict:
                    resultDict[jobDict[jobNumber]] = "Unknown"
            continue

        # Exact comparison of the job number: a substring search would let job
        # "123" pick up the line of job "1234".
        jobNumber = fields[0].split(".")[0]
        if jobNumber not in jobDict or len(fields) < 5:
            continue
        status = torqueStates.get(fields[4])
        if status:
            resultDict[jobDict[jobNumber]] = status

    return S_OK(resultDict)
def getDynamicInfo(self):
    """ Method to return information on running and pending jobs.

    Runs ``qstat -Q <self.execQueue>`` and extracts, from the output row that
    starts with ``self.queue``, the 5th and 6th captured columns as the number
    of waiting and running jobs respectively.

    :return: S_OK structure extended with the keys "SubmittedJobs",
             "WaitingJobs" and "RunningJobs"; the unsuccessful systemCall
             result if the call itself failed; S_ERROR if qstat exited with a
             non-zero status or its output could not be parsed
    """
    result = S_OK()
    result["SubmittedJobs"] = self.submittedJobs

    cmd = ["qstat", "-Q", self.execQueue]
    ret = systemCall(10, cmd)
    if not ret["OK"]:
        self.log.error("Timeout", ret["Message"])
        return ret

    status = ret["Value"][0]
    stdout = ret["Value"][1]
    stderr = ret["Value"][2]
    self.log.debug("status:", status)
    self.log.debug("stdout:", stdout)
    self.log.debug("stderr:", stderr)
    if status:
        self.log.error("Failed qstat execution:", stderr)
        return S_ERROR(stderr)

    # The queue name is matched literally (re.escape); the raw string keeps the
    # regex escapes intact.
    matched = re.search(
        re.escape(self.queue)
        + r"\D+(\d+)\D+(\d+)\W+(\w+)\W+(\w+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\W+(\w+)",
        stdout,
    )
    # re.search returns None when the queue row is absent or has an unexpected
    # layout; report that instead of failing on an attribute access.
    if not matched:
        return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)

    try:
        waitingJobs = int(matched.group(5))
        runningJobs = int(matched.group(6))
    except ValueError:
        return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)

    result["WaitingJobs"] = waitingJobs
    result["RunningJobs"] = runningJobs
    self.log.verbose("Waiting Jobs: ", waitingJobs)
    self.log.verbose("Running Jobs: ", runningJobs)
    return result
def executeGridCommand(proxy, cmd, gridEnvScript=None):
    """ Execute cmd tuple after sourcing GridEnv

    :param proxy: the proxy to run the command with:
                  - a false value: the path reported by getProxyInfo() is used
                  - a string: taken as the path of an existing proxy file
                  - anything else: dumped to a file with
                    gProxyManager.dumpProxyToFile()
    :param cmd: command handed to systemCall
    :param gridEnvScript: environment script (optionally followed by its
                          arguments, whitespace separated); when not given,
                          Local.gridEnv() supplies the default
    :return: the systemCall result, or an error structure if the environment
             could not be sourced or the proxy could not be resolved
    """
    # Work on a copy so that the environment of this process is never modified
    currentEnv = dict(os.environ)

    if not gridEnvScript:
        # if not passed as argument, use default from CS Helpers
        gridEnvScript = Local.gridEnv()

    if gridEnvScript:
        command = gridEnvScript.split()
        ret = sourceEnv(10, command)
        if not ret['OK']:
            return S_ERROR('Failed sourcing GridEnv: %s' % ret['Message'])
        gridEnv = ret['outputEnv']
        # Preserve some current settings if they are there
        for varName in ('X509_VOMS_DIR', 'X509_CERT_DIR'):
            if varName in currentEnv:
                gridEnv[varName] = currentEnv[varName]
    else:
        gridEnv = currentEnv

    if not proxy:
        res = getProxyInfo()
        if not res['OK']:
            return res
        gridEnv['X509_USER_PROXY'] = res['Value']['path']
    elif isinstance(proxy, types.StringTypes):
        if not os.path.exists(proxy):
            return S_ERROR('Can not treat proxy passed as a string')
        gridEnv['X509_USER_PROXY'] = proxy
    else:
        ret = gProxyManager.dumpProxyToFile(proxy)
        if not ret['OK']:
            return ret
        gridEnv['X509_USER_PROXY'] = ret['Value']

    return systemCall(120, cmd, env=gridEnv)
def rescheduleFailedJob(jobID, message):
    """ Report a failure of the job wrapper, reschedule the job and send an alarm mail.

    The application status is set to 'Failed <message> ' and the job status to
    'Rescheduled'; both are flushed before the reschedule request is sent to
    the JobManager service.  Afterwards a mail with the site and worker node
    names is sent to DIRAC.alarmMail.

    Any exception raised on the way is logged and swallowed: this function
    never raises and always returns None.

    :param jobID: job identifier, convertible with int()
    :param message: description of the failure, used in the statuses and the mail
    """
    global jobReport
    try:
        import DIRAC

        gLogger.warn('Failure during %s' % (message))

        # Setting a job parameter does not help since the job will be rescheduled,
        # instead set the status with the cause and then another status showing the
        # reschedule operation.
        if not jobReport:
            gLogger.info('Creating a new JobReport Object')
            jobReport = JobReport(int(jobID), 'JobWrapperTemplate')

        jobReport.setApplicationStatus('Failed %s ' % message, sendFlag=False)
        jobReport.setJobStatus('Rescheduled', message, sendFlag=False)

        # We must send Job States and Parameters before it gets reschedule
        jobReport.sendStoredStatusInfo()
        jobReport.sendStoredJobParameters()

        gLogger.info('Job will be rescheduled after exception during execution of the JobWrapper')

        jobManager = RPCClient('WorkloadManagement/JobManager')
        result = jobManager.rescheduleJob(int(jobID))
        if not result['OK']:
            gLogger.warn(result)

        # Send mail to debug errors
        mailAddress = DIRAC.alarmMail
        site = DIRAC.siteName()
        subject = 'Job rescheduled at %s' % site
        ret = systemCall(0, 'hostname')
        # A failed hostname lookup must not prevent the alarm mail from being sent
        wn = ret['Value'][1] if ret['OK'] else 'unknown'
        msg = 'Job %s rescheduled at %s, wn=%s\n' % (jobID, site, wn)
        msg += message
        NotificationClient().sendMail(mailAddress, subject, msg,
                                      fromAddress="*****@*****.**", localAttempt=False)
    except Exception:
        # Best effort: this runs while handling another failure, so only log
        gLogger.exception('JobWrapperTemplate failed to reschedule Job')
def executeGridCommand(proxy, cmd, gridEnvScript=None):
    """ Execute cmd tuple after sourcing GridEnv

    :param proxy: the proxy to run the command with:
                  - a false value: the path reported by getProxyInfo() is used
                  - a string: taken as the path of an existing proxy file
                  - anything else: dumped to a file with
                    gProxyManager.dumpProxyToFile(); that file is removed
                    again once the command has been executed
    :param cmd: command handed to systemCall
    :param gridEnvScript: environment script to source; when not given the
                          command runs in a copy of the current environment
    :return: the systemCall result, or an error structure if the environment
             could not be sourced or the proxy could not be resolved
    """
    # Work on a copy so that the environment of this process is never modified
    currentEnv = dict(os.environ)

    if gridEnvScript:
        ret = sourceEnv(10, [gridEnvScript])
        if not ret['OK']:
            return S_ERROR('Failed sourcing GridEnv: %s' % ret['Message'])
        gridEnv = ret['outputEnv']
        # Preserve some current settings if they are there
        for varName in ('X509_VOMS_DIR', 'X509_CERT_DIR'):
            if varName in currentEnv:
                gridEnv[varName] = currentEnv[varName]
    else:
        gridEnv = currentEnv

    dumpedProxy = False
    if not proxy:
        res = getProxyInfo()
        if not res['OK']:
            return res
        gridEnv['X509_USER_PROXY'] = res['Value']['path']
    elif isinstance(proxy, types.StringTypes):
        if not os.path.exists(proxy):
            return S_ERROR('Can not treat proxy passed as a string')
        gridEnv['X509_USER_PROXY'] = proxy
    else:
        ret = gProxyManager.dumpProxyToFile(proxy)
        if not ret['OK']:
            return ret
        gridEnv['X509_USER_PROXY'] = ret['Value']
        dumpedProxy = True

    try:
        return systemCall(120, cmd, env=gridEnv)
    finally:
        if dumpedProxy:
            # If proxy is stored in a file, delete it, gProxyManager fails to reuse it.
            # Once the gProxyManager is fixed, this file removal can be dropped
            try:
                os.unlink(gridEnv['X509_USER_PROXY'])
            except OSError:
                # Cleanup is best effort: a file that is already gone must not
                # hide the outcome of the command
                pass
def executeGridCommand(proxy, cmd, gridEnvScript=None):
    """ Execute cmd tuple after sourcing GridEnv

    :param proxy: the proxy to run the command with:
                  - a false value: the path reported by getProxyInfo() is used
                  - a string: taken as the path of an existing proxy file
                  - anything else: dumped to a file with
                    gProxyManager.dumpProxyToFile()
    :param cmd: command handed to systemCall
    :param gridEnvScript: environment script to source; when not given the
                          command runs in a copy of the current environment
    :return: the systemCall result, or an error structure if the environment
             could not be sourced or the proxy could not be resolved
    """
    # Work on a copy so that the environment of this process is never modified
    currentEnv = dict(os.environ)

    if gridEnvScript:
        ret = sourceEnv(10, [gridEnvScript])
        if not ret["OK"]:
            return S_ERROR("Failed sourcing GridEnv: %s" % ret["Message"])
        gridEnv = ret["outputEnv"]
        # Preserve some current settings if they are there
        for varName in ("X509_VOMS_DIR", "X509_CERT_DIR"):
            if varName in currentEnv:
                gridEnv[varName] = currentEnv[varName]
    else:
        gridEnv = currentEnv

    if not proxy:
        res = getProxyInfo()
        if not res["OK"]:
            return res
        gridEnv["X509_USER_PROXY"] = res["Value"]["path"]
    elif isinstance(proxy, types.StringTypes):
        if not os.path.exists(proxy):
            return S_ERROR("Can not treat proxy passed as a string")
        gridEnv["X509_USER_PROXY"] = proxy
    else:
        ret = gProxyManager.dumpProxyToFile(proxy)
        if not ret["OK"]:
            return ret
        gridEnv["X509_USER_PROXY"] = ret["Value"]

    result = systemCall(120, cmd, env=gridEnv)
    return result
def executeGridCommand(proxy, cmd, gridEnvScript=None):
    """ Execute cmd tuple after sourcing GridEnv

    :param proxy: the proxy to run the command with:
                  - a false value: the path reported by getProxyInfo() is used
                  - a string: taken as the path of an existing proxy file
                  - anything else: dumped to a file with
                    gProxyManager.dumpProxyToFile()
    :param cmd: command handed to systemCall
    :param gridEnvScript: environment script to source; when not given the
                          command runs in a copy of the current environment
    :return: the systemCall result, or an error structure if the environment
             could not be sourced or the proxy could not be resolved
    """
    # Work on a copy so that the environment of this process is never modified
    currentEnv = dict(os.environ)

    if gridEnvScript:
        ret = sourceEnv(10, [gridEnvScript])
        if not ret['OK']:
            return S_ERROR('Failed sourcing GridEnv: %s' % ret['Message'])
        gridEnv = ret['outputEnv']
        # Preserve some current settings if they are there
        for varName in ('X509_VOMS_DIR', 'X509_CERT_DIR'):
            if varName in currentEnv:
                gridEnv[varName] = currentEnv[varName]
    else:
        gridEnv = currentEnv

    if not proxy:
        res = getProxyInfo()
        if not res['OK']:
            return res
        proxyLocation = res['Value']['path']
    elif isinstance(proxy, types.StringTypes):
        if not os.path.exists(proxy):
            return S_ERROR('Can not treat proxy passed as a string')
        proxyLocation = proxy
    else:
        ret = gProxyManager.dumpProxyToFile(proxy)
        if not ret['OK']:
            return ret
        proxyLocation = ret['Value']

    gridEnv['X509_USER_PROXY'] = proxyLocation
    return systemCall(120, cmd, env=gridEnv)
def export_updateSoftware(self, version, rootPath="", gridVersion=""):
    """ Update the local DIRAC software installation to version

    Steps, in order: validate the /LocalInstallation configuration section,
    decide whether the Oracle client is wanted, build and run the
    ``dirac-install`` command, repair the MySQL startup script when a MySQL
    installation is present and finally install the Oracle client if wanted.

    :param version: release passed to ``dirac-install -r``
    :param rootPath: optional path passed with ``-P``; it must exist
    :param gridVersion: optional value passed with ``-g``
    :return: S_OK() on success, otherwise an error structure
    """
    # A missing or empty /LocalInstallation section means a broken installation
    localSection = gConfig.getOptionsDict('/LocalInstallation')
    if not localSection['OK']:
        return S_ERROR('Invalid installation - missing /LocalInstallation section in the configuration')
    if not localSection['Value']:
        return S_ERROR('Invalid installation - empty /LocalInstallation section in the configuration')

    if rootPath and not os.path.exists(rootPath):
        return S_ERROR('Path "%s" does not exists' % rootPath)

    # Oracle client (LHCb): explicit flag wins, otherwise probe for cx_Oracle
    oracleSetting = gConfig.getValue('/LocalInstallation/InstallOracleClient', 'unknown').lower()
    if oracleSetting in ('yes', 'true', '1'):
        wantOracle = True
    elif oracleSetting == "unknown":
        probe = systemCall(30, ['python', '-c', 'import cx_Oracle'])
        wantOracle = bool(probe['OK'] and probe['Value'][0] == 0)
    else:
        wantOracle = False

    installCmd = ['dirac-install', '-r', version, '-t', 'server']
    if rootPath:
        installCmd += ['-P', rootPath]

    # Extensions from the CS, plus the web portal when enabled
    extensions = getCSExtensions()
    if gConfig.getValue('/LocalInstallation/WebPortal', False):
        extensions.append('Web')
    if extensions:
        installCmd.extend(['-e', ','.join(extensions)])

    # Grid middleware bindings, when requested
    if gridVersion:
        installCmd += ['-g', gridVersion]

    # The local dirac.cfg is looked up under TargetPath, falling back to RootPath
    fallbackPath = gConfig.getValue('/LocalInstallation/RootPath', '')
    targetPath = gConfig.getValue('/LocalInstallation/TargetPath', fallbackPath)
    if not targetPath:
        return S_ERROR('Local configuration not found')
    localCfgFile = targetPath + '/etc/dirac.cfg'
    if not os.path.exists(localCfgFile):
        return S_ERROR('Local configuration not found')
    installCmd.append(localCfgFile)

    outcome = systemCall(240, installCmd)
    if not outcome['OK']:
        return outcome
    if outcome['Value'][0] != 0:
        # Report the stdout lines mentioning an error, if there are any
        errorLines = [text.strip() for text in outcome['Value'][1].split('\n')
                      if 'error' in text.lower()]
        if errorLines:
            return S_ERROR('\n'.join(errorLines))
        return S_ERROR("Failed to update software to %s" % version)

    # With a MySQL installation present, fix the first startup script found
    if os.path.exists(InstallTools.mysqlDir):
        scriptTail = ('mysql', 'share', 'mysql', 'mysql.server')
        for prefix in ((), ('pro',)):
            startupScript = os.path.join(InstallTools.instancePath, *(prefix + scriptTail))
            if os.path.exists(startupScript):
                InstallTools.fixMySQLScripts(startupScript)
                break

    # Oracle client (LHCb)
    if wantOracle:
        oracleOutcome = systemCall(30, 'install_oracle-client.sh')
        if not oracleOutcome['OK']:
            return oracleOutcome
        if oracleOutcome['Value'][0] != 0:
            report = oracleOutcome['Value'][1].split('\n')
            report += oracleOutcome['Value'][2].split('\n')
            report.append('Failed to install Oracle client module')
            return S_ERROR('\n'.join(report))

    return S_OK()
def export_updateSoftware(self, version, rootPath="", gridVersion=""):
    """ Update the local DIRAC software installation to version

    Steps, in order: validate the /LocalInstallation configuration section,
    decide whether the Oracle client is wanted, build and run the
    ``dirac-install`` command, repair the MySQL startup script when a MySQL
    installation is present and finally install the Oracle client if wanted.

    :param version: release passed to ``dirac-install -r``
    :param rootPath: optional path passed with ``-P``; it must exist
    :param gridVersion: optional value passed with ``-g``
    :return: S_OK() on success, otherwise an error structure
    """
    # A missing or empty /LocalInstallation section means a broken installation
    localSection = gConfig.getOptionsDict('/LocalInstallation')
    if not localSection['OK']:
        return S_ERROR('Invalid installation - missing /LocalInstallation section in the configuration')
    if not localSection['Value']:
        return S_ERROR('Invalid installation - empty /LocalInstallation section in the configuration')

    if rootPath and not os.path.exists(rootPath):
        return S_ERROR('Path "%s" does not exists' % rootPath)

    # Oracle client (LHCb): explicit flag wins, otherwise probe for cx_Oracle
    oracleSetting = gConfig.getValue('/LocalInstallation/InstallOracleClient', 'unknown').lower()
    if oracleSetting in ('yes', 'true', '1'):
        wantOracle = True
    elif oracleSetting == "unknown":
        probe = systemCall(0, ['python', '-c', 'import cx_Oracle'])
        wantOracle = bool(probe['OK'] and probe['Value'][0] == 0)
    else:
        wantOracle = False

    installCmd = ['dirac-install', '-r', version, '-t', 'server']
    if rootPath:
        installCmd += ['-P', rootPath]

    # Extensions from the CS, plus the web portal when enabled
    extensions = getCSExtensions()
    if gConfig.getValue('/LocalInstallation/WebPortal', False):
        extensions.append('Web')
    if extensions:
        installCmd.extend(['-e', ','.join(extensions)])

    # Grid middleware bindings, when requested
    if gridVersion:
        installCmd += ['-g', gridVersion]

    # The local dirac.cfg is looked up under TargetPath, falling back to RootPath
    fallbackPath = gConfig.getValue('/LocalInstallation/RootPath', '')
    targetPath = gConfig.getValue('/LocalInstallation/TargetPath', fallbackPath)
    if not targetPath:
        return S_ERROR('Local configuration not found')
    localCfgFile = targetPath + '/etc/dirac.cfg'
    if not os.path.exists(localCfgFile):
        return S_ERROR('Local configuration not found')
    installCmd.append(localCfgFile)

    outcome = systemCall(0, installCmd)
    if not outcome['OK']:
        return outcome
    if outcome['Value'][0] != 0:
        # Report the stdout lines mentioning an error, if there are any
        errorLines = [text.strip() for text in outcome['Value'][1].split('\n')
                      if 'error' in text.lower()]
        if errorLines:
            return S_ERROR('\n'.join(errorLines))
        return S_ERROR("Failed to update software to %s" % version)

    # With a MySQL installation present, fix the first startup script found
    if os.path.exists(InstallTools.mysqlDir):
        scriptTail = ('mysql', 'share', 'mysql', 'mysql.server')
        for prefix in ((), ('pro',)):
            startupScript = os.path.join(InstallTools.instancePath, *(prefix + scriptTail))
            if os.path.exists(startupScript):
                InstallTools.fixMySQLScripts(startupScript)
                break

    # Oracle client (LHCb)
    if wantOracle:
        oracleOutcome = systemCall(0, 'install_oracle-client.sh')
        if not oracleOutcome['OK']:
            return oracleOutcome
        if oracleOutcome['Value'][0] != 0:
            report = oracleOutcome['Value'][1].split('\n')
            report += oracleOutcome['Value'][2].split('\n')
            report.append('Failed to install Oracle client module')
            return S_ERROR('\n'.join(report))

    return S_OK()
def export_updateSoftware(self, version, rootPath="", gridVersion="2009-08-13"):
    """ Update the local DIRAC software installation to version

    Steps, in order: validate the /LocalInstallation configuration section,
    decide whether the Oracle client is wanted, build and run the
    ``dirac-install`` command, repair the MySQL startup script when a MySQL
    installation is present and finally install the Oracle client if wanted.

    :param version: release passed to ``dirac-install -r``
    :param rootPath: optional path passed with ``-P``; it must exist
    :param gridVersion: value passed with ``-g`` (skipped when empty)
    :return: S_OK() on success, otherwise an error structure
    """
    # A missing or empty /LocalInstallation section means a broken installation
    localSection = gConfig.getOptionsDict("/LocalInstallation")
    if not localSection["OK"]:
        return S_ERROR("Invalid installation - missing /LocalInstallation section in the configuration")
    if not localSection["Value"]:
        return S_ERROR("Invalid installation - empty /LocalInstallation section in the configuration")

    if rootPath and not os.path.exists(rootPath):
        return S_ERROR('Path "%s" does not exists' % rootPath)

    # Oracle client (LHCb): explicit flag wins, otherwise probe for cx_Oracle
    oracleSetting = gConfig.getValue("/LocalInstallation/InstallOracleClient", "unknown").lower()
    if oracleSetting in ("yes", "true", "1"):
        wantOracle = True
    elif oracleSetting == "unknown":
        probe = systemCall(0, ["python", "-c", "import cx_Oracle"])
        wantOracle = bool(probe["OK"] and probe["Value"][0] == 0)
    else:
        wantOracle = False

    installCmd = ["dirac-install", "-r", version, "-t", "server"]
    if rootPath:
        installCmd += ["-P", rootPath]

    # Extensions from the CS, plus the web portal when enabled
    extensions = getCSExtensions()
    if gConfig.getValue("/LocalInstallation/WebPortal", False):
        extensions.append("Web")
    if extensions:
        installCmd.extend(["-e", ",".join(extensions)])

    # Grid middleware bindings, when requested
    if gridVersion:
        installCmd += ["-g", gridVersion]

    # The local dirac.cfg is looked up under TargetPath, falling back to RootPath
    fallbackPath = gConfig.getValue("/LocalInstallation/RootPath", "")
    targetPath = gConfig.getValue("/LocalInstallation/TargetPath", fallbackPath)
    if not targetPath:
        return S_ERROR("Local configuration not found")
    localCfgFile = targetPath + "/etc/dirac.cfg"
    if not os.path.exists(localCfgFile):
        return S_ERROR("Local configuration not found")
    installCmd.append(localCfgFile)

    outcome = systemCall(0, installCmd)
    if not outcome["OK"]:
        return outcome
    if outcome["Value"][0] != 0:
        # Report the stdout lines mentioning an error, if there are any
        errorLines = [
            text.strip() for text in outcome["Value"][1].split("\n") if "error" in text.lower()
        ]
        if errorLines:
            return S_ERROR("\n".join(errorLines))
        return S_ERROR("Failed to update software to %s" % version)

    # With a MySQL installation present, fix the first startup script found
    if os.path.exists(InstallTools.mysqlDir):
        scriptTail = ("mysql", "share", "mysql", "mysql.server")
        for prefix in ((), ("pro",)):
            startupScript = os.path.join(InstallTools.instancePath, *(prefix + scriptTail))
            if os.path.exists(startupScript):
                InstallTools.fixMySQLScripts(startupScript)
                break

    # Oracle client (LHCb)
    if wantOracle:
        oracleOutcome = systemCall(0, "install_oracle-client.sh")
        if not oracleOutcome["OK"]:
            return oracleOutcome
        if oracleOutcome["Value"][0] != 0:
            report = oracleOutcome["Value"][1].split("\n")
            report += oracleOutcome["Value"][2].split("\n")
            report.append("Failed to install Oracle client module")
            return S_ERROR("\n".join(report))

    return S_OK()
def getCEStatus(self):
    """ Method to return information on running and pending jobs.

    With ``self.userName`` set, ``qstat -u <userName> <execQueue>`` is run and
    every output line containing the user name is counted: running when its
    second to last column is 'R', waiting otherwise.  Without a user name,
    ``qstat -Q <execQueue>`` is run and the 5th and 6th captured columns of
    the row starting with ``self.queue`` are taken as the waiting and running
    job counts.

    :return: S_OK structure extended with the keys 'SubmittedJobs',
             'WaitingJobs' and 'RunningJobs'; the unsuccessful systemCall
             result if the call itself failed; S_ERROR if qstat exited with a
             non-zero status or its output could not be parsed
    """
    result = S_OK()
    result['SubmittedJobs'] = self.submittedJobs

    cmd = ["qstat", "-Q", self.execQueue]
    if self.userName:
        cmd = ["qstat", "-u", self.userName, self.execQueue]

    ret = systemCall(10, cmd)
    if not ret['OK']:
        self.log.error('Timeout', ret['Message'])
        return ret

    status = ret['Value'][0]
    stdout = ret['Value'][1]
    stderr = ret['Value'][2]
    self.log.debug("status:", status)
    self.log.debug("stdout:", stdout)
    self.log.debug("stderr:", stderr)
    if status:
        self.log.error('Failed qstat execution:', stderr)
        return S_ERROR(stderr)

    if self.userName:
        # Parse qstat -u userName queueName
        runningJobs = 0
        waitingJobs = 0
        for line in stdout.replace('\r', '').split('\n'):
            if self.userName not in line:
                continue
            # Split on any run of whitespace: splitting on single blanks makes
            # the column position depend on the padding of the line.
            fields = line.split()
            if len(fields) < 2:
                # Too short to carry a status column
                continue
            if fields[-2] == 'R':
                runningJobs += 1
            else:
                # every other status to assimilate to Waiting
                waitingJobs += 1
    else:
        # parse qstat -Q queueName
        # The queue name is matched literally (re.escape); the raw string keeps
        # the regex escapes intact.
        matched = re.search(
            re.escape(self.queue)
            + r"\D+(\d+)\D+(\d+)\W+(\w+)\W+(\w+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\W+(\w+)",
            stdout)
        # re.search returns None when the queue row is absent or has an
        # unexpected layout; report that instead of failing on an attribute access.
        if not matched:
            return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)
        try:
            waitingJobs = int(matched.group(5))
            runningJobs = int(matched.group(6))
        except ValueError:
            return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)

    result['WaitingJobs'] = waitingJobs
    result['RunningJobs'] = runningJobs
    self.log.verbose('Waiting Jobs: ', waitingJobs)
    self.log.verbose('Running Jobs: ', runningJobs)
    return result
def getCEStatus(self):
    """ Method to return information on running and pending jobs.

    With ``self.userName`` set, ``qstat -u <userName> <execQueue>`` is run and
    every output line containing the user name is counted: running when its
    second to last column is 'R', waiting otherwise.  Without a user name,
    ``qstat -Q <execQueue>`` is run and the 5th and 6th captured columns of
    the row starting with ``self.queue`` are taken as the waiting and running
    job counts.

    :return: S_OK structure extended with the keys 'SubmittedJobs',
             'WaitingJobs' and 'RunningJobs'; the unsuccessful systemCall
             result if the call itself failed; S_ERROR if qstat exited with a
             non-zero status or its output could not be parsed
    """
    result = S_OK()
    result['SubmittedJobs'] = self.submittedJobs

    if self.userName:
        cmd = ["qstat", "-u", self.userName, self.execQueue]
    else:
        cmd = ["qstat", "-Q", self.execQueue]

    ret = systemCall(10, cmd)
    if not ret['OK']:
        self.log.error('Timeout', ret['Message'])
        return ret

    status = ret['Value'][0]
    stdout = ret['Value'][1]
    stderr = ret['Value'][2]
    self.log.debug("status:", status)
    self.log.debug("stdout:", stdout)
    self.log.debug("stderr:", stderr)
    if status:
        self.log.error('Failed qstat execution:', stderr)
        return S_ERROR(stderr)

    if self.userName:
        # Parse qstat -u userName queueName
        runningJobs = 0
        waitingJobs = 0
        for line in stdout.replace('\r', '').split('\n'):
            if self.userName not in line:
                continue
            # Split on any run of whitespace: splitting on single blanks makes
            # the column position depend on the padding of the line.
            fields = line.split()
            if len(fields) < 2:
                # Too short to carry a status column
                continue
            if fields[-2] == 'R':
                runningJobs += 1
            else:
                # every other status to assimilate to Waiting
                waitingJobs += 1
    else:
        # parse qstat -Q queueName
        # The queue name is matched literally (re.escape); the raw string keeps
        # the regex escapes intact.
        matched = re.search(
            re.escape(self.queue)
            + r"\D+(\d+)\D+(\d+)\W+(\w+)\W+(\w+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\D+(\d+)\W+(\w+)",
            stdout)
        # re.search returns None when the queue row is absent or has an
        # unexpected layout; report that instead of failing on an attribute access.
        if not matched:
            return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)
        try:
            waitingJobs = int(matched.group(5))
            runningJobs = int(matched.group(6))
        except ValueError:
            return S_ERROR("Error retrieving information from qstat:" + stdout + stderr)

    result['WaitingJobs'] = waitingJobs
    result['RunningJobs'] = runningJobs
    self.log.verbose('Waiting Jobs: ', waitingJobs)
    self.log.verbose('Running Jobs: ', runningJobs)
    return result
""" __RCSID__ = "$Id$" from DIRAC import gLogger, gConfig, S_OK, S_ERROR from DIRAC.Core.Base.AgentModule import AgentModule from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB from DIRAC.AccountingSystem.Client.Types.WMSHistory import WMSHistory from DIRAC.AccountingSystem.Client.DataStoreClient import DataStoreClient from DIRAC.Core.Utilities import Time import json try: import pika except Exception, e: from DIRAC import systemCall result = systemCall( False, ["pip", "install", "pika"] ) if not result['OK']: raise RuntimeError( result['Message'] ) else: import pika class StatesMonitoringAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart