def web_getTaskJobList(self):
    taskClient = TaskClient()
    taskID = int(self.request.arguments["TaskID"][0])
    result = yield self.threadTask(taskClient.getTaskJobs, taskID)
    if not result["OK"]:
        self.finish({"success": "false", "result": '', "error": result["Message"]})
        return
    jobIDs = result['Value']
    RPC = RPCClient("WorkloadManagement/JobMonitoring")
    result = RPC.getJobsStatus(jobIDs)
    if not result["OK"]:
        self.finish({"success": "false", "result": '', "error": result["Message"]})
        return
    jobStatuses = result['Value']
    jobList = []
    for jobID in jobIDs:
        if jobID in jobStatuses:
            jobList.append([jobID, jobStatuses[jobID]['Status']])
        else:
            jobList.append([jobID, 'Deleted'])
    callback = {"success": "true", "result": jobList}
    self.finish(callback)
def deleteJob(self, jobID):
    """ Delete job(s) from the WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    jobManager = RPCClient("WorkloadManagement/JobManager",
                           useCertificates=False,
                           timeout=self.timeout)
    result = jobManager.deleteJob(jobID)
    return result
class FileCatalogProxyClient:
    """ File catalog client for the File Catalog proxy service """

    def __init__(self, fcName, **kwargs):
        """ Constructor of the FileCatalogProxyClient class """
        self.method = None
        self.fcName = fcName
        self.rpc = RPCClient('DataManagement/FileCatalogProxy', timeout=120)
        self.valid = False
        self.valid = self.rpc.ping()['OK']

    def isOK(self):
        """ Is the Catalog available? """
        return self.valid

    def getName(self):
        """ Get the file catalog name """
        return self.fcName

    def __getattr__(self, name):
        self.method = name
        return self.execute

    def execute(self, *parms, **kws):
        """ Magic method dispatcher """
        return self.rpc.callProxyMethod(self.fcName, self.method, parms, kws)
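# A minimal usage sketch for the proxy client above, assuming a configured DIRAC
# installation and a reachable DataManagement/FileCatalogProxy service. The
# attribute lookup ('listDirectory') is intercepted by __getattr__, which stores
# the method name and returns execute(), so any catalog method name is forwarded
# to callProxyMethod on the service side. The path is hypothetical. Note that
# because the method name is kept in self.method, interleaving two different
# catalog calls on the same instance is not safe.
fc = FileCatalogProxyClient('FileCatalog')
if fc.isOK():
    res = fc.listDirectory('/some/lfn/dir')
    if res['OK']:
        print res['Value']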
def listDirectory(self, path):
    """ List the contents of the directory """
    res = checkArgumentFormat(path)
    if not res['OK']:
        return res
    urls = res['Value']
    successful = {}
    failed = {}
    gLogger.debug("DIPStorage.listDirectory: Attempting to list %s directories." % len(urls))
    serviceClient = RPCClient(self.url)
    for url in urls:
        res = serviceClient.listDirectory(url, 'l')
        if not res['OK']:
            failed[url] = res['Message']
        else:
            files = {}
            subDirs = {}
            for subPath, pathDict in res['Value'].items():
                if pathDict['Type'] == 'File':
                    files[subPath] = pathDict
                elif pathDict['Type'] == 'Directory':
                    subDirs[subPath] = pathDict
            successful[url] = {}
            successful[url]['SubDirs'] = subDirs
            successful[url]['Files'] = files
    resDict = {'Failed': failed, 'Successful': successful}
    return S_OK(resDict)
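# A sketch of the return-value convention used by the DIPStorage methods in this
# section: a bulk call succeeds at the S_OK level and reports per-URL outcomes
# under 'Successful' and 'Failed'. 'storage' is an assumed DIPStorage instance
# and the path is illustrative.
result = storage.listDirectory('/vo/some/dir')
if result['OK']:
    for url, contents in result['Value']['Successful'].items():
        print url, len(contents['Files']), 'files,', len(contents['SubDirs']), 'subdirectories'
    for url, reason in result['Value']['Failed'].items():
        print 'Failed to list %s: %s' % (url, reason)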
def web_requestListFiles(self):
    self.log.debug(self.request.arguments)
    cache = []
    if self.request.arguments.get("reqid", None):
        reqid = int(self.request.arguments["reqid"][0])
        RPC = RPCClient("Transfer/TransferRequest")
        cond = {"trans_req_id": reqid}
        res = RPC.show(cond)
        if res["OK"]:
            cache = res["Value"]
        # self.log.always(cache)
        # Each entry of 'cache' looks like:
        # (1L,
        #  '/path/does/not/exist',
        #  1L,                                          # datasetid
        #  datetime.datetime(2013, 8, 23, 3, 12, 37),
        #  datetime.datetime(2013, 8, 23, 3, 14, 37),
        #  'finish',
        #  'error')
    data = []
    for vv in cache:
        data.append({
            "id": vv[0],
            "LFN": vv[1],
            "starttime": vv[3].strftime("%Y-%m-%d %H:%M [UTC]") if vv[3] else "",
            "finishtime": vv[4].strftime("%Y-%m-%d %H:%M [UTC]") if vv[4] else "",
            "status": vv[5],
            "error": vv[6],
        })
    self.write({"result": data})
def __getUniqueKeyValues(self, typeName):
    sessionData = SessionData().getData()
    userGroup = sessionData["user"]["group"]
    if 'NormalUser' in CS.getPropertiesForGroup(userGroup):
        cacheKey = (sessionData["user"]["username"], userGroup, sessionData["setup"], typeName)
    else:
        cacheKey = (userGroup, sessionData["setup"], typeName)
    data = AccountingPlotHandler.__keysCache.get(cacheKey)
    if not data:
        rpcClient = RPCClient("Accounting/ReportGenerator")
        retVal = rpcClient.listUniqueKeyValues(typeName)
        if 'rpcStub' in retVal:
            del retVal['rpcStub']
        if not retVal['OK']:
            return retVal
        # Site ordering based on MoU tier level, then alphabetical
        if 'Site' in retVal['Value']:
            siteLevel = {}
            for siteName in retVal['Value']['Site']:
                sitePrefix = siteName.split(".")[0].strip()
                level = gConfig.getValue("/Resources/Sites/%s/%s/MoUTierLevel" % (sitePrefix, siteName), 10)
                if level not in siteLevel:
                    siteLevel[level] = []
                siteLevel[level].append(siteName)
            orderedSites = []
            for level in sorted(siteLevel):
                orderedSites.extend(sorted(siteLevel[level]))
            retVal['Value']['Site'] = orderedSites
        data = retVal
        AccountingPlotHandler.__keysCache.add(cacheKey, 300, data)
    return data
def isDirectory(self, path):
    """ Determine whether the path is a directory """
    res = checkArgumentFormat(path)
    if not res['OK']:
        return res
    urls = res['Value']
    successful = {}
    failed = {}
    gLogger.debug("DIPStorage.isDirectory: Attempting to determine whether %s paths are directories." % len(urls))
    serviceClient = RPCClient(self.url)
    for url in urls:
        res = serviceClient.getMetadata(url)
        if res['OK']:
            if res['Value']['Exists']:
                if res['Value']['Type'] == 'Directory':
                    gLogger.debug("DIPStorage.isDirectory: Successfully obtained metadata for %s." % url)
                    successful[url] = True
                else:
                    successful[url] = False
            else:
                failed[url] = 'Path does not exist'
        else:
            gLogger.error("DIPStorage.isDirectory: Failed to get metadata for url",
                          "%s: %s" % (url, res['Message']))
            failed[url] = res['Message']
    resDict = {'Failed': failed, 'Successful': successful}
    return S_OK(resDict)
def __useOldPolicyRes(self, name, policyName):
    """ Use the RSS Service to get an old policy result.
        If such a result is older than 2 hours, it returns {'Status': 'Unknown'}
    """
    from DIRAC.Core.DISET.RPCClient import RPCClient
    rsS = RPCClient("ResourceStatus/ResourceManagement")

    res = rsS.getPolicyRes(name, policyName, True)
    if not res['OK']:
        raise RSSException, where(self, self.__useOldPolicyRes) + ' Could not get a policy result'
    res = res['Value']

    if res == []:
        return {'Status': 'Unknown'}

    oldStatus = res[0]
    oldReason = res[1]
    lastCheckTime = res[2]

    if (lastCheckTime + datetime.timedelta(hours=2)) < datetime.datetime.utcnow():
        return {'Status': 'Unknown'}

    result = {}
    result['Status'] = oldStatus
    result['Reason'] = oldReason
    result['OLD'] = True
    result['PolicyName'] = policyName
    return result
def doCommand(self, CEs=None):
    """ Returns failed pilots using the DIRAC accounting system for every CE
        for the last self.args[0] hours

        :params:
          :attr:`CEs`: list of CEs (when not given, take every CE)

        :returns:
    """
    if CEs is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        RPC_RSS = RPCClient("ResourceStatus/ResourceStatus")
        CEs = RPC_RSS.getCEsList()
        if not CEs['OK']:
            raise RSSException, where(self, self.doCommand) + " " + CEs['Message']
        else:
            CEs = CEs['Value']

    if self.RPC is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        self.RPC = RPCClient("Accounting/ReportGenerator", timeout=self.timeout)

    if self.client is None:
        from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
        self.client = ReportsClient(rpcClient=self.RPC)

    fromD = datetime.datetime.utcnow() - datetime.timedelta(hours=self.args[0])
    toD = datetime.datetime.utcnow()

    try:
        failed_pilots = self.client.getReport('Pilot', 'NumberOfPilots', fromD, toD,
                                              {'GridStatus': ['Aborted'], 'GridCE': CEs},
                                              'GridCE')
        if not failed_pilots['OK']:
            raise RSSException, where(self, self.doCommand) + " " + failed_pilots['Message']
        else:
            failed_pilots = failed_pilots['Value']
    except:
        gLogger.exception("Exception when calling FailedPilotsByCESplitted_Command")
        return {}

    listOfCEs = failed_pilots['data'].keys()
    plotGran = failed_pilots['granularity']
    singlePlots = {}

    for CE in listOfCEs:
        if CE in CEs:
            plot = {}
            plot['data'] = {CE: failed_pilots['data'][CE]}
            plot['granularity'] = plotGran
            singlePlots[CE] = plot

    resToReturn = {'Pilot': singlePlots}
    return resToReturn
def doCommand(self, sites=None):
    """ Returns running and recently run jobs, querying the WMSHistory
        for the last self.args[0] hours

        :params:
          :attr:`sites`: list of sites (when not given, take every site)

        :returns:
    """
    if sites is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        RPC_RSS = RPCClient("ResourceStatus/ResourceStatus")
        sites = RPC_RSS.getSitesList()
        if not sites['OK']:
            raise RSSException, where(self, self.doCommand) + " " + sites['Message']
        else:
            sites = sites['Value']

    if self.RPC is None:
        from DIRAC.Core.DISET.RPCClient import RPCClient
        self.RPC = RPCClient("Accounting/ReportGenerator", timeout=self.timeout)

    if self.client is None:
        from DIRAC.AccountingSystem.Client.ReportsClient import ReportsClient
        self.client = ReportsClient(rpcClient=self.RPC)

    fromD = datetime.datetime.utcnow() - datetime.timedelta(hours=self.args[0])
    toD = datetime.datetime.utcnow()

    try:
        run_jobs = self.client.getReport('WMSHistory', 'NumberOfJobs', fromD, toD,
                                         {}, 'Site')
        if not run_jobs['OK']:
            raise RSSException, where(self, self.doCommand) + " " + run_jobs['Message']
        else:
            run_jobs = run_jobs['Value']
    except:
        gLogger.exception("Exception when calling RunningJobsBySiteSplitted_Command")
        return {}

    listOfSites = run_jobs['data'].keys()
    plotGran = run_jobs['granularity']
    singlePlots = {}

    for site in listOfSites:
        if site in sites:
            plot = {}
            plot['data'] = {site: run_jobs['data'][site]}
            plot['granularity'] = plotGran
            singlePlots[site] = plot

    resToReturn = {'WMSHistory': singlePlots}
    return resToReturn
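# A sketch of the structure the two "splitted" commands above return, assuming
# two sites were reported. The site names, timestamps, counts, and granularity
# value are illustrative; the per-entity 'data' payload is whatever getReport
# returned for that entity.
exampleResToReturn = {
    'WMSHistory': {
        'LCG.SiteA.ch': {'granularity': 3600,
                         'data': {'LCG.SiteA.ch': {'2013-08-23 03:00': 120}}},
        'LCG.SiteB.uk': {'granularity': 3600,
                         'data': {'LCG.SiteB.uk': {'2013-08-23 03:00': 45}}},
    }
}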
def do_regenerateBuckets(self, args):
    """ Regenerate buckets for type. Can take a while.
        Usage : regenerateBuckets <typeName>
        <DIRACRoot>/DIRAC/AccountingSystem/Client/Types/<typeName> should exist and inherit the base type
    """
    try:
        argList = args.split()
        if argList:
            typeName = argList[0].strip()
        else:
            gLogger.error("No type name specified")
            return
        # Try to import the type
        try:
            typeModule = __import__("DIRAC.AccountingSystem.Client.Types.%s" % typeName,
                                    globals(), locals(), typeName)
            typeClass = getattr(typeModule, typeName)
        except Exception, e:
            gLogger.error("Can't load type %s: %s" % (typeName, str(e)))
            return
        gLogger.info("Loaded type %s" % typeClass.__name__)
        typeDef = typeClass().getDefinition()
        acClient = RPCClient("Accounting/DataStore")
        retVal = acClient.regenerateBuckets(typeDef[0])
        if retVal['OK']:
            gLogger.info("Buckets recalculated!")
        else:
            gLogger.error("Error: %s" % retVal['Message'])
    except:
        # The outer try lacked an except clause in the original; closing it the
        # same way as the sibling commands below
        self.showTraceback()
def do_deleteType(self, args):
    """ Delete a registered accounting type.
        Usage : deleteType <typeName>
        WARN! It will delete all data associated to that type! VERY DANGEROUS!
        If you screw it, you'll discover a new dimension of pain and doom! :)
    """
    try:
        argList = args.split()
        if argList:
            typeName = argList[0].strip()
        else:
            gLogger.error("No type name specified")
            return
        while True:
            choice = raw_input("Are you completely sure you want to delete type %s and all its data? yes/no [no]: " % typeName)
            choice = choice.lower()
            if choice in ("yes", "y"):
                break
            else:
                print "Delete aborted"
                return
        acClient = RPCClient("Accounting/DataStore")
        retVal = acClient.deleteType(typeName)
        if not retVal['OK']:
            gLogger.error("Error: %s" % retVal['Message'])
            return
        print "Hope you meant it, because it's done"
    except:
        self.showTraceback()
def __rescheduleFailedJob(self, jobID, message, stop=True):
    """ Set the job status to "Rescheduled" and issue a reschedule command
        to the Job Manager
    """
    self.log.warn('Failure during %s' % message)

    jobManager = RPCClient('WorkloadManagement/JobManager')
    jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

    # Setting a job parameter does not help since the job will be rescheduled;
    # instead set the status with the cause and then another status showing the
    # reschedule operation.
    jobReport.setJobStatus(status='Rescheduled', application=message, sendFlag=True)

    self.log.info('Job will be rescheduled')
    result = jobManager.rescheduleJob(jobID)
    if not result['OK']:
        self.log.error(result['Message'])
        return self.__finish('Problem Rescheduling Job', stop)

    self.log.info('Job Rescheduled %s' % jobID)
    return self.__finish('Job Rescheduled', stop)
def status(job_ids, statusmapping, pipe_out=True):
    '''Check the statuses and return the Ganga status of a job after mapping
    its DIRAC status onto a Ganga one'''

    # Translate between the many statuses in DIRAC and the few in Ganga
    # return {'OK':True, 'Value':[['WIP', 'WIP', 'WIP', 'WIP', 'WIP']]}
    result = dirac.status(job_ids)
    if not result['OK']:
        return result
    status_list = []
    bulk_status = result['Value']
    for _id in job_ids:
        job_status = bulk_status.get(_id, {})
        minor_status = job_status.get('MinorStatus', None)
        dirac_status = job_status.get('Status', None)
        dirac_site = job_status.get('Site', None)
        ganga_status = statusmapping.get(dirac_status, None)
        if ganga_status is None:
            ganga_status = 'failed'
            dirac_status = 'Unknown: No status for Job'
        # if dirac_status == 'Completed' and (minor_status not in ['Pending Requests']):
        #     ganga_status = 'running'
        if minor_status in ['Uploading Output Data']:
            ganga_status = 'running'

        try:
            from DIRAC.Core.DISET.RPCClient import RPCClient
            monitoring = RPCClient('WorkloadManagement/JobMonitoring')
            app_status = monitoring.getJobAttributes(_id)['Value']['ApplicationStatus']
        except:
            app_status = "unknown ApplicationStatus"

        status_list.append([minor_status, dirac_status, dirac_site,
                            ganga_status, app_status])

    return status_list
def getMessages(self, showFields=[], conds={}, beginDate=None, endDate=None,
                startRecord=0, maxRecords=100):
    """ Query the database for all messages satisfying 'conds' that were
        generated between beginDate and endDate
    """
    loggingQuery = RPCClient("Framework/SystemLoggingReport", timeout=10)
    conds["beginDate"] = beginDate
    conds["endDate"] = endDate
    for key in showFields:
        if key not in conds:
            conds[key] = None
    result = loggingQuery.getMessages(conds, [], startRecord, maxRecords)
    if not result["OK"]:
        return result
    if showFields:
        columns = result["Value"]["ParameterNames"]
        fieldIndex = []
        for field in showFields:
            fieldIndex.append(columns.index(field))
        retRecords = []
        for record in result["Value"]["Records"]:
            retRecords.append([record[index] for index in fieldIndex])
        result["Value"]["ParameterNames"] = showFields
        result["Value"]["Records"] = retRecords
    return result
def sendStoredStatusInfo(self):
    """ Send the job status information stored in the internal cache """
    statusDict = {}
    for status, minor, dtime in self.jobStatusInfo:
        statusDict[dtime] = {'Status': status,
                             'MinorStatus': minor,
                             'ApplicationStatus': '',
                             'Source': self.source}
    for appStatus, dtime in self.appStatusInfo:
        statusDict[dtime] = {'Status': '',
                             'MinorStatus': '',
                             'ApplicationStatus': appStatus,
                             'Source': self.source}

    if statusDict:
        jobMonitor = RPCClient('WorkloadManagement/JobStateUpdate', timeout=60)
        result = jobMonitor.setJobStatusBulk(self.jobID, statusDict)
        if result['OK']:
            # Empty the internal status containers
            self.jobStatusInfo = []
            self.appStatusInfo = []
        return result
    else:
        return S_OK('Empty')
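# A sketch of the payload sendStoredStatusInfo() sends via setJobStatusBulk,
# assuming two cached updates. Keys are the timestamps at which each update was
# recorded; the statuses and the 'JobWrapper' source are illustrative, not
# taken from the snippet above.
exampleStatusDict = {
    '2013-08-23 03:12:37': {'Status': 'Running', 'MinorStatus': 'Application',
                            'ApplicationStatus': '', 'Source': 'JobWrapper'},
    '2013-08-23 03:14:37': {'Status': '', 'MinorStatus': '',
                            'ApplicationStatus': 'Done', 'Source': 'JobWrapper'},
}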
def do_resetBucketLength(self, args):
    """ Set the bucket length. Will trigger a recalculation of buckets. Can take a while.
        Usage : resetBucketLength <typeName>
        <DIRACRoot>/DIRAC/AccountingSystem/Client/Types/<typeName> should exist and inherit the base type
    """
    try:
        argList = args.split()
        if argList:
            typeName = argList[0].strip()
        else:
            gLogger.error("No type name specified")
            return
        # Try to import the type
        try:
            typeModule = __import__("DIRAC.AccountingSystem.Client.Types.%s" % typeName,
                                    globals(), locals(), typeName)
            typeClass = getattr(typeModule, typeName)
        except Exception as e:
            gLogger.error("Can't load type %s: %s" % (typeName, str(e)))
            return
        gLogger.info("Loaded type %s" % typeClass.__name__)
        typeDef = typeClass().getDefinition()
        acClient = RPCClient("Accounting/DataStore")
        retVal = acClient.setBucketsLength(typeDef[0], typeDef[3])
        if retVal['OK']:
            gLogger.info("Bucket length reset!")
        else:
            gLogger.error("Error: %s" % retVal['Message'])
    except BaseException:
        self.showTraceback()
def getMessagesByDate(self, beginDate=None, endDate=None, startRecord=0, maxRecords=100):
    """ Query the database for all the messages between two given dates.
        If no date is provided, the records returned are those generated
        during the last 24 hours.
    """
    loggingQuery = RPCClient("Framework/SystemLoggingReport", timeout=10)
    return loggingQuery.getMessages({"beginDate": beginDate, "endDate": endDate},
                                    [], startRecord, maxRecords)
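# A minimal usage sketch, assuming this method lives on an instantiated
# system-logging client object called 'logClient' and that a
# Framework/SystemLoggingReport service is reachable. The 'Records' and
# 'ParameterNames' keys follow the structure used by getMessages() above.
res = logClient.getMessagesByDate(beginDate='2013-08-22', endDate='2013-08-23')
if res['OK']:
    print res['Value']['ParameterNames']
    for record in res['Value']['Records']:
        print record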
def addSiteInMask(self, site, comment, printOutput=False):
    """Adds the site to the site mask.

       Example usage:

         >>> print diracAdmin.addSiteInMask()
         {'OK': True, 'Value': }

       @return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    mask = self.getSiteMask()
    if not mask['OK']:
        return mask
    siteMask = mask['Value']
    if site in siteMask:
        return S_ERROR('Site %s already in mask of allowed sites' % site)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator', timeout=120)
    result = wmsAdmin.allowSite(site, comment)
    if not result['OK']:
        return result

    if printOutput:
        print 'Allowing %s in site mask' % site

    return result
def banSiteFromMask(self, site, comment, printOutput=False):
    """Removes the site from the site mask.

       Example usage:

         >>> print diracAdmin.banSiteFromMask()
         {'OK': True, 'Value': }

       @return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    mask = self.getSiteMask()
    if not mask['OK']:
        return mask
    siteMask = mask['Value']
    if site not in siteMask:
        return S_ERROR('Site %s is already banned' % site)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator', timeout=120)
    result = wmsAdmin.banSite(site, comment)
    if not result['OK']:
        return result

    if printOutput:
        print 'Removing %s from site mask' % site

    return result
def resetJob(self, jobID):
    """ Reset job(s) in the WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    jobManager = RPCClient('WorkloadManagement/JobManager',
                           useCertificates=False,
                           timeout=self.timeout)
    result = jobManager.resetJob(jobID)
    return result
def main():
    if len(sys.argv) < 2:
        print "At least one parameter (user group, e.g. dune_user) expected, got %s !" \
            % (len(sys.argv) - 1)
        print "Usage: jobs_by_vo.py <user group> -or- jobs_by_vo.py <user group> <site>"
        print "Example: ./jobs_by_vo.py dune_user LCG.UKI-LT2-IC-HEP.uk"
        print "Only available to dirac_admin."
        sys.exit(1)

    print '*** %s ***' % str(sys.argv[1])
    if len(sys.argv) == 3:
        print '(at %s)' % str(sys.argv[2])

    rpcClient = RPCClient("WorkloadManagement/JobMonitoring")

    for jobstate in JOB_STATES:
        JOBFILTER = {}
        JOBFILTER['OwnerGroup'] = str(sys.argv[1])
        JOBFILTER['Status'] = str(jobstate)
        if len(sys.argv) == 3:
            JOBFILTER['Site'] = str(sys.argv[2])
        # print JOBFILTER
        jobs = rpcClient.getJobs(JOBFILTER)
        if not jobs["OK"]:
            print "Could not retrieve jobs."
            sys.exit(1)
        job_ids = jobs["Value"]
        print '{0:<10} {1:>6}'.format(str(jobstate) + ':', len(job_ids))
def test_matcher(self):
    # insert a proper DN to run the test
    resourceDescription = {
        'OwnerGroup': 'prod',
        'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
        'DIRACVersion': 'pippo',
        'ReleaseVersion': 'blabla',
        'VirtualOrganization': 'LHCB',
        'PilotInfoReportedFlag': 'True',
        'PilotBenchmark': 'anotherPilot',
        'LHCbPlatform': 'CERTO',
        'Site': 'DIRAC.Jenkins.org',
        'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    JobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.org')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assert_(res['OK'])
    jobID = res['Value']

    res = JobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assert_(res['OK'])

    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup': 'prod', 'Setup': 'JenkinsSetup', 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assert_(res['OK'])

    res = matcher.requestJob(resourceDescription)
    print res
    self.assert_(res['OK'])
    wmsClient.deleteJob(jobID)
def killJob(self, jobID):
    """ Kill running job.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    jobManager = RPCClient('WorkloadManagement/JobManager',
                           useCertificates=False,
                           timeout=self.timeout)
    result = jobManager.killJob(jobID)
    return result
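# A minimal usage sketch of the job-management wrappers above (deleteJob,
# resetJob, killJob), assuming a Dirac-API-like object 'dirac' that exposes
# them and a valid proxy; the job ID is illustrative. All three accept a single
# ID or a list of IDs and return the usual S_OK/S_ERROR structure.
res = dirac.killJob(12345)
if not res['OK']:
    print 'killJob failed: %s' % res['Message']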
def setRequest(self, requestType, requestName, requestString, requestStatus='ToDo', url=''):
    """ Set request. A URL can be supplied; if not, all VOBoxes will be tried
        in random order.
    """
    try:
        urls = []
        if url:
            urls.append(url)
        urls.extend(self.voBoxUrls)
        for url in urls:
            requestRPCClient = RPCClient(url)
            res = requestRPCClient.setRequest(requestType, requestName, requestStatus, requestString)
            if res['OK']:
                gLogger.info("Succeeded setting request for %s at %s" % (requestName, url))
                res["Server"] = url
                return res
            else:
                errKey = "Failed setting request at %s" % url
                errExpl = " : for %s because: %s" % (requestName, res['Message'])
                gLogger.error(errKey, errExpl)
        errKey = "Completely failed setting request"
        errExpl = " : %s\n%s\n%s" % (requestName, requestType, requestString)
        gLogger.fatal(errKey, errExpl)
        return S_ERROR(errKey)
    except Exception, x:
        errKey = "Completely failed setting request"
        errExpl = " : for %s with exception %s" % (requestName, str(x))
        gLogger.exception(errKey, errExpl)
        return S_ERROR(errKey)
def getRequestSummary(self, url=''):
    """ Get the summary of requests in the RequestDBs.
        If a URL is not supplied, the summary is obtained for all VOBoxes.
    """
    try:
        if url:
            urls = [url]
        else:
            urls = self.voBoxUrls
        res = S_OK({})
        for url in urls:
            requestRPCClient = RPCClient(url, timeout=120)
            res['Value'][url] = {}
            result = requestRPCClient.getRequestSummary()
            if result['OK']:
                gLogger.info("Succeeded getting request summary at %s" % url)
                res['Value'][url] = result['Value']
            else:
                errKey = "Failed getting request summary"
                errExpl = " : at %s because %s" % (url, result['Message'])
                gLogger.error(errKey, errExpl)
        return res
    except Exception, x:
        errKey = "Failed getting request summary"
        errExpl = " : with exception %s" % str(x)
        gLogger.exception(errKey, errExpl)
        return S_ERROR(errKey + errExpl)
def web_datasetListFiles(self):
    self.log.debug(self.request.arguments)
    dataset = None
    if self.request.arguments.get("dataset", None):
        dataset = self.request.arguments["dataset"][0]
    data = []
    cache = []
    if dataset:
        RPC = RPCClient("Transfer/Dataset")
        res = RPC.list(dataset)
        # self.log.always(res)
        # {'OK': True,
        #  'rpcStub': (('Transfer/Dataset',
        #               {'skipCACheck': False, 'keepAliveLapse': 150,
        #                'delegatedGroup': 'bes_user',
        #                'delegatedDN': '/C=CN/O=HEP/OU=PHYS/O=IHEP/CN=Tian Yan',
        #                'timeout': 600}),
        #              'list', ('jpsi-all-ok',)),
        #  'Value': [
        #    (176L, '/zhanggang_test/File/jpsi/6.6.4/mc/inclusive/round02/stream001/jpsi2009_stream001_run10005_file14', 7L),
        #    (177L, '/zhanggang_test/File/jpsi/6.6.4/mc/inclusive/round02/stream001/jpsi2009_stream001_run10137_file7', 7L),
        #  ]}
        if res["OK"]:
            cache = res["Value"]
    for i, f, di in cache:
        data.append({
            "id": i,
            "file": f,
        })
    self.write({"result": data})
def getDirectoryMetadata(self, path):
    """ Get metadata associated to the directory """
    res = checkArgumentFormat(path)
    if not res['OK']:
        return res
    urls = res['Value']
    successful = {}
    failed = {}
    gLogger.debug("DIPStorage.getDirectoryMetadata: Attempting to obtain metadata for %s directories." % len(urls))
    serviceClient = RPCClient(self.url)
    for url in urls:
        res = serviceClient.getMetadata(url)
        if res['OK']:
            if res['Value']['Exists']:
                if res['Value']['Type'] == 'Directory':
                    res['Value']['Directory'] = True
                    gLogger.debug("DIPStorage.getDirectoryMetadata: Successfully obtained metadata for %s." % url)
                    successful[url] = res['Value']
                else:
                    failed[url] = 'Supplied path is not a directory'
            else:
                failed[url] = 'Directory does not exist'
        else:
            gLogger.error("DIPStorage.getDirectoryMetadata: Failed to get metadata for url",
                          "%s: %s" % (url, res['Message']))
            failed[url] = res['Message']
    resDict = {'Failed': failed, 'Successful': successful}
    return S_OK(resDict)
def getRequest(self, requestType, status):
    """ Get a request from the RequestDB.
        First try the local repository; if none is available, or on error,
        try a random repository.
    """
    try:
        # Create a list with two RequestDB URLs to try
        url = self.localUrl
        urls = [url]
        urls.append(self.voBoxUrls.pop())

        for url in urls:
            requestRPCClient = RPCClient(url, timeout=120)
            res = requestRPCClient.getRequest(requestType, status)
            if res['OK']:
                if res['Request']:
                    gLogger.info("Got '%s' request from RequestDB (%s) with status '%s'"
                                 % (requestType, url, status))
                    res['Server'] = url
                    return res
                else:
                    gLogger.info("Found no '%s' requests on RequestDB (%s) with status '%s'"
                                 % (requestType, url, status))
            else:
                errKey = "Failed getting request from %s" % url
                errExpl = " : %s of %s because: %s" % (requestType, status, res['Message'])
                gLogger.error(errKey, errExpl)
        return res
    except Exception, x:
        errKey = "Failed to get request"
        errExpl = " : %s" % str(x)
        gLogger.exception(errKey, errExpl)
        return S_ERROR(errKey + errExpl)
def __getPilots(self):
    """ Prepare pilot objects for submission to the CE """
    rpcClient = RPCClient("WorkloadManagement/Matcher")
    result = rpcClient.getMatchingTaskQueues(self.matchDict)
    self.log.info('Matching result', result)
    if not result['OK']:
        self.log.warn(result['Message'])
        return S_ERROR(result['Message'])
    taskQueues = result['Value']
    numberOfJobs = 0
    for taskQueueID in taskQueues:
        numberOfJobs += taskQueues[taskQueueID]['Jobs']
    if not numberOfJobs:
        return S_OK('No jobs selected for conditions: %s' % self.matchDict)
    else:
        # numberOfPilots = len(jobIDs)
        pilots = []
        pilot = self.__createPilotFile()
        proxy = self.__getProxy()
        self.log.verbose('%s job(s) selected' % numberOfJobs)
        # Submit pilots for all jobs, but not more than configured in maxPilots
        for _i in xrange(0, min(self.maxPilots, numberOfJobs)):
            pilots.append({'pilotFile': pilot, 'proxyString': proxy})
    return S_OK(pilots)
def web_getSelectionData(self):
    sData = self.getSessionData()
    group = sData["user"]["group"]
    user = sData["user"]["username"]
    callback = {}
    if len(self.request.arguments) > 0:
        tmp = {}
        for i in self.request.arguments:
            tmp[i] = str(self.request.arguments[i][0]).replace('"', '')
        callback["extra"] = tmp
    RPC = RPCClient("WorkloadManagement/WMSAdministrator")
    result = yield self.threadTask(RPC.getPilotMonitorSelectors)
    if result["OK"]:
        result = result["Value"]
        # Simple selectors: one list of single-element rows per key
        simpleSelectors = [("Status", "status"),
                           ("GridType", "gridtype"),
                           ("OwnerGroup", "ownerGroup"),
                           ("DestinationSite", "computingElement"),
                           ("Broker", "broker"),
                           ("Owner", "owner")]
        for key, cbKey in simpleSelectors:
            if key in result and len(result[key]) > 0:
                callback[cbKey] = [[str(i)] for i in result[key]]
            else:
                callback[cbKey] = [["Nothing to display"]]
        # Sites: list the preferred (Tier-1) sites first, then the rest
        if "GridSite" in result and len(result["GridSite"]) > 0:
            tier1 = gConfig.getValue("/WebApp/PreferredSites", [])
            site = []
            s = list(result["GridSite"])
            for i in tier1:
                site.append([str(i)])
            for i in s:
                if i not in tier1:
                    site.append([str(i)])
        else:
            site = [["Error during RPC call"]]
        callback["site"] = site
    self.finish(callback)
class JobsWMSCommand(Command):

    def __init__(self, args=None, clients=None):
        super(JobsWMSCommand, self).__init__(args, clients)
        if 'WMSAdministrator' in self.apis:
            self.wmsAdmin = self.apis['WMSAdministrator']
        else:
            self.wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

    def doCommand(self):
        """ Returns simple jobs efficiency

            :attr:`args`:
              - args[0]: string: should be a ValidElement
              - args[1]: string: should be the name of the ValidElement

            returns:
              { 'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad' }
        """
        if 'siteName' not in self.args:
            return self.returnERROR(S_ERROR('siteName is missing'))
        siteName = self.args['siteName']

        # If siteName is None, we take all sites
        if siteName is None:
            siteName = CSHelpers.getSites()
            if not siteName['OK']:
                return self.returnERROR(siteName)
            siteName = siteName['Value']

        results = self.wmsAdmin.getSiteSummaryWeb({'Site': siteName}, [], 0, 500)
        if not results['OK']:
            return self.returnERROR(results)
        results = results['Value']

        if 'ParameterNames' not in results:
            return self.returnERROR(S_ERROR('Malformed result dictionary'))
        params = results['ParameterNames']

        if 'Records' not in results:
            return self.returnERROR(S_ERROR('Malformed result dictionary'))
        records = results['Records']

        jobResults = []
        for record in records:
            jobDict = dict(zip(params, record))
            try:
                jobDict['Efficiency'] = float(jobDict['Efficiency'])
            except KeyError, e:
                return self.returnERROR(S_ERROR(e))
            except ValueError, e:
                return self.returnERROR(S_ERROR(e))
            jobResults.append(jobDict)

        # The snippet ended without returning the accumulated results; returning
        # them here in the usual S_OK convention, mirroring JobCommand.doNew below
        return S_OK(jobResults)
class JobCommand(Command):
    """
    Job "master" Command.
    """

    def __init__(self, args=None, clients=None):
        super(JobCommand, self).__init__(args, clients)

        if 'WMSAdministrator' in self.apis:
            self.wmsAdmin = self.apis['WMSAdministrator']
        else:
            self.wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        """
        Stores the results of the doNew method in the database.
        """
        for jobDict in result:
            resQuery = self.rmClient.addOrModifyJobCache(jobDict['Site'],
                                                         jobDict['MaskStatus'],
                                                         jobDict['Efficiency'],
                                                         jobDict['Status'])
            if not resQuery['OK']:
                return resQuery
        return S_OK()

    def _prepareCommand(self):
        """
        JobCommand requires one argument:
        - name : <str>
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        name = self.args['name']
        return S_OK(name)

    def doNew(self, masterParams=None):
        """
        Gets the parameters to run, either from the master method or from its
        own arguments.

        It contacts the WMSAdministrator with a list of site names, or a single
        site. If there are jobs, they are recorded and then returned.
        """
        if masterParams is not None:
            name = masterParams
        else:
            params = self._prepareCommand()
            if not params['OK']:
                return params
            name = params['Value']

        # selectDict, sortList, startItem, maxItems
        # Returns statistics of the last day!
        results = self.wmsAdmin.getSiteSummaryWeb({'Site': name}, [], 0, 0)
        if not results['OK']:
            return results
        results = results['Value']

        if 'ParameterNames' not in results:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        params = results['ParameterNames']

        if 'Records' not in results:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')
        records = results['Records']

        uniformResult = []

        for record in records:
            # This returns a dictionary with the following keys:
            # 'Site', 'GridType', 'Country', 'Tier', 'MaskStatus', 'Received',
            # 'Checking', 'Staging', 'Waiting', 'Matched', 'Running', 'Stalled',
            # 'Done', 'Completed', 'Failed', 'Efficiency', 'Status'
            jobDict = dict(zip(params, record))

            # We cast efficiency to a float
            jobDict['Efficiency'] = float(jobDict['Efficiency'])

            uniformResult.append(jobDict)

        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """
        Method that reads the cache table and tries to read from it. It will
        return a list of dictionaries if there are results.
        """
        params = self._prepareCommand()
        if not params['OK']:
            return params
        name = params['Value']

        result = self.rmClient.selectJobCache(name)
        if result['OK']:
            result = S_OK([dict(zip(result['Columns'], res)) for res in result['Value']])

        return result

    def doMaster(self):
        """
        Master method. Gets all sites and calls the doNew method.
        """
        siteNames = CSHelpers.getSites()
        if not siteNames['OK']:
            return siteNames
        siteNames = siteNames['Value']

        jobsResults = self.doNew(siteNames)
        if not jobsResults['OK']:
            self.metrics['failed'].append(jobsResults['Message'])

        return S_OK(self.metrics)
def __getRPCClient(self):
    if self.rpcClient:
        return self.rpcClient
    return RPCClient(self.serviceName)
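# A minimal sketch of why the lazy accessor above is useful: callers can inject
# a mock client for tests, while production code builds a real RPCClient on
# first use. The '_Example' class and the service name here are illustrative.
class _Example(object):

    def __init__(self, rpcClient=None):
        self.rpcClient = rpcClient
        self.serviceName = 'WorkloadManagement/JobMonitoring'

    def ping(self):
        # Every public method goes through the accessor instead of holding
        # a connection at construction time
        return self.__getRPCClient().ping()

    def __getRPCClient(self):
        if self.rpcClient:
            return self.rpcClient
        return RPCClient(self.serviceName)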
class WMSClient(object):

    def __init__(self, jobManagerClient=None, sbRPCClient=None, sbTransferClient=None,
                 useCertificates=False, timeout=600):
        """ WMS Client constructor

            Here we also initialize the needed clients and connections
        """
        self.useCertificates = useCertificates
        self.timeout = timeout
        self.jobManager = jobManagerClient
        self.sandboxClient = None
        if sbRPCClient and sbTransferClient:
            self.sandboxClient = SandboxStoreClient(rpcClient=sbRPCClient,
                                                    transferClient=sbTransferClient,
                                                    useCertificates=useCertificates)

###############################################################################

    def __getInputSandboxEntries(self, classAdJob):
        if classAdJob.lookupAttribute("InputSandbox"):
            inputSandbox = classAdJob.get_expression("InputSandbox")
            inputSandbox = inputSandbox.replace('","', "\n")
            inputSandbox = inputSandbox.replace('{', "")
            inputSandbox = inputSandbox.replace('}', "")
            inputSandbox = inputSandbox.replace('"', "")
            inputSandbox = inputSandbox.replace(',', "")
            inputSandbox = inputSandbox.split()
        else:
            inputSandbox = []
        return inputSandbox

    def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
        """Checks the validity of the job Input Sandbox.
           The function returns the list of Input Sandbox files.
           The total volume of the input sandbox is evaluated
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)

        realFiles = []
        badFiles = []
        diskFiles = []

        for isFile in inputSandbox:
            if not isFile.startswith(('lfn:', 'LFN:', 'SB:', '%s', '%(')):
                realFiles.append(isFile)

        stringIOFiles = []
        stringIOFilesSize = 0
        if jobDescriptionObject is not None:
            if isinstance(jobDescriptionObject, StringIO.StringIO):
                stringIOFiles = [jobDescriptionObject]
                stringIOFilesSize = len(jobDescriptionObject.buf)
                gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))
            else:
                return S_ERROR("jobDescriptionObject is not a StringIO object")

        # Check real files
        for isFile in realFiles:
            if not os.path.exists(isFile):  # we are passing in real files, we expect them to be on disk
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile +
                             " not found. Keep looking for the others")
                continue
            diskFiles.append(isFile)

        diskFilesSize = File.getGlobbedTotalSize(diskFiles)
        gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
        totalSize = diskFilesSize + stringIOFilesSize
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

        okFiles = stringIOFiles + diskFiles
        if badFiles:
            result = S_ERROR('Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result

        if okFiles:
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(useCertificates=self.useCertificates)
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

        return S_OK()

    def submitJob(self, jdl, jobDescriptionObject=None):
        """ Submit one job specified by its JDL to WMS """
        if os.path.exists(jdl):
            fic = open(jdl, "r")
            jdlString = fic.read()
            fic.close()
        else:
            # If file JDL does not exist, assume that the JDL is passed as a string
            jdlString = jdl

        jdlString = jdlString.strip()

        # Strip comments in the JDL string
        newJdlList = []
        for line in jdlString.split('\n'):
            if not line.strip().startswith('#'):
                newJdlList.append(line)
        jdlString = '\n'.join(newJdlList)

        # Check the validity of the input JDL
        if jdlString.find("[") != 0:
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR('Invalid job JDL')

        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob, jobDescriptionObject)
        if not result['OK']:
            return result

        # Submit the job now and get the new job ID
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        result = self.jobManager.submitJob(classAdJob.asJDL())

        if 'requireProxyUpload' in result and result['requireProxyUpload']:
            gLogger.warn("Need to upload the proxy")

        return result

    def killJob(self, jobID):
        """ Kill running job.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.killJob(jobID)

    def deleteJob(self, jobID):
        """ Delete job(s) from the WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.deleteJob(jobID)

    def rescheduleJob(self, jobID):
        """ Reschedule job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.rescheduleJob(jobID)

    def resetJob(self, jobID):
        """ Reset job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.resetJob(jobID)
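# A minimal usage sketch for WMSClient.submitJob above, assuming a valid proxy
# and a reachable WorkloadManagement/JobManager service; the JDL string is
# illustrative. submitJob accepts either a path to a JDL file or the JDL itself.
wmsClient = WMSClient()
jdl = 'Executable = "/bin/echo"; Arguments = "hello";'
res = wmsClient.submitJob(jdl)
if res['OK']:
    print 'Submitted job', res['Value']
else:
    print 'Submission failed:', res['Message']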
       @type job: integer or list of integers
       @return: S_OK,S_ERROR
    """
    if type(jobID) == type(" "):
        try:
            jobID = int(jobID)
        except Exception, x:
            return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobID')
    elif type(jobID) == type([]):
        try:
            jobID = [int(job) for job in jobID]
        except Exception, x:
            return self._errorReport(str(x), 'Expected integer or convertible integer for existing jobIDs')

    jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False)
    result = jobManager.resetJob(jobID)
    return result

#############################################################################

def getJobPilotOutput(self, jobID, directory=''):
    """Retrieve the pilot output for an existing job in the WMS.
       The output will be retrieved in a local directory unless
       otherwise specified.

       >>> print dirac.getJobPilotOutput(12345)
       {'OK': True, StdOut:'',StdError:''}

       @param job: JobID
       @type job: integer or string
       @return: S_OK,S_ERROR
def finalizeRequest(self, requestID, jobID, useCertificates=True):
    """ Check the request status and perform finalization if necessary,
        then update the request status and the corresponding job parameter

        :param self: self reference
        :param str requestID: request id
        :param int jobID: job id
    """
    # FIXME: use JobStateUpdateClient
    stateServer = RPCClient("WorkloadManagement/JobStateUpdate", useCertificates=useCertificates)

    # Checking if to update the job status - we should fail here, so it will be re-tried later
    # Checking the state, first
    res = self.getRequestStatus(requestID)
    if not res['OK']:
        self.log.error("finalizeRequest: failed to get request",
                       "request: %s status: %s" % (requestID, res["Message"]))
        return res
    if res["Value"] != "Done":
        return S_ERROR("The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                       % (requestID, res['Value']))

    # The request is 'Done', let's update the job status. If we fail, we should re-try later
    # FIXME: use JobMonitoringClient
    monitorServer = RPCClient("WorkloadManagement/JobMonitoring", useCertificates=useCertificates)
    res = monitorServer.getJobSummary(int(jobID))
    if not res["OK"]:
        self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
        return res
    elif not res['Value']:
        self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
        return S_OK()
    else:
        jobStatus = res["Value"]['Status']
        jobMinorStatus = res["Value"]["MinorStatus"]

    jobAppStatus = ''
    newJobStatus = ''
    if jobStatus == JobStatus.STALLED:
        # If the job is stalled, find the previous status from the logging info
        res = monitorServer.getJobLoggingInfo(int(jobID))
        if not res['OK']:
            self.log.error("finalizeRequest: Failed to get job logging info", "JobID: %d" % jobID)
            return res
        # Check that the last status was Stalled and get the one before
        if len(res['Value']) >= 2 and res['Value'][-1][0] == JobStatus.STALLED:
            jobStatus, jobMinorStatus, jobAppStatus = res['Value'][-2][:3]
            newJobStatus = jobStatus

    # Update the job pending request digest in any case since it is modified
    self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)
    digest = self.getDigest(requestID)
    if digest["OK"]:
        digest = digest["Value"]
        self.log.verbose(digest)
        res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
        if not res["OK"]:
            self.log.info("finalizeRequest: Failed to set job %d parameter: %s"
                          % (jobID, res["Message"]))
            return res
    else:
        self.log.error("finalizeRequest: Failed to get request digest for %s: %s"
                       % (requestID, digest["Message"]))

    if jobStatus == JobStatus.COMPLETED:
        # What to do? Depends on what we have in the minorStatus
        if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
            newJobStatus = JobStatus.DONE
        elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
            newJobStatus = JobStatus.FAILED
        else:
            self.log.error("finalizeRequest: Unexpected jobMinorStatus",
                           "(got %s)" % jobMinorStatus)
            return S_ERROR("Unexpected jobMinorStatus")

    if newJobStatus:
        self.log.info("finalizeRequest: Updating job status for %d to %s/Requests done"
                      % (jobID, newJobStatus))
    else:
        self.log.info("finalizeRequest: Updating job minor status",
                      "for %d to 'Requests done' (current status is %s)" % (jobID, jobStatus))
    stateUpdate = stateServer.setJobStatus(jobID, newJobStatus, "Requests done", 'RMS')
    if jobAppStatus and stateUpdate['OK']:
        stateUpdate = stateServer.setJobApplicationStatus(jobID, jobAppStatus, 'RMS')
    if not stateUpdate["OK"]:
        self.log.error("finalizeRequest: Failed to set job status",
                       "JobID: %d, error: %s" % (jobID, stateUpdate['Message']))
        return stateUpdate

    return S_OK(newJobStatus)
def web_getSelectionData(self):
    sData = self.getSessionData()
    callback = {}
    group = sData["user"]["group"]
    user = sData["user"]["username"]
    if user == "Anonymous":
        self.finish({"success": "false", "result": [], "total": 0,
                     "error": "Insufficient rights"})
    else:
        RPC = RPCClient("RequestManagement/ReqManager")

        ### R E Q U E S T   T Y P E
        result = yield self.threadTask(RPC.getDistinctValuesWeb, "Type")
        if result["OK"]:
            reqtype = list()
            if len(result["Value"]) > 0:
                for i in result["Value"]:
                    reqtype.append([str(i)])
            else:
                reqtype = [["Nothing to display"]]
        else:
            reqtype = [["Error during RPC call"]]
        callback["operationType"] = reqtype

        ### U S E R
        result = yield self.threadTask(RPC.getDistinctValuesWeb, "OwnerDN")
        if result["OK"]:
            owner = []
            for dn in result["Value"]:
                owner.append([dn])
            if len(owner) < 2:
                owner = [["Nothing to display"]]
        else:
            owner = [["Error during RPC call"]]
        callback["owner"] = owner

        ### G R O U P
        result = yield self.threadTask(RPC.getDistinctValuesWeb, "OwnerGroup")
        gLogger.info("getDistinctValuesWeb(OwnerGroup)", result)
        if result["OK"]:
            ownerGroup = list()
            if len(result["Value"]) > 0:
                for i in result["Value"]:
                    ownerGroup.append([str(i)])
            else:
                ownerGroup = [["Nothing to display"]]
        else:
            ownerGroup = [["Error during RPC call"]]
        callback["ownerGroup"] = ownerGroup

        ### S T A T U S
        result = yield self.threadTask(RPC.getDistinctValuesWeb, "Status")
        if result["OK"]:
            status = list()
            if len(result["Value"]) > 0:
                for i in result["Value"]:
                    status.append([str(i)])
            else:
                status = [["Nothing to display"]]
        else:
            status = [["Error during RPC call"]]
        callback["status"] = status

        self.finish(callback)
def web_getRequestMonitorData(self):
    RPC = RPCClient("RequestManagement/ReqManager", timeout=600)
    callback = {}
    req = self.__request()
    result = yield self.threadTask(RPC.getRequestSummaryWeb, req,
                                   self.globalSort, self.pageNumber, self.numberOfJobs)
    if not result["OK"]:
        self.finish({"success": "false", "result": [], "total": 0,
                     "error": result["Message"]})
        return
    result = result["Value"]
    if "TotalRecords" not in result:
        self.finish({"success": "false", "result": [], "total": -1,
                     "error": "Data structure is corrupted"})
        return
    if not (result["TotalRecords"] > 0):
        self.finish({"success": "false", "result": [], "total": 0,
                     "error": "There were no data matching your selection"})
        return
    if not ("ParameterNames" in result and "Records" in result):
        self.finish({"success": "false", "result": [], "total": -1,
                     "error": "Data structure is corrupted"})
        return
    if not (len(result["ParameterNames"]) > 0):
        self.finish({"success": "false", "result": [], "total": -1,
                     "error": "ParameterNames field is missing"})
        return
    if not (len(result["Records"]) > 0):
        self.finish({"success": "false", "result": [], "total": 0,
                     "Message": "There are no data to display"})
        return
    callback = []
    jobs = result["Records"]
    head = result["ParameterNames"]
    headLength = len(head)
    for i in jobs:
        tmp = {}
        for j in range(0, headLength):
            if j == 2 and i[j] == "None":
                i[j] = "-"
            tmp[head[j]] = i[j]
        callback.append(tmp)
    total = result["TotalRecords"]
    timestamp = Time.dateTime().strftime("%Y-%m-%d %H:%M [UTC]")
    if "Extras" in result:
        st = self.__dict2string({})
        extra = result["Extras"]
        callback = {"success": "true", "result": callback, "total": total,
                    "extra": extra, "request": st, "date": timestamp}
    else:
        callback = {"success": "true", "result": callback, "total": total,
                    "date": timestamp}
    self.finish(callback)
# -*- coding: utf-8 -*-
import DIRAC
from DIRAC import gLogger
from DIRAC.Core.Base import Script

Script.setUsageMessage("""
Show a list of datasets
""")
Script.parseCommandLine(ignoreErrors=True)

from DIRAC.Core.DISET.RPCClient import RPCClient

transferRequest = RPCClient("Transfer/Dataset")

condDict = {}
orderby = []
start = 0
limit = 50

res = transferRequest.showtotal(condDict)
if not res["OK"]:
    gLogger.error(res)
    DIRAC.exit(-1)
print "Total:", res["Value"]

res = transferRequest.show(condDict, orderby, start, limit)
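# A possible continuation of the script above, assuming the 'show' call returns
# rows in the same S_OK/'Value' convention as 'showtotal'; the per-row layout is
# not shown in this script, so the rows are printed verbatim.
if not res["OK"]:
    gLogger.error(res)
    DIRAC.exit(-1)
for row in res["Value"]:
    print row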
class WMSClient(object):

    def __init__(self, jobManagerClient=None, sbRPCClient=None, sbTransferClient=None,
                 useCertificates=False, timeout=600):
        """ WMS Client constructor

            Here we also initialize the needed clients and connections
        """
        self.useCertificates = useCertificates
        self.timeout = timeout
        self.jobManager = jobManagerClient
        self.sandboxClient = None
        if sbRPCClient and sbTransferClient:
            self.sandboxClient = SandboxStoreClient(rpcClient=sbRPCClient,
                                                    transferClient=sbTransferClient,
                                                    useCertificates=useCertificates)

###############################################################################

    def __getInputSandboxEntries(self, classAdJob):
        if classAdJob.lookupAttribute("InputSandbox"):
            inputSandbox = classAdJob.get_expression("InputSandbox")
            inputSandbox = inputSandbox.replace('","', "\n")
            inputSandbox = inputSandbox.replace('{', "")
            inputSandbox = inputSandbox.replace('}', "")
            inputSandbox = inputSandbox.replace('"', "")
            inputSandbox = inputSandbox.replace(',', "")
            inputSandbox = inputSandbox.split()
        else:
            inputSandbox = []
        return inputSandbox

    def __uploadInputSandbox(self, classAdJob):
        """Checks the validity of the job Input Sandbox.
           The function returns the list of Input Sandbox files.
           The total volume of the input sandbox is evaluated
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)

        badFiles = []
        okFiles = []
        realFiles = []
        for isFile in inputSandbox:
            valid = True
            # In case of a parametric input sandbox, '%s' is passed, so it has to be ignored as well
            for tag in ('lfn:', 'LFN:', 'SB:', '%s'):
                if isFile.find(tag) == 0:
                    valid = False
                    break
            if valid:
                realFiles.append(isFile)

        # If there are no files, skip!
        if not realFiles:
            return S_OK()

        # Check real files
        for isFile in realFiles:
            if not os.path.exists(isFile):
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile +
                             " not found. Keep looking for the others")
                continue
            okFiles.append(isFile)

        totalSize = File.getGlobbedTotalSize(okFiles)
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

        if badFiles:
            result = S_ERROR('Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result

        if okFiles:
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(useCertificates=self.useCertificates)
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

        return S_OK()

    def submitJob(self, jdl):
        """ Submit one job specified by its JDL to WMS """
        if os.path.exists(jdl):
            fic = open(jdl, "r")
            jdlString = fic.read()
            fic.close()
        else:
            # If file JDL does not exist, assume that the JDL is passed as a string
            jdlString = jdl

        # Check the validity of the input JDL
        jdlString = jdlString.strip()
        if jdlString.find("[") != 0:
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR('Invalid job JDL')

        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob)
        if not result['OK']:
            return result

        # Submit the job now and get the new job ID
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        result = self.jobManager.submitJob(classAdJob.asJDL())

        if 'requireProxyUpload' in result and result['requireProxyUpload']:
            gLogger.warn("Need to upload the proxy")

        return result

    def killJob(self, jobID):
        """ Kill running job.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.killJob(jobID)

    def deleteJob(self, jobID):
        """ Delete job(s) from the WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.deleteJob(jobID)

    def rescheduleJob(self, jobID):
        """ Reschedule job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.rescheduleJob(jobID)

    def resetJob(self, jobID):
        """ Reset job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.resetJob(jobID)
def web_getStatisticsData(self):
    req = self.__request()
    paletteColor = Palette()

    RPC = RPCClient("WorkloadManagement/WMSAdministrator")

    selector = self.request.arguments["statsField"][0]
    if selector == 'Site':
        selector = "GridSite"
    if selector == "Computing Element":
        selector = "DestinationSite"
    elif selector == "Owner Group":
        selector = "OwnerGroup"
    elif selector == "Owner":
        selector = "OwnerDN"

    result = yield self.threadTask(RPC.getPilotStatistics, selector, req)
    if not result['OK']:
        if 'FromDate' in req:
            del req['FromDate']
        if 'LastUpdate' in req:
            del req['LastUpdate']
        if 'ToDate' in req:
            del req['ToDate']
        result = yield self.threadTask(RPC.getCounters, "PilotAgents", [selector], req)
        statistics = {}
        if result['OK']:
            for status, count in result['Value']:
                if "OwnerDN" in status:
                    userName = getUsernameForDN(status['OwnerDN'])
                    if userName['OK']:
                        status['OwnerDN'] = userName['Value']
                statistics[status[selector]] = count
        result = S_OK(statistics)

    if result["OK"]:
        callback = []
        result = dict(result["Value"])
        keylist = result.keys()
        keylist.sort()
        if selector == "Site":
            tier1 = gConfig.getValue("/WebApp/PreferredSites", [])
            if len(tier1) > 0:
                tier1.sort()
                for i in tier1:
                    if result.has_key(i):
                        countryCode = i.rsplit(".", 1)[1]
                        callback.append({"key": i, "value": result[i], "code": countryCode,
                                         "color": paletteColor.getColor(countryCode)})
        for key in keylist:
            if selector == "Site" and tier1:
                if key not in tier1:
                    try:
                        countryCode = key.rsplit(".", 1)[1]
                    except:
                        countryCode = "Unknown"
                    callback.append({"key": key, "value": result[key], "code": countryCode,
                                     "color": paletteColor.getColor(key)})
            elif selector == "Site" and not tier1:
                try:
                    countryCode = key.rsplit(".", 1)[1]
                except:
                    countryCode = "Unknown"
                callback.append({"key": key, "value": result[key], "code": countryCode,
                                 "color": paletteColor.getColor(key)})
            else:
                callback.append({"key": key, "value": result[key], "code": "",
                                 "color": paletteColor.getColor(key)})
        callback = {"success": "true", "result": callback}
    else:
        callback = {"success": "false", "error": result["Message"]}
    self.finish(callback)
def setSiteStatus(self, site, status, comment='No comment'):
    """
    Set the status of a site in the 'SiteStatus' table of RSS

    examples
      >>> siteStatus.setSiteStatus( 'site1.test.test', 'Banned' )
      S_OK()
      >>> siteStatus.setSiteStatus( None, 'Banned' )
      S_ERROR( ... )

    :Parameters:
      **site** - `String`
        the site whose status is to be set
      **status** - `String`
        the new status
      **comment** - `String`
        reason for the status change

    :return: S_OK() || S_ERROR()
    """
    if not status:
        return S_ERROR(DErrno.ERESUNK, 'status parameter is empty')

    # Fix case-sensitive string
    status = status.capitalize()
    allowedStateList = ['Active', 'Banned', 'Degraded', 'Probing', 'Error', 'Unknown']
    if status not in allowedStateList:
        return S_ERROR(errno.EINVAL, 'Not a valid status, parameter rejected')

    if self.rssFlag:
        result = getProxyInfo()
        if result['OK']:
            tokenOwner = result['Value']['username']
        else:
            return S_ERROR("Unable to get user proxy info %s " % result['Message'])

        tokenExpiration = datetime.utcnow() + timedelta(days=1)

        self.rssCache.acquireLock()
        try:
            result = self.rsClient.modifyStatusElement('Site', 'Status', status=status,
                                                       name=site,
                                                       tokenExpiration=tokenExpiration,
                                                       reason=comment,
                                                       tokenOwner=tokenOwner)
            if result['OK']:
                self.rssCache.refreshCache()
            else:
                _msg = 'Error updating status of site %s to %s' % (site, status)
                gLogger.warn('RSS: %s' % _msg)
        # Release the lock, no matter what
        finally:
            self.rssCache.releaseLock()
    else:
        # Legacy path via the WMSAdministrator; the original snippet called
        # allowSite()/banSite() with no arguments, but those service methods
        # take the site name and a comment (see addSiteInMask/banSiteFromMask above)
        if status in ['Active', 'Degraded']:
            result = RPCClient('WorkloadManagement/WMSAdministrator').allowSite(site, comment)
        else:
            result = RPCClient('WorkloadManagement/WMSAdministrator').banSite(site, comment)

    return result
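# A minimal usage sketch, assuming an instantiated SiteStatus-like object
# 'siteStatus' exposing the method above and a proxy with the required rights;
# the site name and comment are illustrative.
res = siteStatus.setSiteStatus('LCG.Some.Site', 'Banned', 'scheduled downtime')
if not res['OK']:
    gLogger.error('Failed to ban site: %s' % res['Message'])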
class Publisher:
  """
  Class Publisher is in charge of getting dispersed information,
  to be published on the web.
  """

  #############################################################################

  def __init__(self, VOExtension, rsDBIn=None, commandCallerIn=None, infoGetterIn=None, WMSAdminIn=None):
    """
    Standard constructor

    :params:
      :attr:`VOExtension`: string, VO extension (e.g. 'LHCb')

      :attr:`rsDBIn`: optional ResourceStatusDB object
      (see :class:`DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.ResourceStatusDB`)

      :attr:`commandCallerIn`: optional CommandCaller object
      (see :class:`DIRAC.ResourceStatusSystem.Command.CommandCaller.CommandCaller`)

      :attr:`infoGetterIn`: optional InfoGetter object
      (see :class:`DIRAC.ResourceStatusSystem.Utilities.InfoGetter.InfoGetter`)

      :attr:`WMSAdminIn`: optional RPCClient object for WMSAdmin
      (see :class:`DIRAC.Core.DISET.RPCClient.RPCClient`)
    """

    self.configModule = __import__(VOExtension + "DIRAC.ResourceStatusSystem.Policy.Configurations",
                                   globals(), locals(), ['*'])

    if rsDBIn is not None:
      self.rsDB = rsDBIn
    else:
      from DIRAC.ResourceStatusSystem.DB.ResourceStatusDB import ResourceStatusDB
      self.rsDB = ResourceStatusDB()

    if commandCallerIn is not None:
      self.cc = commandCallerIn
    else:
      from DIRAC.ResourceStatusSystem.Command.CommandCaller import CommandCaller
      self.cc = CommandCaller()

    if infoGetterIn is not None:
      self.ig = infoGetterIn
    else:
      from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import InfoGetter
      self.ig = InfoGetter(VOExtension)

    if WMSAdminIn is not None:
      self.WMSAdmin = WMSAdminIn
    else:
      from DIRAC.Core.DISET.RPCClient import RPCClient
      self.WMSAdmin = RPCClient("WorkloadManagement/WMSAdministrator")

    self.threadPool = ThreadPool(2, 5)
    self.lockObj = threading.RLock()
    self.infoForPanel_res = {}

  #############################################################################

  def getInfo(self, granularity, name, useNewRes=False):
    """
    Standard method to get all the info to be published

    This method uses a ThreadPool (:class:`DIRAC.Core.Utilities.ThreadPool.ThreadPool`)
    with 2-5 threads. The threaded method is
    :meth:`DIRAC.ResourceStatusSystem.Utilities.Publisher.Publisher.getInfoForPanel`

    :params:
      :attr:`granularity`: string - a ValidRes

      :attr:`name`: string - name of the ValidRes

      :attr:`useNewRes`: boolean. When set to True, fresh results are
      retrieved; otherwise cached results are used (where available).
    """

    if granularity not in ValidRes:
      raise InvalidRes, where(self, self.getInfo)

    self.infoForPanel_res = {}

    status = None
    formerStatus = None
    siteType = None
    serviceType = None
    resourceType = None

    if granularity in ('Resource', 'Resources'):
      try:
        resourceType = self.rsDB.getMonitoredsList('Resource', ['ResourceType'],
                                                   resourceName=name)[0][0]
      except IndexError:
        return "%s does not exist!" % name

    if granularity in ('StorageElement', 'StorageElements'):
      try:
        siteType = self.rsDB.getMonitoredsList('StorageElement', ['SiteType'],
                                               storageElementName=name)[0][0]
      except IndexError:
        return "%s does not exist!" % name

    paramNames = ['Type', 'Group', 'Name', 'Policy', 'DIRAC Status',
                  'RSS Status', 'Reason', 'Description']

    infoToGet = self.ig.getInfoToApply(('view_info', ), granularity, status=status,
                                       formerStatus=formerStatus, siteType=siteType,
                                       serviceType=serviceType, resourceType=resourceType,
                                       useNewRes=useNewRes)[0]['Panels']
    infoToGet_res = {}

    recordsList = []
    infosForPolicy = {}

    for panel in infoToGet.keys():

      (granularityForPanel, nameForPanel) = self.__getNameForPanel(granularity, name, panel)

      if not self._resExist(granularityForPanel, nameForPanel):
        continue

      # take the composite RSS result for the name
      nameStatus_res = self._getStatus(nameForPanel, panel)

      recordBase = [None, None, None, None, None, None, None, None]

      recordBase[1] = panel.replace('_Panel', '')
      recordBase[2] = nameForPanel
      try:
        recordBase[4] = nameStatus_res[nameForPanel]['DIRACStatus']  # DIRAC status
      except KeyError:
        # not every panel carries a DIRAC status
        pass
      recordBase[5] = nameStatus_res[nameForPanel]['RSSStatus']  # RSS status

      record = copy.deepcopy(recordBase)
      record[0] = 'ResultsForResource'

      recordsList.append(record)

      # take the info that goes into the panel
      infoForPanel = infoToGet[panel]

      for info in infoForPanel:
        self.threadPool.generateJobAndQueueIt(self.getInfoForPanel,
                                              args=(info, granularityForPanel, nameForPanel))

      self.threadPool.processAllResults()

      for policy in [x.keys()[0] for x in infoForPanel]:
        record = copy.deepcopy(recordBase)
        record[0] = 'SpecificInformation'
        record[3] = policy  # policy name
        record[4] = None  # DIRAC status
        record[5] = self.infoForPanel_res[policy]['Status']  # RSS status for the policy
        record[6] = self.infoForPanel_res[policy]['Reason']  # reason
        record[7] = self.infoForPanel_res[policy]['desc']  # description
        recordsList.append(record)

        infosForPolicy[policy] = self.infoForPanel_res[policy]['infos']

    infoToGet_res['TotalRecords'] = len(recordsList)
    infoToGet_res['ParameterNames'] = paramNames
    infoToGet_res['Records'] = recordsList
    infoToGet_res['Extras'] = infosForPolicy

    return infoToGet_res

  #############################################################################

  def getInfoForPanel(self, info, granularityForPanel, nameForPanel):

    # get single RSS policy results
    policyResToGet = info.keys()[0]
    pol_res = self.rsDB.getPolicyRes(nameForPanel, policyResToGet)
    if pol_res:
      pol_res_dict = {'Status': pol_res[0], 'Reason': pol_res[1]}
    else:
      pol_res_dict = {'Status': 'Unknown', 'Reason': 'Unknown'}
    self.lockObj.acquire()
    try:
      self.infoForPanel_res[policyResToGet] = pol_res_dict
    finally:
      self.lockObj.release()

    # get the policy description
    desc = self._getPolicyDesc(policyResToGet)

    # get other info
    othersInfo = info.values()[0]
    if not isinstance(othersInfo, list):
      othersInfo = [othersInfo]

    info_res = {}

    for oi in othersInfo:
      # 'fmt' avoids shadowing the built-in format()
      fmt = oi.keys()[0]
      what = oi.values()[0]

      info_bit_got = self._getInfo(granularityForPanel, nameForPanel, fmt, what)

      info_res[fmt] = info_bit_got

    self.lockObj.acquire()
    try:
      self.infoForPanel_res[policyResToGet]['infos'] = info_res
      self.infoForPanel_res[policyResToGet]['desc'] = desc
    finally:
      self.lockObj.release()

  #############################################################################

  def _getStatus(self, name, panel):

    # get the RSS status
    RSSStatus = self._getInfoFromRSSDB(name, panel)[0][1]

    # get the DIRAC status
    if panel in ('Site_Panel', 'SE_Panel'):

      if panel == 'Site_Panel':
        DIRACStatus = self.WMSAdmin.getSiteMaskLogging(name)
        if DIRACStatus['OK']:
          DIRACStatus = DIRACStatus['Value'][name].pop()[0]
        else:
          raise RSSException, where(self, self._getStatus)

      elif panel == 'SE_Panel':
        ra = getStorageElementStatus(name, 'ReadAccess')['Value']
        wa = getStorageElementStatus(name, 'WriteAccess')['Value']
        DIRACStatus = {'ReadAccess': ra, 'WriteAccess': wa}

      status = {name: {'RSSStatus': RSSStatus, 'DIRACStatus': DIRACStatus}}

    else:
      status = {name: {'RSSStatus': RSSStatus}}

    return status

  #############################################################################

  def _getInfo(self, granularity, name, fmt, what):

    if fmt == 'RSS':
      info_bit_got = self._getInfoFromRSSDB(name, what)
    else:
      if isinstance(what, dict):
        command = what['CommandIn']
        extraArgs = what['args']
      else:
        command = what
        extraArgs = None

      info_bit_got = self.cc.commandInvocation(granularity, name, None, None, command, extraArgs)

      try:
        info_bit_got = info_bit_got['Result']
      except (KeyError, TypeError):
        # the command may return a bare value instead of a result dictionary
        pass

    return info_bit_got

  #############################################################################

  def _getInfoFromRSSDB(self, name, what):

    paramsL = ['Status']

    siteName = None
    serviceName = None
    resourceName = None
    storageElementName = None
    serviceType = None
    gridSiteName = None

    if what == 'ServiceOfSite':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      paramsL.append('Reason')
      siteName = name
    elif what == 'ResOfCompService':
      gran = 'Resources'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      serviceType = name.split('@')[0]
      gridSiteName = getGOCSiteName(name.split('@')[1])
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'ResOfStorService':
      gran = 'Resources'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      serviceType = name.split('@')[0]
      gridSiteName = getGOCSiteName(name.split('@')[1])
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'ResOfStorEl':
      gran = 'StorageElements'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      storageElementName = name
    elif what == 'StorageElementsOfSite':
      gran = 'StorageElements'
      paramsL.insert(0, 'StorageElementName')
      paramsL.append('Reason')
      if '@' in name:
        DIRACsiteName = name.split('@').pop()
      else:
        DIRACsiteName = name
      gridSiteName = getGOCSiteName(DIRACsiteName)
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'Site_Panel':
      gran = 'Site'
      paramsL.insert(0, 'SiteName')
      siteName = name
    elif what == 'Service_Computing_Panel':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_Storage_Panel':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_VO-BOX_Panel':
      gran = 'Services'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_VOMS_Panel':
      gran = 'Services'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Resource_Panel':
      gran = 'Resource'
      paramsL.insert(0, 'ResourceName')
      resourceName = name
    elif what == 'SE_Panel':
      gran = 'StorageElement'
      paramsL.insert(0, 'StorageElementName')
      storageElementName = name

    info_bit_got = self.rsDB.getMonitoredsList(gran, paramsList=paramsL, siteName=siteName,
                                               serviceName=serviceName, serviceType=serviceType,
                                               resourceName=resourceName,
                                               storageElementName=storageElementName,
                                               gridSiteName=gridSiteName)

    return info_bit_got

  #############################################################################

  def _getPolicyDesc(self, policyName):

    return self.configModule.Policies[policyName]['Description']

  #############################################################################

  def __getNameForPanel(self, granularity, name, panel):

    if granularity in ('Site', 'Sites'):
      if panel == 'Service_Computing_Panel':
        granularity = 'Service'
        name = 'Computing@' + name
      elif panel == 'Service_Storage_Panel':
        granularity = 'Service'
        name = 'Storage@' + name
      elif panel == 'OtherServices_Panel':
        granularity = 'Service'
        name = 'OtherS@' + name
      elif panel == 'Service_VOMS_Panel':
        granularity = 'Service'
        name = 'VOMS@' + name
      elif panel == 'Service_VO-BOX_Panel':
        granularity = 'Service'
        name = 'VO-BOX@' + name

    return (granularity, name)

  #############################################################################

  def _resExist(self, granularity, name):

    siteName = None
    serviceName = None
    resourceName = None
    storageElementName = None

    if granularity in ('Site', 'Sites'):
      siteName = name
    elif granularity in ('Service', 'Services'):
      serviceName = name
    elif granularity in ('Resource', 'Resources'):
      resourceName = name
    elif granularity in ('StorageElement', 'StorageElements'):
      storageElementName = name

    res = self.rsDB.getMonitoredsList(granularity, siteName=siteName, serviceName=serviceName,
                                      resourceName=resourceName,
                                      storageElementName=storageElementName)

    return bool(res)
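A hedged usage sketch for Publisher.getInfo. The VO extension and site name are illustrative only; the return-type check mirrors the behaviour above, where a plain string is returned when the resource does not exist:

# Illustrative values only.
publisher = Publisher('LHCb')
info = publisher.getInfo('Site', 'LCG.Example.org')
if isinstance(info, basestring):
  # getInfo returns an error string when the resource does not exist
  gLogger.error(info)
else:
  gLogger.info('Published %s records' % info['TotalRecords'])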
def getPFNBase( self ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'getPFNBase', [], {} )
def execute(self):
  """Main Agent code:
    1.- Query TaskQueueDB for existing TQs
    2.- Add their Priorities
    3.- Submit pilots
  """

  self.__checkSubmitPools()

  self.directorDict = getResourceDict()
  # Add all submit pools
  self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")

  rpcMatcher = RPCClient("WorkloadManagement/Matcher")
  result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
  if not result['OK']:
    self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
    return result
  taskQueueDict = result['Value']

  self.log.info('Found %s TaskQueues' % len(taskQueueDict))

  if not taskQueueDict:
    self.log.info('No TaskQueue to Process')
    return S_OK()

  prioritySum = 0
  waitingJobs = 0
  for taskQueueID in taskQueueDict:
    taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
    prioritySum += taskQueueDict[taskQueueID]['Priority']
    waitingJobs += taskQueueDict[taskQueueID]['Jobs']

  self.log.info('Sum of Priorities %s' % prioritySum)

  if waitingJobs == 0:
    self.log.info('No waiting Jobs')
    return S_OK('No waiting Jobs')
  if prioritySum <= 0:
    return S_ERROR('Wrong TaskQueue Priorities')

  # float() avoids integer truncation of the pilot shares
  self.pilotsPerPriority = float(self.am_getOption('pilotsPerIteration')) / prioritySum
  self.pilotsPerJob = float(self.am_getOption('pilotsPerIteration')) / waitingJobs

  self.callBackLock.acquire()
  self.submittedPilots = 0
  self.callBackLock.release()
  self.toSubmitPilots = 0
  waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
  timeLimitToConsider = Time.toString(Time.dateTime() -
                                      Time.hour * self.am_getOption("maxPilotWaitingHours"))

  for taskQueueID in taskQueueDict:
    self.log.verbose('Processing TaskQueue', taskQueueID)

    result = pilotAgentsDB.countPilots({'TaskQueueID': taskQueueID,
                                        'Status': waitingStatusList},
                                       None, timeLimitToConsider)
    if not result['OK']:
      self.log.error('Failed to get the number of waiting pilots', result['Message'])
      waitingPilots = 0
    else:
      waitingPilots = result['Value']
    self.log.verbose('Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots)

    result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID], waitingPilots)

    if result['OK']:
      self.toSubmitPilots += result['Value']

  self.log.info('Number of pilots to be Submitted %s' % self.toSubmitPilots)

  # Now wait until all jobs in the Default ThreadPool are processed
  if 'Default' in self.pools:
    # only for those in the 'Default' thread pool
    self.pools['Default'].processAllResults()

  self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

  return S_OK()
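To make the pilot-share arithmetic above concrete, a small worked sketch with invented numbers:

# Toy values, for illustration only.
pilotsPerIteration = 100.0  # stands in for am_getOption('pilotsPerIteration')
taskQueueDict = {1: {'Priority': 3, 'Jobs': 10},
                 2: {'Priority': 1, 'Jobs': 30}}

prioritySum = sum(tq['Priority'] for tq in taskQueueDict.values())  # 4
waitingJobs = sum(tq['Jobs'] for tq in taskQueueDict.values())      # 40

pilotsPerPriority = pilotsPerIteration / prioritySum  # 25.0 pilots per unit of priority
pilotsPerJob = pilotsPerIteration / waitingJobs       # 2.5 pilots per waiting job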
def createDirectory( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'createDirectory', [path], {} )
def finalizeRequest(self, requestID, jobID):
  """ check request status and perform finalization if necessary
      update the request status and the corresponding job parameter

  :param self: self reference
  :param str requestID: request id
  :param int jobID: job id
  """
  stateServer = RPCClient("WorkloadManagement/JobStateUpdate", useCertificates=True)

  # Checking whether the job status should be updated; if any step fails
  # we return an error so the finalization can be retried later.
  # Check the request state first.
  res = self.getRequestStatus(requestID)
  if not res['OK']:
    self.log.error("finalizeRequest: failed to get request",
                   "request: %s status: %s" % (requestID, res["Message"]))
    return res
  if res["Value"] != "Done":
    return S_ERROR("The request %s isn't 'Done' but '%s', this should never happen, why are we here?" %
                   (requestID, res['Value']))

  # The request is 'Done', let's update the job status. If we fail, we should re-try later
  monitorServer = RPCClient("WorkloadManagement/JobMonitoring", useCertificates=True)
  res = monitorServer.getJobPrimarySummary(int(jobID))
  if not res["OK"]:
    self.log.error("finalizeRequest: Failed to get job status", "JobID: %d" % jobID)
    return S_ERROR("finalizeRequest: Failed to get job %d status" % jobID)
  elif not res['Value']:
    self.log.info("finalizeRequest: job %d does not exist (anymore): finalizing" % jobID)
    return S_OK()
  else:
    jobStatus = res["Value"]["Status"]
    jobMinorStatus = res["Value"]["MinorStatus"]

    # update the job pending request digest in any case since it is modified
    self.log.info("finalizeRequest: Updating request digest for job %d" % jobID)
    digest = self.getDigest(requestID)
    if digest["OK"]:
      digest = digest["Value"]
      self.log.verbose(digest)
      res = stateServer.setJobParameter(jobID, "PendingRequest", digest)
      if not res["OK"]:
        self.log.info("finalizeRequest: Failed to set job %d parameter: %s" % (jobID, res["Message"]))
        return res
    else:
      self.log.error("finalizeRequest: Failed to get request digest for %s: %s" %
                     (requestID, digest["Message"]))

    stateUpdate = None
    if jobStatus == 'Completed':
      # What to do? Depends on what we have in the minorStatus
      if jobMinorStatus == "Pending Requests":
        self.log.info("finalizeRequest: Updating job status for %d to Done/Requests done" % jobID)
        stateUpdate = stateServer.setJobStatus(jobID, "Done", "Requests done", "")
      elif jobMinorStatus == "Application Finished With Errors":
        self.log.info("finalizeRequest: Updating job status for %d to Failed/Requests done" % jobID)
        stateUpdate = stateServer.setJobStatus(jobID, "Failed", "Requests done", "")

    if not stateUpdate:
      self.log.info("finalizeRequest: Updating job minor status for %d to Requests done (status is %s)" %
                    (jobID, jobStatus))
      stateUpdate = stateServer.setJobStatus(jobID, jobStatus, "Requests done", "")

    if not stateUpdate["OK"]:
      self.log.error("finalizeRequest: Failed to set job status",
                     "JobID: %d status: %s" % (jobID, stateUpdate['Message']))
      return stateUpdate

  return S_OK()
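A short usage sketch, assuming `reqClient` is an instance of the client class defining finalizeRequest; the IDs are placeholders. Because the method only touches the job once the request is 'Done', a failed call can simply be retried later:

# Hypothetical IDs; `reqClient` is assumed to be the enclosing client instance.
res = reqClient.finalizeRequest('00012345', 67890)
if not res['OK']:
  gLogger.error('Request finalization failed, will retry later', res['Message'])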
def releaseFile( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'releaseFile', [path], {} )
def removeDirectory( self, path, recursive = False ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'removeDirectory', [path], {'recursive':recursive} )
def prestageFileStatus( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'prestageFileStatus', [path], {} )
def getDirectorySize( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'getDirectorySize', [path], {} )
def getFileMetadata( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'getFileMetadata', [path], {} )
def pinFile( self, path, lifetime = 60 * 60 * 24 ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'pinFile', [path], {'lifetime':lifetime} )
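The wrapper methods above (and getTransportURL / exists further down) all follow the same shape: open an RPCClient to the proxy service and forward the catalog name, method name, positional and keyword arguments through callProxyMethod. A hedged usage sketch, assuming `storage` is an instance of the proxy storage class these wrappers belong to:

# `storage` is assumed to be an instance of the proxy storage class;
# the file path is a placeholder.
res = storage.pinFile( '/vo/user/some/file', lifetime = 6 * 60 * 60 )
if res['OK']:
  gLogger.info( 'pinFile forwarded through callProxyMethod' )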
class MCExtensionAgent(DIRACMCExtensionAgent):
  """ MCExtensionAgent """

  def __init__(self, *args, **kwargs):
    """ c'tor """
    DIRACMCExtensionAgent.__init__(self, *args, **kwargs)

    self.rpcProductionRequest = None
    self.transClient = None

    self.enableFlag = True

    # default values
    self.cpuE = 1
    self.cpuTimeAvg = 200000
    self.cpuNormalizationFactorAvg = 1.0

    # Artificial boost of the number of events requested to be created
    self.extensionFactorBoost = 20  # meaning 20% more than what is calculated

  #############################################################################
  def initialize(self):
    """ Logs some parameters and initializes the clients """
    self.extensionFactorBoost = self.am_getOption('extensionFactorBoost', self.extensionFactorBoost)

    self.rpcProductionRequest = RPCClient('ProductionManagement/ProductionRequest')
    self.transClient = TransformationClient()

    self.log.info('Will consider the following transformation types: %s' % str(self.transformationTypes))
    self.log.info('Will create a maximum of %s tasks per iteration' % self.maxIterationTasks)

    return S_OK()

  #############################################################################
  def execute(self):
    """ The MCExtensionAgent execution method. """

    self.enableFlag = self.am_getOption('EnableFlag', 'True')
    if self.enableFlag != 'True':
      self.log.info("MCExtensionAgent is disabled by configuration option EnableFlag")
      return S_OK('Disabled via CS flag')

    # done every cycle, as they may have changed
    self._getCPUParameters()

    # get the production requests in which we are interested
    productionRequests = self.rpcProductionRequest.getProductionRequestSummary('Active', 'Simulation')
    if productionRequests['OK']:
      productionRequests = productionRequests['Value']
      self.log.info("Requests considered: %s" % ', '.join([str(prod) for prod in productionRequests.keys()]))
    else:
      message = "RPC call to ProductionRequest service failed : %s" % productionRequests['Message']
      self.log.error(message)
      return S_ERROR(message)

    for productionRequestID, productionRequestSummary in productionRequests.items():
      ret = self._checkProductionRequest(productionRequestID, productionRequestSummary)
      if not ret['OK']:
        return ret

    return S_OK()

  #############################################################################
  def _getCPUParameters(self):
    """ Get the CPUTimeAvg and CPUNormalizationFactorAvg from the configuration;
        as a fail-over, the defaults set in the constructor are kept
    """

    op = Operations()
    self.cpuTimeAvg = op.getValue('Transformations/cpuTimeAvg', self.cpuTimeAvg)
    self.log.verbose("cpuTimeAvg = %d" % self.cpuTimeAvg)

    try:
      self.cpuNormalizationFactorAvg = getCPUNormalizationFactorAvg()
      self.log.verbose("cpuNormalizationFactorAvg = %d" % self.cpuNormalizationFactorAvg)
    except RuntimeError:
      self.log.info("Could not get CPUNormalizationFactorAvg from config, defaulting to %d" %
                    self.cpuNormalizationFactorAvg)

  #############################################################################
  def _checkProductionRequest(self, productionRequestID, productionRequestSummary):
    """ Check if a production request needs to be extended, and do so if needed """

    # check if enough events have been produced
    missingEvents = productionRequestSummary['reqTotal'] - productionRequestSummary['bkTotal']
    self.log.info("Missing events for production request %d: %d" % (productionRequestID, missingEvents))
    if productionRequestSummary['bkTotal'] > 0 and missingEvents <= 0:
      message = "Enough events produced for production request %d" % productionRequestID
      self.log.verbose(message)
      return S_OK(message)

    # get the associated productions/transformations progress
    productionsProgress = self.rpcProductionRequest.getProductionProgressList(long(productionRequestID))
    if productionsProgress['OK']:
      productionsProgress = productionsProgress['Value']
    else:
      message = 'Failed to get productions progress : %s' % productionsProgress['Message']
      self.log.error(message)
      return S_ERROR(message)
    productionsProgress = productionsProgress['Rows']
    self.log.verbose("Progress for production request %d: %s" % (productionRequestID, str(productionsProgress)))

    # get the information for the productions/transformations
    productions = []
    simulation = None
    simulationProgress = None
    for productionProgress in productionsProgress:
      productionID = productionProgress['ProductionID']
      production = self.transClient.getTransformation(productionID)
      if not production['OK']:
        message = 'Failed to get information on production %d : %s' % (productionID, production['Message'])
        self.log.error(message)
        return S_ERROR(message)
      production = production['Value']
      productions.append(production)

      # determine which one is the simulation production
      if production['Type'] in self.transformationTypes:
        simulation = production
        simulationID = productionID
        for prodProgress in productionsProgress:
          if prodProgress['ProductionID'] == simulationID:
            simulationProgress = prodProgress
        self.log.info("Progress for the simulation production %d of request %d: %s" %
                      (simulationID, productionRequestID, str(simulationProgress)))

    if simulation is None:
      message = 'Failed to get simulation production for request %d' % productionRequestID
      self.log.error(message)
      return S_ERROR(message)

    if simulation['Status'].lower() != 'idle':
      # the simulation is still producing events
      message = "Simulation for production request %d is not Idle (%s)" % (productionRequestID,
                                                                           simulation['Status'])
      self.log.verbose(message)
      return S_OK(message)

    # Checking how long ago this production became 'Idle'
    res = self.transClient.getTransformationLogging(simulationID)
    if not res['OK']:
      return res
    lastLoggingEntry = res['Value'][-1]
    # total_seconds() rather than .seconds, which wraps at one day
    if ('idle' in lastLoggingEntry['Message'].lower()) and \
       ((datetime.datetime.utcnow() - lastLoggingEntry['MessageDate']).total_seconds() < 900):
      self.log.verbose("Prod %d is in 'Idle' for less than 15 minutes, waiting a bit" % simulationID)
      return S_OK("Prod %d is in 'Idle' for less than 15 minutes, waiting a bit" % simulationID)

    if simulationProgress['BkEvents'] < productionRequestSummary['reqTotal']:
      # the number of events produced by the simulation is of the order of the number of events requested
      # -> there is probably no stripping production, no extension factor necessary
      return self._extendProduction(simulation, 1.0, missingEvents)
    else:
      # the number of events produced by the simulation is more than the number of events requested,
      # yet events are missing
      # -> there is probably a stripping production, an extension factor is needed to account
      #    for stripped events
      # some events may still be processed (e.g. merged), so wait until all the productions are idle
      if all(production['Status'].lower() == 'idle' for production in productions):
        try:
          extensionFactor = float(simulationProgress['BkEvents']) / float(productionRequestSummary['bkTotal'])
          return self._extendProduction(simulation, extensionFactor, missingEvents)
        except ZeroDivisionError:
          return S_OK()
      else:
        return S_OK()

  #############################################################################
  def _extendProduction(self, production, extensionFactor, eventsNeeded):
    """ Extends a production to produce eventsNeeded*extensionFactor more events. """
    productionID = production['TransformationID']

    cpuEProd = getProductionParameterValue(production['Body'], 'CPUe')
    if cpuEProd is None:
      self.log.warn("CPUe for transformation %d is not set, skipping for now" % productionID)
      return S_OK()
    cpuE = int(round(float(cpuEProd)))

    self.log.info("Extending production %d, that is still missing %d events. "
                  "Extension factor = %.2f, boost = %d%%" %
                  (productionID, eventsNeeded, extensionFactor, self.extensionFactorBoost))

    eventsToProduce = eventsNeeded * extensionFactor * (float(100 + self.extensionFactorBoost) / 100)

    max_e = getEventsToProduce(cpuE, self.cpuTimeAvg, self.cpuNormalizationFactorAvg)
    numberOfTasks = int(math.ceil(float(eventsToProduce) / float(max_e)))
    self.log.info("Extending production %d by %d tasks" % (productionID, numberOfTasks))

    # extend the transformation by the determined number of tasks
    res = self.transClient.extendTransformation(productionID, numberOfTasks)
    if not res['OK']:
      message = 'Failed to extend transformation %d : %s' % (productionID, res['Message'])
      self.log.error(message)
      return S_ERROR(message)
    else:
      message = "Successfully extended transformation %d by %d tasks" % (productionID, numberOfTasks)
      self.log.info(message)

      res = self.transClient.setTransformationParameter(productionID, 'Status', 'Active')
      if not res['OK']:
        message = 'Failed to set transformation %d to Active' % productionID
        self.log.error(message)
        return S_ERROR(message)

    return S_OK(message)
def getTransportURL( self, path, protocols = False ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'getTransportURL', [path], {'protocols':protocols} )
class PilotsWMSCommand(Command):

  def __init__(self, args=None, clients=None):

    super(PilotsWMSCommand, self).__init__(args, clients)

    if 'WMSAdministrator' in self.apis:
      self.wmsAdmin = self.apis['WMSAdministrator']
    else:
      self.wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

  def doCommand(self):
    """
    Returns simple pilots efficiency

    :attr:`args`:
      - args[0]: string - should be a ValidElement

      - args[1]: string - should be the name of the ValidElement

    returns:
      {
        'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad'
      }
    """

    if 'element' not in self.args:
      return self.returnERROR(S_ERROR('element is missing'))
    element = self.args['element']

    if 'siteName' not in self.args:
      return self.returnERROR(S_ERROR('siteName is missing'))
    siteName = self.args['siteName']

    # If siteName is None, we take all sites
    if siteName is None:
      siteName = CSHelpers.getSites()
      if not siteName['OK']:
        return self.returnERROR(siteName)
      siteName = siteName['Value']

    if element == 'Site':
      results = self.wmsAdmin.getPilotSummaryWeb({'GridSite': siteName}, [], 0, 300)
    elif element == 'Resource':
      results = self.wmsAdmin.getPilotSummaryWeb({'ExpandSite': siteName}, [], 0, 300)
    else:
      return self.returnERROR(S_ERROR('%s is a wrong element' % element))

    if not results['OK']:
      return self.returnERROR(results)
    results = results['Value']

    if 'ParameterNames' not in results:
      return self.returnERROR(S_ERROR('Malformed result dictionary'))
    params = results['ParameterNames']

    if 'Records' not in results:
      return self.returnERROR(S_ERROR('Malformed result dictionary'))
    records = results['Records']

    pilotResults = []

    for record in records:

      pilotDict = dict(zip(params, record))
      try:
        pilotDict['PilotsPerJob'] = float(pilotDict['PilotsPerJob'])
        pilotDict['PilotsJobEff'] = float(pilotDict['PilotsJobEff'])
      except KeyError, e:
        return self.returnERROR(S_ERROR(e))
      except ValueError, e:
        return self.returnERROR(S_ERROR(e))

      pilotResults.append(pilotDict)

    return S_OK(pilotResults)
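A minimal invocation sketch for PilotsWMSCommand; the args keys are exactly the two that doCommand checks for, and the site name is a placeholder:

# Placeholder site name; pass siteName=None to query all sites.
command = PilotsWMSCommand(args={'element': 'Site', 'siteName': 'LCG.Example.org'})
result = command.doCommand()
if result['OK']:
  for pilotDict in result['Value']:
    gLogger.info('%s: eff %s' % (pilotDict.get('Site'), pilotDict['PilotsJobEff']))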
def exists( self, path ):
  client = RPCClient( self.url )
  return client.callProxyMethod( self.name, 'exists', [path], {} )
class PilotCommand(Command):
  '''
    Pilot "master" Command.
  '''

  def __init__(self, args=None, clients=None):

    super(PilotCommand, self).__init__(args, clients)

    if 'WMSAdministrator' in self.apis:
      self.wmsAdmin = self.apis['WMSAdministrator']
    else:
      self.wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')

    if 'ResourceManagementClient' in self.apis:
      self.rmClient = self.apis['ResourceManagementClient']
    else:
      self.rmClient = ResourceManagementClient()

  def _storeCommand(self, result):
    '''
      Stores the results of the doNew method in the database.
    '''

    for pilotDict in result:

      resQuery = self.rmClient.addOrModifyPilotCache(pilotDict['Site'], pilotDict['CE'],
                                                     pilotDict['PilotsPerJob'],
                                                     pilotDict['PilotJobEff'], pilotDict['Status'])
      if not resQuery['OK']:
        return resQuery

    return S_OK()

  def _prepareCommand(self):
    '''
      PilotCommand requires two arguments:
      - name    : <str>
      - element : <str>, either 'Site' or 'Resource'
    '''

    if 'name' not in self.args:
      return S_ERROR('"name" not found in self.args')
    name = self.args['name']

    if 'element' not in self.args:
      return S_ERROR('element is missing')
    element = self.args['element']

    if element not in ['Site', 'Resource']:
      return S_ERROR('"%s" is not Site nor Resource' % element)

    return S_OK((element, name))

  def doNew(self, masterParams=None):

    if masterParams is not None:
      element, name = masterParams
    else:
      params = self._prepareCommand()
      if not params['OK']:
        return params
      element, name = params['Value']

    wmsDict = {}

    if element == 'Site':
      wmsDict = {'GridSite': name}
    elif element == 'Resource':
      wmsDict = {'ExpandSite': name}
    else:
      # You should never see this error
      return S_ERROR('"%s" is not Site nor Resource' % element)

    wmsResults = self.wmsAdmin.getPilotSummaryWeb(wmsDict, [], 0, 0)

    if not wmsResults['OK']:
      return wmsResults
    wmsResults = wmsResults['Value']

    if 'ParameterNames' not in wmsResults:
      return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
    params = wmsResults['ParameterNames']

    if 'Records' not in wmsResults:
      return S_ERROR('Wrong formed result dictionary, missing "Records"')
    records = wmsResults['Records']

    uniformResult = []

    for record in records:

      # This returns a dictionary with the following keys:
      # 'Site', 'CE', 'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running',
      # 'Done', 'Aborted', 'Done_Empty', 'Aborted_Hour', 'Total', 'PilotsPerJob',
      # 'PilotJobEff', 'Status', 'InMask'
      pilotDict = dict(zip(params, record))

      pilotDict['PilotsPerJob'] = float(pilotDict['PilotsPerJob'])
      pilotDict['PilotJobEff'] = float(pilotDict['PilotJobEff'])

      uniformResult.append(pilotDict)

    storeRes = self._storeCommand(uniformResult)
    if not storeRes['OK']:
      return storeRes

    return S_OK(uniformResult)

  def doCache(self):

    params = self._prepareCommand()
    if not params['OK']:
      return params
    element, name = params['Value']

    if element == 'Site':
      # WMS returns Site entries with CE = 'Multiple'
      site, ce = name, 'Multiple'
    elif element == 'Resource':
      site, ce = None, name
    else:
      # You should never see this error
      return S_ERROR('"%s" is not Site nor Resource' % element)

    result = self.rmClient.selectPilotCache(site, ce)
    if result['OK']:
      result = S_OK([dict(zip(result['Columns'], res)) for res in result['Value']])

    return result

  def doMaster(self):

    siteNames = CSHelpers.getSites()
    if not siteNames['OK']:
      return siteNames
    siteNames = siteNames['Value']

    ces = CSHelpers.getComputingElements()
    if not ces['OK']:
      return ces
    ces = ces['Value']

    pilotResults = self.doNew(('Site', siteNames))
    if not pilotResults['OK']:
      self.metrics['failed'].append(pilotResults['Message'])

    pilotResults = self.doNew(('Resource', ces))
    if not pilotResults['OK']:
      self.metrics['failed'].append(pilotResults['Message'])

    return S_OK(self.metrics)
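A hedged sketch of driving PilotCommand: doMaster refreshes the pilot cache for all sites and computing elements, while doCache reads back the stored rows for the element given in args (names are placeholders):

# Placeholder names; clients are built internally when `clients` is not given.
command = PilotCommand(args={'element': 'Site', 'name': 'LCG.Example.org'})

res = command.doMaster()  # fills the pilot cache for all sites/CEs
if res['OK'] and res['Value']['failed']:
  gLogger.warn('doMaster reported failures: %s' % res['Value']['failed'])

res = command.doCache()   # cached rows for this site (CE = 'Multiple')
if res['OK']:
  for row in res['Value']:
    gLogger.info(str(row))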
def setUp(self):
  self.publisher = RPCClient("ResourceStatus/Publisher")
  gLogger.setLevel('DEBUG')
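A minimal follow-on test, assuming this setUp belongs to a unittest.TestCase. It uses only the generic DISET ping() call as a reachability check for the Publisher service; assertions on specific service methods are left out, since the service interface is not shown here:

def test_ping(self):
  # ping() is the generic DISET health check available on every service
  res = self.publisher.ping()
  self.assertTrue(res['OK'])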