def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass, for every queue in ``self.queueDict``: select the pilots
    owned by ``self.pilotDN`` / ``self.pilotGroup`` whose status is in
    ``TRANSIENT_PILOT_STATUS``, ask the CE for their current status and
    record any change in the pilot DB.  A pilot seen as "Unknown" by the CE
    is first marked "Unknown" and only declared "Aborted" once it has stayed
    "Unknown" for an hour.  When a pilot reaches a final status and
    ``self.getOutput`` is set, its output is fetched from the CE and stored.
    Queues with at least one newly aborted pilot get their
    ``self.failedQueues`` counter incremented.

    Second pass, for every queue: retrieve and store the output of pilots in
    a final status whose output is not yet in the DB (if ``self.getOutput``)
    and send accounting for final pilots not yet accounted (if
    ``self.sendAccounting``).

    :return: S_OK(), or the error result of the proxy manager when a pilot
             proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # The CE is queried with "<reference>:::<stamp>" identifiers; as soon
        # as one pilot has no stamp, plain references are used for all pilots.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 23400)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 23300)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600 * second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if not result['OK']:
                    # Report the failure instead of silently dropping it
                    self.log.error('Failed to update pilot status', '%s: %s' % (pRef, result['Message']))
                if newStatus == "Aborted":
                    abortedPilots += 1
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result['OK']:
                                    self.log.error('Failed to store pilot output', result['Message'])
                            else:
                                self.log.warn('Empty pilot output not stored to PilotDB')

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dictionary (see the first pass); a
        # non-empty dictionary is always truthy, so its 'OK' flag has to be
        # tested explicitly or the proxy would never be renewed here.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the freshly obtained proxy, otherwise the stale one would be re-installed
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): a queue without pilots awaiting output retrieval is
        # skipped entirely, so its accounting is not sent in this cycle either.
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting', result['Message'])

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass, for every queue in ``self.queueDict``: select the pilots
    owned by ``self.pilotDN`` / ``self.pilotGroup`` whose status is in
    ``TRANSIENT_PILOT_STATUS``, ask the CE for their current status and
    record any change in the pilot DB.  A non-final pilot reported as
    "Unknown" by the CE is declared "Aborted".  When a pilot reaches a final
    status and ``self.getOutput`` is set, its output is fetched from the CE
    and stored.

    Second pass, for every queue: retrieve and store the output of pilots in
    a final status whose output is not yet in the DB (if ``self.getOutput``)
    and send accounting for final pilots not yet accounted (if
    ``self.sendAccounting``).

    :return: S_OK(), or the error result of the proxy manager when a pilot
             proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'Status': TRANSIENT_PILOT_STATUS,
            'OwnerDN': self.pilotDN,
            'OwnerGroup': self.pilotGroup
        })
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # The CE is queried with "<reference>:::<stamp>" identifiers; as soon
        # as one pilot has no stamp, plain references are used for all pilots.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if not result['OK']:
                    # Report the failure instead of silently dropping it
                    self.log.error('Failed to update pilot status', '%s: %s' % (pRef, result['Message']))
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result['OK']:
                                    self.log.error('Failed to store pilot output', result['Message'])
                            else:
                                self.log.warn('Empty pilot output not stored to PilotDB')

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dictionary (see the first pass); a
        # non-empty dictionary is always truthy, so its 'OK' flag has to be
        # tested explicitly or the proxy would never be renewed here.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the freshly obtained proxy, otherwise the stale one would be re-installed
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'OutputReady': 'False',
            'Status': FINAL_PILOT_STATUS
        })
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): a queue without pilots awaiting output retrieval is
        # skipped entirely, so its accounting is not sent in this cycle either.
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'AccountingSent': 'False',
                'Status': FINAL_PILOT_STATUS
            })
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting', result['Message'])

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass, for every queue in ``self.queueDict``: select the pilots
    owned by ``self.pilotDN`` / ``self.pilotGroup`` whose status is in
    ``TRANSIENT_PILOT_STATUS``, ask the CE for their current status and
    record any change in the pilot DB.  A non-final pilot reported as
    "Unknown" by the CE is declared "Aborted".  When a pilot reaches a final
    status and ``self.getOutput`` is set, its output is fetched from the CE
    and stored.

    Second pass, for every queue: retrieve and store the output of pilots in
    a final status whose output is not yet in the DB (if ``self.getOutput``)
    and send accounting for final pilots not yet accounted (if
    ``self.sendAccounting``).

    :return: S_OK(), or the error result of the proxy manager when a pilot
             proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # The CE is queried with "<reference>:::<stamp>" identifiers; as soon
        # as one pilot has no stamp, plain references are used for all pilots.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if not result['OK']:
                    # Report the failure instead of silently dropping it
                    self.log.error('Failed to update pilot status', '%s: %s' % (pRef, result['Message']))
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result['OK']:
                                    self.log.error('Failed to store pilot output', result['Message'])
                            else:
                                self.log.warn('Empty pilot output not stored to PilotDB')

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dictionary (see the first pass); a
        # non-empty dictionary is always truthy, so its 'OK' flag has to be
        # tested explicitly or the proxy would never be renewed here.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the freshly obtained proxy, otherwise the stale one would be re-installed
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): a queue without pilots awaiting output retrieval is
        # skipped entirely, so its accounting is not sent in this cycle either.
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting', result['Message'])

    return S_OK()
def getGridJobOutput(pilotReference):
    """Get the pilot job standard output and standard error files for the Grid job reference.

    The output already stored in the pilot DB is returned when it is not
    empty.  Otherwise a computing element object is instantiated for the
    pilot's queue, the output is fetched from it with the pilot owner's
    proxy, stored in the pilot DB (when the standard output is not empty)
    and returned.

    :param str pilotReference: a grid (job) pilot reference
    :return: S_OK(dict) with the keys StdOut, StdErr, OwnerDN, OwnerGroup and
             FileList (always an empty list), or an error result
    """
    result = pilotAgentsDB.getPilotInfo(pilotReference)
    if not result['OK'] or not result['Value']:
        return S_ERROR('Failed to get info for pilot ' + pilotReference)

    pilotDict = result['Value'][pilotReference]
    owner = pilotDict['OwnerDN']
    group = pilotDict['OwnerGroup']

    # FIXME: What if the OutputSandBox is not StdOut and StdErr, what do we do with other files?
    result = pilotAgentsDB.getPilotOutput(pilotReference)
    if result['OK']:
        stdout = result['Value']['StdOut']
        error = result['Value']['StdErr']
        if stdout or error:
            resultDict = {}
            resultDict['StdOut'] = stdout
            resultDict['StdErr'] = error
            resultDict['OwnerDN'] = owner
            resultDict['OwnerGroup'] = group
            resultDict['FileList'] = []
            return S_OK(resultDict)
        else:
            gLogger.warn('Empty pilot output found', 'for %s' % pilotReference)

    # Instantiate the appropriate CE
    ceFactory = ComputingElementFactory()
    result = getQueue(pilotDict['GridSite'], pilotDict['DestinationSite'], pilotDict['Queue'])
    if not result['OK']:
        return result
    queueDict = result['Value']
    gridEnv = getGridEnv()
    queueDict['GridEnv'] = gridEnv
    workDir = mkdtemp()
    queueDict['WorkingDirectory'] = workDir

    # Everything below uses the temporary working directory: the single
    # finally clause removes it on every exit path, including the proxy
    # failure return and unexpected exceptions, which used to leak it.
    try:
        result = ceFactory.getCE(pilotDict['GridType'], pilotDict['DestinationSite'], queueDict)
        if not result['OK']:
            return result
        ce = result['Value']

        groupVOMS = getGroupOption(group, 'VOMSRole', group)
        result = gProxyManager.getPilotProxyFromVOMSGroup(owner, groupVOMS)
        if not result['OK']:
            gLogger.error('Could not get proxy:',
                          'User "%s" Group "%s" : %s' % (owner, groupVOMS, result['Message']))
            return S_ERROR("Failed to get the pilot's owner proxy")
        proxy = result['Value']
        ce.setProxy(proxy)

        # The CE identifies the pilot as "<reference>:::<stamp>" when a stamp exists
        pilotStamp = pilotDict['PilotStamp']
        pRef = pilotReference
        if pilotStamp:
            pRef = pRef + ':::' + pilotStamp
        result = ce.getJobOutput(pRef)
        if not result['OK']:
            return result

        stdout, error = result['Value']
        if stdout:
            result = pilotAgentsDB.storePilotOutput(pilotReference, stdout, error)
            if not result['OK']:
                # Storing is best effort: the retrieved output is still returned
                gLogger.error('Failed to store pilot output:', result['Message'])

        resultDict = {}
        resultDict['StdOut'] = stdout
        resultDict['StdErr'] = error
        resultDict['OwnerDN'] = owner
        resultDict['OwnerGroup'] = group
        resultDict['FileList'] = []
        return S_OK(resultDict)
    finally:
        # A cleanup problem must not mask the result being returned
        shutil.rmtree(workDir, ignore_errors=True)
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass, for every queue in ``self.queueDict``: select the pilots
    owned by ``self.pilotDN`` / ``self.pilotGroup`` whose status is in
    ``TRANSIENT_PILOT_STATUS``, ask the CE for their current status and
    record any change in the pilot DB.  A pilot seen as "Unknown" by the CE
    is first marked "Unknown" and only declared "Aborted" once it has stayed
    "Unknown" for an hour.  When a pilot reaches a final status and
    ``self.getOutput`` is set, its output is fetched from the CE and stored.
    Queues with at least one newly aborted pilot get their
    ``self.failedQueues`` counter incremented.

    Second pass, for every queue: retrieve and store the output of pilots in
    a final status whose output is not yet in the DB (if ``self.getOutput``)
    and send accounting for final pilots not yet accounted (if
    ``self.sendAccounting``).

    :return: S_OK(), or the error result of the proxy manager when a pilot
             proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # The CE is queried with "<reference>:::<stamp>" identifiers; as soon
        # as one pilot has no stamp, plain references are used for all pilots.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        # This proxy is used for checking the pilot status and renewals
        # We really need at least a few hours otherwise the renewed
        # proxy may expire before we check again...
        result = ce.isProxyValid(3 * 3600)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 23400)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 23300)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600 * second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if not result['OK']:
                    # Report the failure instead of silently dropping it
                    self.log.error('Failed to update pilot status', '%s: %s' % (pRef, result['Message']))
                if newStatus == "Aborted":
                    abortedPilots += 1
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result['OK']:
                                    self.log.error('Failed to store pilot output', result['Message'])
                            else:
                                self.log.warn('Empty pilot output not stored to PilotDB')

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result dictionary (see the first pass); a
        # non-empty dictionary is always truthy, so its 'OK' flag has to be
        # tested explicitly or the proxy would never be renewed here.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Keep the freshly obtained proxy, otherwise the stale one would be re-installed
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): a queue without pilots awaiting output retrieval is
        # skipped entirely, so its accounting is not sent in this cycle either.
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting', result['Message'])

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass, for every queue in ``self.queueDict``: select the pilots
    owned by ``self.pilotDN`` / ``self.pilotGroup`` whose status is in
    ``TRANSIENT_PILOT_STATUS``, ask the CE for their current status and
    record any change in the pilot DB.  A non-final pilot reported as
    "Unknown" by the CE is declared "Aborted".  When a pilot reaches a final
    status and ``self.getOutput`` is set, its output is fetched from the CE
    and stored.

    Second pass, for every queue: retrieve and store the output of pilots in
    a final status whose output is not yet in the DB (if ``self.getOutput``)
    and send accounting for final pilots not yet accounted (if
    ``self.sendAccounting``).

    :return: S_OK(), or the error result of the proxy manager when a pilot
             proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]

        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "Status": TRANSIENT_PILOT_STATUS,
                "OwnerDN": self.pilotDN,
                "OwnerGroup": self.pilotGroup,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots: %s" % result["Message"])
            continue
        pilotRefs = result["Value"]
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]

        # The CE is queried with "<reference>:::<stamp>" identifiers; as soon
        # as one pilot has no stamp, plain references are used for all pilots.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]["PilotStamp"]:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
            continue
        pilotCEDict = result["Value"]

        for pRef in pilotRefs:
            newStatus = ""
            oldStatus = pilotDict[pRef]["Status"]
            ceStatus = pilotCEDict[pRef]

            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = "Aborted"
            elif ceStatus != "Unknown":
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")
                if not result["OK"]:
                    # Report the failure instead of silently dropping it
                    self.log.error("Failed to update pilot status", "%s: %s" % (pRef, result["Message"]))
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                        self.log.info("Retrieving output for pilot %s" % pRef)
                        pilotStamp = pilotDict[pRef]["PilotStamp"]
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ":::" + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result["OK"]:
                            self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                        else:
                            output, error = result["Value"]
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result["OK"]:
                                    self.log.error("Failed to store pilot output", result["Message"])
                            else:
                                self.log.warn("Empty pilot output not stored to PilotDB")

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]

        # isProxyValid() returns a result dictionary (see the first pass); a
        # non-empty dictionary is always truthy, so its "OK" flag has to be
        # tested explicitly or the proxy would never be renewed here.
        result = ce.isProxyValid(120)
        if not result["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result["OK"]:
                return result
            # Keep the freshly obtained proxy, otherwise the stale one would be re-installed
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]

        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "OutputReady": "False",
                "Status": FINAL_PILOT_STATUS,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots", result["Message"])
            continue
        pilotRefs = result["Value"]
        # NOTE(review): a queue without pilots awaiting output retrieval is
        # skipped entirely, so its accounting is not sent in this cycle either.
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info("Retrieving output for pilot %s" % pRef)
                pilotStamp = pilotDict[pRef]["PilotStamp"]
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ":::" + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result["OK"]:
                    self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                else:
                    output, error = result["Value"]
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result["OK"]:
                        self.log.error("Failed to store pilot output", result["Message"])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "AccountingSent": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]
            result = self.sendPilotAccounting(pilotDict)
            if not result["OK"]:
                self.log.error("Failed to send pilot agent accounting", result["Message"])

    return S_OK()