Example #1
0
    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy",
                                              self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes +
                                              self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta",
                                              self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive periof in days
        self.archiveAfter = self.am_getOption("ArchiveAfter",
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()
        # # job monitoring client
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()
Example #2
0
def checkJobStateTransition(jobID,
                            candidateState,
                            currentStatus=None,
                            jobMonitoringClient=None):
    """Utility to check if a job state transition is allowed"""
    if not currentStatus:
        if not jobMonitoringClient:
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient

            jobMonitoringClient = JobMonitoringClient()

        res = jobMonitoringClient.getJobsStatus(jobID)
        if not res["OK"]:
            return res
        try:
            currentStatus = res["Value"][jobID]["Status"]
        except KeyError:
            return S_ERROR("Job does not exist")

    res = JobsStateMachine(currentStatus).getNextState(candidateState)
    if not res["OK"]:
        return res

    # If the JobsStateMachine does not accept the candidate, return an ERROR
    if candidateState != res["Value"]:
        gLogger.error(
            "Job Status Error",
            "%s can't move from %s to %s" %
            (jobID, currentStatus, candidateState),
        )
        return S_ERROR("Job state transition not allowed")
    return S_OK()
    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        self._tornadoPort = "8443"
        self.errors = list()
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn(
        )
Example #4
0
  def test_ParametricChain(self):
    """ This test will submit a parametric job which should generate 3 actual jobs
    """
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result['OK'])
    jobIDList = result['Value']
    self.assertEqual(len(jobIDList), 3)

    result = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(result['OK'])
    jobNames = [result['Value'][jobID]['JobName'] for jobID in result['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    for jobID in jobIDList:
      result = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
      self.assertTrue(result['OK'])

    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result['OK'])

    for jobID in jobIDList:
      result = jobMonitor.getJobStatus(jobID)
      self.assertTrue(result['OK'])
      self.assertEqual(result['Value'], 'Deleted')
Example #5
0
  def test_ParametricChain(self):
    """ This test will submit a parametric job which should generate 3 actual jobs
    """
    wmsClient = WMSClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result['OK'])
    jobIDList = result['Value']
    self.assertEqual(len(jobIDList), 3)

    result = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(result['OK'])
    jobNames = [result['Value'][jobID]['JobName'] for jobID in result['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    for jobID in jobIDList:
      result = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
      self.assertTrue(result['OK'])

    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result['OK'])

    for jobID in jobIDList:
      result = jobMonitor.getJobStatus(jobID)
      self.assertTrue(result['OK'])
      self.assertEqual(result['Value'], 'Deleted')
Example #6
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None,
                 destinationPlugin=None,
                 ownerDN=None,
                 ownerGroup=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = True if (bool(ownerDN)
                                   and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                'Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        self.destinationPlugin_o = None

        self.outputDataModule_o = None
Example #7
0
 def initialize(self):
     """ Initialize the agent.
 """
     self.am_setOption("PollingTime", 60)
     self.ovc = OverlaySystemClient()
     self.jobmon = JobMonitoringClient()
     return S_OK()
Example #8
0
  def execute(self):
    """ Run this.
    """
    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      LOG.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
      return S_OK('Workflow status is not OK')
    result = self.resolveInputVariables()
    if not result['OK']:
      LOG.error("Failed to resolve input parameters:", result["Message"])
      return result
    if not self.srmfiles:
      LOG.error('Files txt was not found correctly: %s' % self.srmfiles)
      return S_ERROR('Files txt was not found correctly')
    
    if not isinstance( self.files[0], dict ):
      LOG.error('Files were not found correctly: %s' % self.files)
      return S_ERROR('Files were not found correctly')

    ##Now need to check that there are not that many concurrent jobs getting the overlay at the same time
    max_concurrent_running = self.ops.getValue('/GetSRM/MaxConcurrentRunning', 100)
    error_count = 0
    while 1:
      if error_count > 10 :
        LOG.error('JobDB Content does not return expected dictionary')
        return S_ERROR('Failed to get number of concurrent overlay jobs')
      jobMonitor = JobMonitoringClient()
      res = jobMonitor.getCurrentJobCounters({'ApplicationStatus':'Downloading SRM files'})
      if not res['OK']:
        error_count += 1 
        time.sleep(60)
        continue
      running = 0
      if 'Running' in res['Value']:
        running = res['Value']['Running']
      if running < max_concurrent_running:
        break
      else:
        time.sleep(60)        

    self.setApplicationStatus('Downloading SRM files')
    for filed in self.files:
      if 'file' not in filed or 'site' not in filed:
        LOG.error('Dictionnary does not contain correct keys')
        return S_ERROR('Dictionnary does not contain correct keys')
      start = os.getcwd()
      downloadDir = tempfile.mkdtemp(prefix = 'InputData_%s' % (self.counter), dir = start)
      os.chdir(downloadDir)
      storageElement = StorageElement( filed['site'] )
      result = storageElement.getFile( filed['file'] )
      if result['Value']['Failed']:
        result = storageElement.getFile( filed['file'] )
      os.chdir(start)
      if result['Value']['Failed']:
        LOG.error("Failed to get the file from storage:", result['Value']['Failed'])
        return result
      self.counter += 1
      
       
    return S_OK()
Example #9
0
  def deleteJobOversizedSandbox(self, jobIDList):
    """ Delete the job oversized sandbox files from storage elements
    """

    failed = {}
    successful = {}

    result = JobMonitoringClient().getJobParameters(jobIDList, 'OutputSandboxLFN')
    if not result['OK']:
      return result
    osLFNList = result['Value']
    if not osLFNList:
      return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now
    for jobID, outputSandboxLFNdict in osLFNList.iteritems():
      lfn = outputSandboxLFNdict['OutputSandboxLFN']
      result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful': successful, 'Failed': failed}
    return S_OK(result)
Example #10
0
  def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    self.errors = list()
    self.accounting = defaultdict(dict)

    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()
Example #11
0
 def initialize(self):
   """ Initialize the agent.
   """
   self.am_setOption( "PollingTime", 60 )
   self.ovc = OverlaySystemClient()
   self.jobmon = JobMonitoringClient()
   return S_OK()
Example #12
0
    def test_ParametricChain(self):
        """This test will submit a parametric job which should generate 3 actual jobs"""
        wmsClient = WMSClient()
        jobStateUpdate = JobStateUpdateClient()
        jobMonitor = JobMonitoringClient()

        # create the job
        job = parametricJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        jobIDList = res["Value"]
        self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

        res = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
        self.assertTrue(res["OK"], res.get("Message"))
        jobNames = [res["Value"][jobID]["JobName"] for jobID in res["Value"]]
        self.assertEqual(
            set(jobNames),
            set(["parametric_helloWorld_%s" % nJob for nJob in range(3)]))

        for jobID in jobIDList:
            res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING,
                                              "checking", "source")
            self.assertTrue(res["OK"], res.get("Message"))

        res = wmsClient.deleteJob(jobIDList)
        self.assertTrue(res["OK"], res.get("Message"))
        print(res)

        for jobID in jobIDList:
            res = jobMonitor.getJobsStatus(jobID)
            self.assertTrue(res["OK"], res.get("Message"))
            self.assertEqual(res["Value"][jobID]["Status"],
                             JobStatus.DELETED,
                             msg="Got %s" % str(res["Value"]))
Example #13
0
  def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
               outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
               ownerDN=None, ownerGroup=None):
    """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

    if not logger:
      logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False
    if not submissionClient:
      self.submissionClient = WMSClient(useCertificates=useCertificates,
                                        delegatedDN=ownerDN,
                                        delegatedGroup=ownerGroup)
    else:
      self.submissionClient = submissionClient

    if not jobMonitoringClient:
      self.jobMonitoringClient = JobMonitoringClient()
    else:
      self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
      self.jobClass = Job
    else:
      self.jobClass = jobClass

    if not opsH:
      self.opsH = Operations()
    else:
      self.opsH = opsH

    if not outputDataModule:
      self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
      self.outputDataModule = outputDataModule

    if not destinationPlugin:
      self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    else:
      self.destinationPlugin = destinationPlugin

    self.destinationPlugin_o = None

    self.outputDataModule_o = None
Example #14
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        if not submissionClient:
            from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
            self.submissionClient = WMSClient()
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not outputDataModule:
            self.outputDataModule = gConfig.getValue(
                "/DIRAC/VOPolicy/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not jobClass:
            from DIRAC.Interfaces.API.Job import Job
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
            self.opsH = Operations()
        else:
            self.opsH = opsH
Example #15
0
    def deleteJobOversizedSandbox(self, jobIDList):
        """ Delete the job oversized sandbox files from storage elements
    """

        failed = {}
        successful = {}

        lfnDict = {}
        for jobID in jobIDList:
            result = JobMonitoringClient().getJobParameter(
                jobID, 'OutputSandboxLFN')
            if result['OK']:
                lfn = result['Value'].get('OutputSandboxLFN')
                if lfn:
                    lfnDict[lfn] = jobID
                else:
                    successful[jobID] = 'No oversized sandbox found'
            else:
                gLogger.error('Error interrogating JobDB: %s' %
                              result['Message'])

        if not lfnDict:
            return S_OK({'Successful': successful, 'Failed': failed})

        # Schedule removal of the LFNs now

        for lfn, jobID in lfnDict.items():
            result = self.jobDB.getJobAttributes(jobID,
                                                 ['OwnerDN', 'OwnerGroup'])
            if not result['OK']:
                failed[jobID] = lfn
                continue
            if not result['Value']:
                failed[jobID] = lfn
                continue

            ownerDN = result['Value']['OwnerDN']
            ownerGroup = result['Value']['OwnerGroup']
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result['OK']:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        result = {'Successful': successful, 'Failed': failed}
        return S_OK(result)
Example #16
0
    def deleteJobOversizedSandbox(self, jobIDList):
        """
        Deletes the job oversized sandbox files from storage elements.
        Creates a request in RMS if not immediately possible.

        :param list jobIDList: list of job IDs
        :returns: S_OK/S_ERROR
        """

        failed = {}
        successful = {}

        result = JobMonitoringClient().getJobParameters(
            jobIDList, ["OutputSandboxLFN"])
        if not result["OK"]:
            return result
        osLFNDict = result["Value"]
        if not osLFNDict:
            return S_OK({"Successful": successful, "Failed": failed})
        osLFNDict = dict(osLFN for osLFN in osLFNDict.items() if osLFN[1])

        self.log.verbose("Deleting oversized sandboxes", osLFNDict)
        # Schedule removal of the LFNs now
        for jobID, outputSandboxLFNdict in osLFNDict.items(
        ):  # can be an iterator
            lfn = outputSandboxLFNdict["OutputSandboxLFN"]
            result = self.jobDB.getJobAttributes(jobID,
                                                 ["OwnerDN", "OwnerGroup"])
            if not result["OK"]:
                failed[jobID] = lfn
                continue
            if not result["Value"]:
                failed[jobID] = lfn
                continue

            ownerDN = result["Value"]["OwnerDN"]
            ownerGroup = result["Value"]["OwnerGroup"]
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result["OK"]:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        result = {"Successful": successful, "Failed": failed}
        return S_OK(result)
Example #17
0
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        if not submissionClient:
            self.submissionClient = WMSClient()
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule
Example #18
0
class ResetCounters(AgentModule):
    """ Reset the number of jobs at all sites: some sites are not updated properly, so 
  once in a while it's needed to restore the correct number of jobs.
  It does not need to be exact, but enough to clear some of the jobs.
  """
    def initialize(self):
        """ Initialize the agent.
    """
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """ This is called by the Agent Reactor
    """
        res = self.ovc.getSites()
        if not res['OK']:
            return res
        sitedict = {}
        sites = res['Value']
        gLogger.info("Will update info for sites %s" % sites)
        for site in sites:
            attribdict = {
                "Site": site,
                "ApplicationStatus": 'Getting overlay files'
            }
            res = self.jobmon.getCurrentJobCounters(attribdict)
            if not res['OK']:
                continue
            if 'Running' in res['Value']:
                sitedict[site] = res['Value']['Running']
            else:
                sitedict[site] = 0
        gLogger.info("Setting new values %s" % sitedict)
        res = self.ovc.setJobsAtSites(sitedict)
        if not res['OK']:
            gLogger.error(res['Message'])
            return res

        return S_OK()
Example #19
0
  def __getJobPilotStatus(self, jobID):
    """ Get the job pilot status
    """
    result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not result['OK']:
      return result
    pilotReference = result['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
      # There is no pilot reference, hence its status is unknown
      return S_OK('NoPilot')

    result = PilotManagerClient().getPilotInfo(pilotReference)
    if not result['OK']:
      if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
        self.log.warn("No pilot found", "for job %d: %s" % (jobID, result['Message']))
        return S_OK('NoPilot')
      self.log.error('Failed to get pilot information',
                     'for job %d: %s' % (jobID, result['Message']))
      return result
    pilotStatus = result['Value'][pilotReference]['Status']

    return S_OK(pilotStatus)
Example #20
0
class ResetCounters ( AgentModule ):
  """ Reset the number of jobs at all sites: some sites are not updated properly, so 
  once in a while it's needed to restore the correct number of jobs.
  It does not need to be exact, but enough to clear some of the jobs.
  """
  def initialize(self):
    """ Initialize the agent.
    """
    self.am_setOption( "PollingTime", 60 )
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()
    return S_OK()
  
  def execute(self):
    """ This is called by the Agent Reactor
    """
    res = self.ovc.getSites()
    if not res['OK']:
      return res
    sitedict = {}
    sites = res['Value']
    gLogger.info("Will update info for sites %s" % sites)
    for site in sites:
      attribdict = {"Site" : site, "ApplicationStatus": 'Getting overlay files'}
      res = self.jobmon.getCurrentJobCounters(attribdict)
      if not res['OK']:
        continue
      if res['Value'].has_key('Running'):
        sitedict[site] = res['Value']['Running']
      else:
        sitedict[site] = 0
    gLogger.info("Setting new values %s" % sitedict)    
    res = self.ovc.setJobsAtSites(sitedict)
    if not res['OK']:
      gLogger.error(res['Message'])
      return res
    
    return S_OK()
Example #21
0
    def _getJobPilotStatus(self, jobID):
        """Get the job pilot status"""
        result = JobMonitoringClient().getJobParameter(jobID,
                                                       "Pilot_Reference")
        if not result["OK"]:
            return result
        pilotReference = result["Value"].get("Pilot_Reference", "Unknown")
        if pilotReference == "Unknown":
            # There is no pilot reference, hence its status is unknown
            return S_OK("NoPilot")

        result = PilotManagerClient().getPilotInfo(pilotReference)
        if not result["OK"]:
            if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
                self.log.warn("No pilot found",
                              "for job %d: %s" % (jobID, result["Message"]))
                return S_OK("NoPilot")
            self.log.error("Failed to get pilot information",
                           "for job %d: %s" % (jobID, result["Message"]))
            return result
        pilotStatus = result["Value"][pilotReference]["Status"]

        return S_OK(pilotStatus)
Example #22
0
    def __getJobPilotStatus(self, jobID):
        """ Get the job pilot status
    """
        result = JobMonitoringClient().getJobParameter(jobID,
                                                       'Pilot_Reference')
        if not result['OK']:
            return result
        pilotReference = result['Value'].get('Pilot_Reference')
        if not pilotReference:
            # There is no pilot reference, hence its status is unknown
            return S_OK('NoPilot')

        result = PilotManagerClient().getPilotInfo(pilotReference)
        if not result['OK']:
            if "No pilots found" in result['Message']:
                self.log.warn(result['Message'])
                return S_OK('NoPilot')
            self.log.error('Failed to get pilot information',
                           'for job %d: ' % jobID + result['Message'])
            return S_ERROR('Failed to get the pilot status')
        pilotStatus = result['Value'][pilotReference]['Status']

        return S_OK(pilotStatus)
Example #23
0
class TransformationCleaningAgent(AgentModule):
    """
    .. class:: TransformationCleaningAgent

    :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
    :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
    :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance

    """
    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = ["TransformationDB", "MetadataCatalog"]
        # # transformation metadata
        self.transfidmeta = "TransformationID"
        # # archive periof in days
        self.archiveAfter = 7
        # # transformation log SEs
        self.logSE = "LogSE"
        # # enable/disable execution
        self.enableFlag = "True"

        self.dataProcTTypes = ["MCSimulation", "Merge"]
        self.dataManipTTypes = ["Replication", "Removal"]

    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy",
                                              self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes +
                                              self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta",
                                              self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive periof in days
        self.archiveAfter = self.am_getOption("ArchiveAfter",
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()
        # # job monitoring client
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """execution in one agent's cycle

        :param self: self reference
        """

        self.enableFlag = self.am_getOption("EnableFlag", self.enableFlag)
        if self.enableFlag != "True":
            self.log.info(
                "TransformationCleaningAgent is disabled by configuration option EnableFlag"
            )
            return S_OK("Disabled via CS flag")

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            "Status":
            "Cleaning",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Failed to get transformations", res["Message"])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({
            "Status":
            "RemovingFiles",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeRemoval(transDict)
                else:
                    self.log.info(
                        "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeRemoval)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                "Status": "Completed",
                "Type": self.transformationTypes
            },
            older=olderThanTime,
            timeStamp="LastUpdate")
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])
        return S_OK()

    def finalize(self):
        """Only at finalization: will clean ancient transformations (remnants)

            1) get the transformation IDs of jobs that are older than 1 year
            2) find the status of those transformations. Those "Cleaned" and "Archived" will be
               cleaned and archived (again)

        Why doing this here? Basically, it's a race:

        1) the production manager submits a transformation
        2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation in their internal queue,
           so eventually during their (long-ish) cycle they'll work on it.
        3) 1 minute after creating the transformation, the production manager cleans it (by hand, for whatever reason).
           So, the status is changed to "Cleaning"
        4) the TransformationCleaningAgent cleans what has been created (maybe, nothing),
           then sets the transformation status to "Cleaned" or "Archived"
        5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in,
           creating tasks and jobs for a production that's effectively cleaned (but these 2 agents don't know yet).

        Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent,
        but these 2 agents are already doing a lot of stuff, and are pretty heavy.
        So, we should just clean from time to time.
        What I added here is done only when the agent finalize, and it's quite light-ish operation anyway.
        """
        res = self.jobMonitoringClient.getJobGroups(
            None,
            datetime.utcnow() - timedelta(days=365))
        if not res["OK"]:
            self.log.error("Failed to get job groups", res["Message"])
            return res
        transformationIDs = res["Value"]
        if transformationIDs:
            res = self.transClient.getTransformations(
                {"TransformationID": transformationIDs})
            if not res["OK"]:
                self.log.error("Failed to get transformations", res["Message"])
                return res
            transformations = res["Value"]
            toClean = []
            toArchive = []
            for transDict in transformations:
                if transDict["Status"] == "Cleaned":
                    toClean.append(transDict)
                if transDict["Status"] == "Archived":
                    toArchive.append(transDict)

            for transDict in toClean:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            for transDict in toArchive:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            # Remove JobIDs that were unknown to the TransformationSystem
            jobGroupsToCheck = [
                str(transDict["TransformationID"]).zfill(8)
                for transDict in toClean + toArchive
            ]
            res = self.jobMonitoringClient.getJobs(
                {"JobGroup": jobGroupsToCheck})
            if not res["OK"]:
                return res
            jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
            res = self.__removeWMSTasks(jobIDsToRemove)
            if not res["OK"]:
                return res

        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation."""
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict["Type"] in self.dataManipTTypes:
            res = self.archiveTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems archiving transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))
        else:
            res = self.cleanTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems cleaning transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation."""
        res = self.removeTransformationOutput(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems removing transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeArchive(self, transDict):
        """Archive the given transformation."""
        res = self.archiveTransformation(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems archiving transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """get the directories for the supplied transformation from the transformation system.
            These directories are used by removeTransformationOutput and cleanTransformation for removing output.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.verbose(
            "Cleaning Transformation directories of transformation %d" %
            transID)
        directories = []
        if "TransformationDB" in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ["OutputDirectories"])
            if not res["OK"]:
                self.log.error("Failed to obtain transformation directories",
                               res["Message"])
                return res
            transDirectories = []
            if res["Value"]:
                if not isinstance(res["Value"], list):
                    try:
                        transDirectories = ast.literal_eval(res["Value"])
                    except Exception:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res["Value"])
                else:
                    transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if "MetadataCatalog" in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res["OK"]:
                self.log.error("Failed to obtain metadata catalog directories",
                               res["Message"])
                return res
            transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """append unique :newDirs: list to :existingDirs: list

        :param self: self reference
        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths
        """
        for folder in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, str(folder)):
                if folder not in existingDirs:
                    existingDirs.append(os.path.normpath(folder))
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanContent(self, directory):
        """wipe out everything from catalog under folder :directory:

        :param self: self reference
        :params str directory: folder name
        """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res["OK"]:
            return res
        filesFound = res["Value"]
        if not filesFound:
            self.log.info(
                "No files are registered in the catalog directory %s" %
                directory)
            return S_OK()
        self.log.info(
            "Attempting to remove possible remnants from the catalog and storage",
            "(n=%d)" % len(filesFound))

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(filesFound, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        realFailure = False
        for lfn, reason in res["Value"]["Failed"].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog",
                               "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """get catalog contents under paths :directories:

        :param self: self reference
        :param list directories: list of paths in catalog
        """
        self.log.info("Obtaining the catalog contents for %d directories:" %
                      len(directories))
        for directory in directories:
            self.log.info(directory)
        activeDirs = directories
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs[0]
            res = returnSingleResult(fc.listDirectory(currentDir))
            activeDirs.remove(currentDir)
            if not res["OK"] and "Directory does not exist" in res[
                    "Message"]:  # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" %
                              currentDir)
            elif not res["OK"]:
                if "No such file or directory" in res["Message"]:
                    self.log.info("%s: %s" % (currentDir, res["Message"]))
                else:
                    self.log.error(
                        "Failed to get directory %s content" % currentDir,
                        res["Message"])
            else:
                dirContents = res["Value"]
                activeDirs.extend(dirContents["SubDirs"])
                allFiles.update(dirContents["Files"])
        self.log.info("", "Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        """
        self.log.verbose("Removing log files found in the directory",
                         directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory,
                                                       recursive=True))
        if not res["OK"]:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res["Message"])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """This just removes any mention of the output data from the catalog and storage"""
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res))
            return S_OK()
        directories = res["Value"]
        for directory in directories:
            if not re.search("/LOG/", directory):
                res = self.cleanContent(directory)
                if not res["OK"]:
                    return res

        self.log.info("Removed %d directories from the catalog \
      and its files from the storage for transformation %s" %
                      (len(directories), transID))
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully removed output of transformation", transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, "Status", "RemovedFiles")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to RemovedFiles"
                % (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" %
                      (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """This just removes job from the jobDB and the transformation DB

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Archived")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Archived" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to Archived" %
                      (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
        """
        self.log.info("Cleaning transformation", transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search("/LOG/", directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res["OK"]:
                    return res
            res = self.cleanContent(directory)
            if not res["OK"]:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully cleaned transformation", transID)
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Cleaned")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Cleaned" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation",
                      "%s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """wipe out files from catalog"""
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res["OK"]:
            return res
        fileToRemove = res["Value"]
        if not fileToRemove:
            self.log.info("No files found for transID", transID)
            return S_OK()

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(fileToRemove, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        for lfn, reason in res["Value"]["Failed"].items():
            self.log.error("Failed to remove file found in metadata catalog",
                           "%s %s" % (lfn, reason))
        if res["Value"]["Failed"]:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the DFC")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation"""
        self.log.verbose("Cleaning Transformation tasks of transformation",
                         transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res["OK"]:
            return res
        externalIDs = res["Value"]
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ["Type"])
            if not res["OK"]:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res["Value"]
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res["OK"]:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transforamtion ID
        """
        res = self.transClient.getTransformationTasks(
            condDict={"TransformationID": transID})
        if not res["OK"]:
            self.log.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res["Message"])
            return res
        externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """This will remove requests from the RMS system -"""
        rIDs = [int(int(j)) for j in requestIDs if int(j)]
        for reqID in rIDs:
            self.reqClient.cancelRequest(reqID)

        return S_OK()

    def __removeWMSTasks(self, transJobIDs):
        """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system

        :param self: self reference
        :param list trasnJobIDs: job IDs
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):

            res = self.wmsClient.killJob(jobList)
            if res["OK"]:
                self.log.info("Successfully killed %d jobs from WMS" %
                              len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to kill jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to kill jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res["OK"]:
                self.log.info("Successfully deleted jobs from WMS",
                              "(n=%d)" % len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to delete jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to delete jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to delete all remnants from WMS")
        self.log.info("Successfully deleted all tasks from the WMS")

        if not jobIDs:
            self.log.info(
                "JobIDs not present, unable to delete associated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res["OK"]:
            self.log.error("Failed to get requestID for jobs.", res["Message"])
            return res
        failoverRequests.update(res["Value"]["Successful"])
        if not failoverRequests:
            return S_OK()
        for jobID, requestID in res["Value"]["Successful"].items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == "0":
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB",
                               res["Message"])
                failed += 1
            else:
                self.log.verbose("Removed request %s associated to job %d." %
                                 (requestID, jobID))

        if failed:
            self.log.info("Successfully removed requests",
                          "(n=%d)" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove requests", "(n=%d)" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
Example #24
0
    def __init__(self, *args, **kwargs):
        AgentModule.__init__(self, *args, **kwargs)
        self.name = 'DataRecoveryAgent'
        self.enabled = False
        self.getJobInfoFromJDLOnly = False

        self.__getCSOptions()

        self.jobStatus = [
            'Failed', 'Done'
        ]  # This needs to be both otherwise we cannot account for all cases

        self.jobMon = JobMonitoringClient()
        self.fcClient = FileCatalogClient()
        self.tClient = TransformationClient()
        self.reqClient = ReqClient()
        self.diracAPI = Dirac()
        self.inputFilesProcessed = set()
        self.todo = {'NoInputFiles':
                     [dict(Message="NoInputFiles: OutputExists: Job 'Done'",
                           ShortMessage="NoInputFiles: job 'Done' ",
                           Counter=0,
                           Check=lambda job: job.allFilesExist() and job.status == 'Failed',
                           Actions=lambda job, tInfo: [job.setJobDone(tInfo)],
                           ),
                      dict(Message="NoInputFiles: OutputMissing: Job 'Failed'",
                           ShortMessage="NoInputFiles: job 'Failed' ",
                           Counter=0,
                           Check=lambda job: job.allFilesMissing() and job.status == 'Done',
                           Actions=lambda job, tInfo: [job.setJobFailed(tInfo)],
                           ),
                      ],
                     'InputFiles':
                     [ \
                     # must always be first!

                         dict(Message="One of many Successful: clean others",
                              ShortMessage="Other Tasks --> Keep",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and job.otherTasks and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed),
                              Actions=lambda job, tInfo: [self.inputFilesProcessed.update(job.inputFiles),
                                                          job.setJobDone(tInfo),
                                                          job.setInputProcessed(tInfo)]
                              ),
                         dict(Message="Other Task processed Input, no Output: Fail",
                              ShortMessage="Other Tasks --> Fail",
                              Counter=0,
                              Check=lambda job: set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allFilesMissing() and job.status != 'Failed',
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Other Task processed Input: Fail and clean",
                              ShortMessage="Other Tasks --> Cleanup",
                              Counter=0,
                              Check=lambda job: set(job.inputFiles).issubset(
                                  self.inputFilesProcessed) and not job.allFilesMissing(),
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo), job.cleanOutputs(tInfo)]
                              ),
                         dict(Message="InputFile(s) missing: mark job 'Failed', mark input 'Deleted', clean",
                              ShortMessage="Input Missing --> Job 'Failed, Input 'Deleted', Cleanup",
                              Counter=0,
                              Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \
                              not job.allTransFilesDeleted(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo),
                                                          job.setInputDeleted(tInfo)],
                              ),
                         dict(Message="InputFile(s) Deleted, output Exists: mark job 'Failed', clean",
                              ShortMessage="Input Deleted --> Job 'Failed, Cleanup",
                              Counter=0,
                              Check=lambda job: job.inputFiles and job.allInputFilesMissing() and \
                              job.allTransFilesDeleted() and not job.allFilesMissing(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setJobFailed(tInfo)],
                              ),
                         # All Output Exists
                         dict(Message="Output Exists, job Failed, input not Processed --> Job Done, Input Processed",
                              ShortMessage="Output Exists --> Job Done, Input Processed",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              not job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setJobDone(tInfo), job.setInputProcessed(tInfo)]
                              ),
                         dict(Message="Output Exists, job Failed, input Processed --> Job Done",
                              ShortMessage="Output Exists --> Job Done",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setJobDone(tInfo)]
                              ),
                         dict(Message="Output Exists, job Done, input not Processed --> Input Processed",
                              ShortMessage="Output Exists --> Input Processed",
                              Counter=0,
                              Check=lambda job: job.allFilesExist() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              not job.allFilesProcessed() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputProcessed(tInfo)]
                              ),
                         # outputmissing
                         dict(Message="Output Missing, job Failed, input Assigned, MaxError --> Input MaxReset",
                              ShortMessage="Max ErrorCount --> Input MaxReset",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist() and \
                              job.checkErrorCount(),
                              Actions=lambda job, tInfo: [job.setInputMaxReset(tInfo)]
                              ),
                         dict(Message="Output Missing, job Failed, input Assigned --> Input Unused",
                              ShortMessage="Output Missing --> Input Unused",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputUnused(tInfo)]
                              ),
                         dict(Message="Output Missing, job Done, input Assigned --> Job Failed, Input Unused",
                              ShortMessage="Output Missing --> Job Failed, Input Unused",
                              Counter=0,
                              Check=lambda job: job.allFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              job.allFilesAssigned() and \
                              not set(job.inputFiles).issubset(self.inputFilesProcessed) and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.setInputUnused(tInfo), job.setJobFailed(tInfo)]
                              ),
                         # some files missing, needing cleanup. Only checking for
                         # assigned, because processed could mean an earlier job was
                         # succesful and this one is just the duplicate that needed
                         # to be removed! But we check for other tasks earlier, so
                         # this should not happen
                         dict(Message="Some missing, job Failed, input Assigned --> cleanup, Input 'Unused'",
                              ShortMessage="Output Missing --> Cleanup, Input Unused",
                              Counter=0,
                              Check=lambda job: job.someFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Failed' and \
                              job.allFilesAssigned() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [job.cleanOutputs(tInfo), job.setInputUnused(tInfo)]
                              ),
                         dict(Message="Some missing, job Done, input Assigned --> cleanup, job Failed, Input 'Unused'",
                              ShortMessage="Output Missing --> Cleanup, Job Failed, Input Unused",
                              Counter=0,
                              Check=lambda job: job.someFilesMissing() and \
                              not job.otherTasks and \
                              job.status == 'Done' and \
                              job.allFilesAssigned() and \
                              job.allInputFilesExist(),
                              Actions=lambda job, tInfo: [
                                  job.cleanOutputs(tInfo), job.setInputUnused(tInfo), job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Some missing, job Done --> job Failed",
                              ShortMessage="Output Missing, Done --> Job Failed",
                              Counter=0,
                              Check=lambda job: not job.allFilesExist() and job.status == 'Done',
                              Actions=lambda job, tInfo: [job.setJobFailed(tInfo)]
                              ),
                         dict(Message="Something Strange",
                              ShortMessage="Strange",
                              Counter=0,
                              Check=lambda job: job.status not in ("Failed", "Done"),
                              Actions=lambda job, tInfo: []
                              ),
                         # should always be the last one!
                         dict(Message="Failed Hard",
                              ShortMessage="Failed Hard",
                              Counter=0,
                              Check=lambda job: False,  # never
                              Actions=lambda job, tInfo: []
                              ),
                     ]
                     }
        self.jobCache = defaultdict(lambda: (0, 0))
        # Notification options
        self.notesToSend = ""
        self.subject = "DataRecoveryAgent"
        self.startTime = time.time()
Example #25
0
    def getProductionApplicationSummary(self,
                                        productionID,
                                        status=None,
                                        minorStatus=None,
                                        printOutput=False):
        """Returns an application status summary for the productions in the system. If printOutput is
       specified, the result is printed to the screen.  This queries the WMS
       for the given productionID and provides an up-to-date snapshot of the application status
       combinations and associated WMS JobIDs.
    """
        if not isinstance(productionID, (int, long, str)):
            return self._errorReport(
                'Expected string, long or int for production ID')

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        jobIDs = list(statusDict['Value'])
        if not jobIDs:
            return S_ERROR('No JobIDs with matching conditions found')

        self.log.verbose('Considering %s jobs with selected conditions' %
                         (len(jobIDs)))
        # now need to get the application status information
        result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
        if not result['OK']:
            self.log.warn('Could not get application status for jobs list')
            return result

        appStatus = result['Value']
        #    self._prettyPrint(appStatus)
        #    self._prettyPrint(statusDict['Value'])
        # Now format the result.
        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            for key, val in atts.iteritems():
                if key == 'Status':
                    uniqueStatus = val.capitalize()
                    if uniqueStatus not in summary:
                        summary[uniqueStatus] = {}
                    if atts['MinorStatus'] not in summary[uniqueStatus]:
                        summary[uniqueStatus][atts['MinorStatus']] = {}
                    if appStatus[job]['ApplicationStatus'] not in summary[
                            uniqueStatus][atts['MinorStatus']]:
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']] = {}
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']]['Total'] = 1
                        submittedJobs += 1
                        if uniqueStatus == 'Done':
                            doneJobs += 1
                        summary[uniqueStatus][atts['MinorStatus']][
                            appStatus[job]['ApplicationStatus']]['JobList'] = [
                                job
                            ]
                    else:
                        if appStatus[job]['ApplicationStatus'] not in summary[
                                uniqueStatus][atts['MinorStatus']]:
                            summary[uniqueStatus][atts['MinorStatus']] = {}
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]['ApplicationStatus']] = {}
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['Total'] = 1
                            submittedJobs += 1
                            if uniqueStatus == 'Done':
                                doneJobs += 1
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['JobList'] = [job]
                        else:
                            current = summary[uniqueStatus][
                                atts['MinorStatus']][appStatus[job][
                                    'ApplicationStatus']]['Total']
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['Total'] = current + 1
                            submittedJobs += 1
                            if uniqueStatus == 'Done':
                                doneJobs += 1
                            jobList = summary[uniqueStatus][
                                atts['MinorStatus']][appStatus[job][
                                    'ApplicationStatus']]['JobList']
                            jobList.append(job)
                            summary[uniqueStatus][atts['MinorStatus']][
                                appStatus[job]
                                ['ApplicationStatus']]['JobList'] = jobList

        if not printOutput:
            result = S_OK()
            if not status and not minorStatus:
                result['Totals'] = {
                    'Submitted': int(submittedJobs),
                    'Done': int(doneJobs)
                }
            result['Value'] = summary
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering status %s' % (
            productionID, status)
        if minorStatus:
            message += 'and MinorStatus = %s' % (minorStatus)

        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + 'ApplicationStatus'.ljust(mStatAdj) + \
            'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.iteritems():
            message += '\n'
            for minor, appInfo in metadata.iteritems():
                message += '\n'
                for appStat, jobInfo in appInfo.iteritems():
                    message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + appStat.ljust(mStatAdj) + \
                        str(jobInfo['Total']).ljust(totalAdj) + str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'

        # self._prettyPrint(summary)
        if status or minorStatus:
            return S_OK(summary)

        result = self.getProductionProgress(productionID)
        if not result['OK']:
            self.log.warn('Could not get production progress information')
            return result

        if 'Created' in result['Value']:
            createdJobs = int(result['Value']['Created']) + submittedJobs
        else:
            createdJobs = submittedJobs

        percSub = int(100 * submittedJobs / createdJobs)
        percDone = int(100 * doneJobs / createdJobs)
        print '\nCurrent status of production %s:\n' % productionID
        print 'Submitted'.ljust(12) + str(percSub).ljust(3) + '%  ( ' + str(submittedJobs).ljust(7) + \
            'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )'
        print 'Done'.ljust(12) + str(percDone).ljust(3) + '%  ( ' + str(doneJobs).ljust(7) + \
            'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )'
        result = S_OK()
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        result['Value'] = summary
        # self.pPrint(result)
        return result
Example #26
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        jobIDs = []
        dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for dest in dests:
            for lfns in lfnss:
                for jobType in types:
                    job = helloWorldJob()
                    job.setDestination(dest)
                    job.setInputData(lfns)
                    job.setType(jobType)
                    jobDescription = createFile(job)
                    res = wmsClient.submitJob(
                        job._toJDL(xmlFile=jobDescription))
                    self.assert_(res['OK'])
                    jobID = res['Value']
                    jobIDs.append(jobID)

        res = jobMonitor.getSites()
        self.assert_(res['OK'])
        self.assert_(
            set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
        res = jobMonitor.getJobTypes()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(types))
        res = jobMonitor.getApplicationStates()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

        res = jobMonitor.getOwners()
        self.assert_(res['OK'])
        res = jobMonitor.getOwnerGroup()
        self.assert_(res['OK'])
        res = jobMonitor.getProductionIds()
        self.assert_(res['OK'])
        res = jobMonitor.getJobGroups()
        self.assert_(res['OK'])
        res = jobMonitor.getStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(['Job accepted', 'matching'])])
        self.assert_(res['OK'])
        res = jobMonitor.getJobs()
        self.assert_(res['OK'])
        self.assert_(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assert_( res['OK'] )
        res = jobMonitor.getCurrentJobCounters()
        self.assert_(res['OK'])
        try:
            self.assert_(
                res['Value'].get('Received') + res['Value'].get('Waiting') >=
                long(len(dests) * len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assert_(res['OK'])
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assert_(res['OK'])

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assert_(res['OK'])

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
Example #27
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(isinstance(res['Value'], int),
                        msg="Got %s" % type(res['Value']))
        self.assertEqual(res['Value'],
                         res['JobID'],
                         msg="Got %s, expected %s" %
                         (str(res['Value']), res['JobID']))
        jobID = res['JobID']
        jobID = res['Value']

        # updating the status
        res = jobStateUpdate.setJobStatus(jobID, 'Running',
                                          'Executing Minchiapp', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Received',
                         msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsMinorStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'MinorStatus': 'Job Rescheduled',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsApplicationStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'ApplicationStatus': 'Unknown',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))

        # updating the status again
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Killed',
                         msg="Got %s" % str(res['Value']))

        # updating the status aaaagain
        res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'], 'Done', msg="Got %s" %
            str(res['Value']))  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Deleted',
                         msg="Got %s" % str(res['Value']))
Example #28
0
class WorkflowTasks(TaskBase):
  """ Handles jobs
  """

  def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
               outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
               ownerDN=None, ownerGroup=None):
    """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

    if not logger:
      logger = gLogger.getSubLogger('WorkflowTasks')

    super(WorkflowTasks, self).__init__(transClient, logger)

    useCertificates = True if (bool(ownerDN) and bool(ownerGroup)) else False
    if not submissionClient:
      self.submissionClient = WMSClient(useCertificates=useCertificates,
                                        delegatedDN=ownerDN,
                                        delegatedGroup=ownerGroup)
    else:
      self.submissionClient = submissionClient

    if not jobMonitoringClient:
      self.jobMonitoringClient = JobMonitoringClient()
    else:
      self.jobMonitoringClient = jobMonitoringClient

    if not jobClass:
      self.jobClass = Job
    else:
      self.jobClass = jobClass

    if not opsH:
      self.opsH = Operations()
    else:
      self.opsH = opsH

    if not outputDataModule:
      self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
    else:
      self.outputDataModule = outputDataModule

    if not destinationPlugin:
      self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')
    else:
      self.destinationPlugin = destinationPlugin

    self.destinationPlugin_o = None

    self.outputDataModule_o = None

  def prepareTransformationTasks(self, transBody, taskDict, owner='', ownerGroup='',
                                 ownerDN='', bulkSubmissionFlag=False):
    """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.

    :param transBody: transformation job template
    :param taskDict: dictionary of per task parameters
    :param owner: owner of the transformation
    :param ownerGroup: group of the owner of the transformation
    :param ownerDN: DN of the owner of the transformation
    :return:  S_OK/S_ERROR with updated taskDict
    """

    if (not owner) or (not ownerGroup):
      res = getProxyInfo(False, False)
      if not res['OK']:
        return res
      proxyInfo = res['Value']
      owner = proxyInfo['username']
      ownerGroup = proxyInfo['group']

    if not ownerDN:
      res = getDNForUsername(owner)
      if not res['OK']:
        return res
      ownerDN = res['Value'][0]

    if bulkSubmissionFlag:
      return self.__prepareTasksBulk(transBody, taskDict, owner, ownerGroup, ownerDN)
    return self.__prepareTasks(transBody, taskDict, owner, ownerGroup, ownerDN)

  def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a single job object for bulk submission
    """
    if taskDict:
      transID = taskDict.values()[0]['TransformationID']
    else:
      return S_OK({})

    # Prepare the bulk Job object with common parameters
    oJob = self.jobClass(transBody)
    method = 'prepareTransformationTasksBulk'

    self._logVerbose('Setting job owner:group to %s:%s' % (owner, ownerGroup),
                     transID=transID, method=method)
    oJob.setOwner(owner)
    oJob.setOwnerGroup(ownerGroup)
    oJob.setOwnerDN(ownerDN)

    jobType = oJob.workflow.findParameter('JobType').getValue()
    transGroup = str(transID).zfill(8)

    # Verify that the JOB_ID parameter is added to the workflow
    if not oJob.workflow.findParameter('JOB_ID'):
      oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000', "Initial JOB_ID")

    if oJob.workflow.findParameter('PRODUCTION_ID'):
      oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
    else:
      oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                         'PRODUCTION_ID',
                         'string',
                         str(transID).zfill(8),
                         "Production ID")
    oJob.setType(jobType)
    self._logVerbose('Adding default transformation group of %s' % (transGroup),
                     transID=transID, method=method)
    oJob.setJobGroup(transGroup)

    if int(transID) in [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]:
      self._handleHospital(oJob)

    # Collect per job parameters sequences
    paramSeqDict = {}
    # tasks must be sorted because we use bulk submission and we must find the correspondance
    for taskID in sorted(taskDict):
      paramsDict = taskDict[taskID]
      seqDict = {}

      # Handle destination site
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites', transID=transID)
        return S_ERROR(ETSUKN, "Can not evaluate destination site")
      else:
        self._logVerbose('Setting Site: ', str(sites), transID=transID)
        seqDict['Site'] = sites

      seqDict['JobName'] = self._transTaskName(transID, taskID)
      seqDict['JOB_ID'] = str(taskID).zfill(8)

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # Handle Input Data
      inputData = paramsDict.get('InputData')
      if inputData:
        self._logVerbose('Setting input data to %s' % inputData,
                         transID=transID, method=method)
        seqDict['InputData'] = inputData
      elif paramSeqDict.get('InputData') is not None:
        self._logError("Invalid mixture of jobs with and without input data")
        return S_ERROR(ETSDATA, "Invalid mixture of jobs with and without input data")

      for paramName, paramValue in paramsDict.iteritems():
        if paramName not in ('InputData', 'Site', 'TargetSE'):
          if paramValue:
            self._logVerbose('Setting %s to %s' % (paramName, paramValue),
                             transID=transID, method=method)
            seqDict[paramName] = paramValue

      outputParameterList = []
      if self.outputDataModule:
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,  # pylint: disable=protected-access
                                  'TaskID': taskID, 'InputData': inputData})
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          seqDict[name] = output
          outputParameterList.append(name)
          if oJob.workflow.findParameter(name):
            oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
          else:
            oJob._addParameter(oJob.workflow,  # pylint: disable=protected-access
                               name,
                               'JDL',
                               "%%(%s)s" % name,
                               name)

      for pName, seq in seqDict.iteritems():
        paramSeqDict.setdefault(pName, []).append(seq)

    for paramName, paramSeq in paramSeqDict.iteritems():
      if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'] + outputParameterList:
        oJob.setParameterSequence(paramName, paramSeq, addToWorkflow=paramName)
      else:
        oJob.setParameterSequence(paramName, paramSeq)

    taskDict['BulkJobObject'] = oJob
    return S_OK(taskDict)

  def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
    """ Prepare transformation tasks with a job object per task
    """
    method = '__prepareTasks'
    startTime = time.time()
    oJobTemplate = self.jobClass(transBody)
    oJobTemplate.setOwner(owner)
    oJobTemplate.setOwnerGroup(ownerGroup)
    oJobTemplate.setOwnerDN(ownerDN)
    site = oJobTemplate.workflow.findParameter('Site').getValue()
    jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
    templateOK = False
    getOutputDataTiming = 0.
    for taskID, paramsDict in taskDict.iteritems():
      # Create a job for each task and add it to the taskDict
      if not templateOK:
        templateOK = True
        # Update the template with common information
        transID = paramsDict['TransformationID']
        self._logVerbose('Job owner:group to %s:%s' % (owner, ownerGroup),
                         transID=transID, method=method)
        transGroup = str(transID).zfill(8)
        self._logVerbose('Adding default transformation group of %s' % (transGroup),
                         transID=transID, method=method)
        oJobTemplate.setJobGroup(transGroup)
        if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
          oJobTemplate._setParamValue('PRODUCTION_ID', str(transID).zfill(8))
        else:
          oJobTemplate._addParameter(oJobTemplate.workflow,
                                     'PRODUCTION_ID',
                                     'string',
                                     str(transID).zfill(8),
                                     "Production ID")
        if not oJobTemplate.workflow.findParameter('JOB_ID'):
          oJobTemplate._addParameter(oJobTemplate.workflow,
                                     'JOB_ID',
                                     'string',
                                     '00000000',
                                     "Initial JOB_ID")

      paramsDict['Site'] = site
      paramsDict['JobType'] = jobType
      # Now create the job from the template
      oJob = copy.deepcopy(oJobTemplate)
      constructedName = self._transTaskName(transID, taskID)
      self._logVerbose('Setting task name to %s' % constructedName,
                       transID=transID, method=method)
      oJob.setName(constructedName)
      oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
      inputData = None

      self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' % (transID, taskID, str(paramsDict)),
                     transID=transID, method=method)

      # These helper functions do the real job
      sites = self._handleDestination(paramsDict)
      if not sites:
        self._logError('Could not get a list a sites',
                       transID=transID, method=method)
        paramsDict['TaskObject'] = ''
        continue
      else:
        self._logDebug('Setting Site: ', str(sites),
                       transID=transID, method=method)
        res = oJob.setDestination(sites)
        if not res['OK']:
          self._logError('Could not set the site: %s' % res['Message'],
                         transID=transID, method=method)
          continue

      self._handleInputs(oJob, paramsDict)
      self._handleRest(oJob, paramsDict)

      hospitalTrans = [int(x) for x in self.opsH.getValue("Hospital/Transformations", [])]
      if int(transID) in hospitalTrans:
        self._handleHospital(oJob)

      paramsDict['TaskObject'] = ''
      if self.outputDataModule:
        getOutputDataTiming -= time.time()
        res = self.getOutputData({'Job': oJob._toXML(), 'TransformationID': transID,
                                  'TaskID': taskID, 'InputData': inputData})
        getOutputDataTiming += time.time()
        if not res['OK']:
          self._logError("Failed to generate output data", res['Message'],
                         transID=transID, method=method)
          continue
        for name, output in res['Value'].iteritems():
          oJob._addJDLParameter(name, ';'.join(output))
      paramsDict['TaskObject'] = oJob
    if taskDict:
      self._logVerbose('Average getOutputData time: %.1f per task' % (getOutputDataTiming / len(taskDict)),
                       transID=transID, method=method)
      self._logInfo('Prepared %d tasks' % len(taskDict),
                    transID=transID, method=method, reftime=startTime)
    return S_OK(taskDict)

  #############################################################################

  def _handleDestination(self, paramsDict):
    """ Handle Sites and TargetSE in the parameters
    """

    try:
      sites = ['ANY']
      if paramsDict['Site']:
        # 'Site' comes from the XML and therefore is ; separated
        sites = fromChar(paramsDict['Site'], sepChar=';')
    except KeyError:
      pass

    if self.destinationPlugin_o:
      destinationPlugin_o = self.destinationPlugin_o
    else:
      res = self.__generatePluginObject(self.destinationPlugin)
      if not res['OK']:
        self._logFatal("Could not generate a destination plugin object")
        return res
      destinationPlugin_o = res['Value']
      self.destinationPlugin_o = destinationPlugin_o

    destinationPlugin_o.setParameters(paramsDict)
    destSites = destinationPlugin_o.run()
    if not destSites:
      return sites

    # Now we need to make the AND with the sites, if defined
    if sites != ['ANY']:
      # Need to get the AND
      destSites &= set(sites)

    return list(destSites)

  def _handleInputs(self, oJob, paramsDict):
    """ set job inputs (+ metadata)
    """
    inputData = paramsDict.get('InputData')
    transID = paramsDict['TransformationID']
    if inputData:
      self._logVerbose('Setting input data to %s' % inputData,
                       transID=transID, method='handleInputs')
      oJob.setInputData(inputData)

  def _handleRest(self, oJob, paramsDict):
    """ add as JDL parameters all the other parameters that are not for inputs or destination
    """
    transID = paramsDict['TransformationID']
    for paramName, paramValue in paramsDict.iteritems():
      if paramName not in ('InputData', 'Site', 'TargetSE'):
        if paramValue:
          self._logDebug('Setting %s to %s' % (paramName, paramValue),
                         transID=transID, method='handleRest')
          oJob._addJDLParameter(paramName, paramValue)

  def _handleHospital(self, oJob):
    """ Optional handle of hospital jobs
    """
    oJob.setType('Hospital')
    oJob.setInputDataPolicy('download', dataScheduling=False)
    hospitalSite = self.opsH.getValue("Hospital/HospitalSite", 'DIRAC.JobDebugger.ch')
    oJob.setDestination(hospitalSite)
    hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
    if hospitalCEs:
      oJob._addJDLParameter('GridCE', hospitalCEs)

  def __generatePluginObject(self, plugin):
    """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name
    """
    try:
      plugModule = __import__(self.pluginLocation, globals(), locals(), ['TaskManagerPlugin'])
    except ImportError as e:
      self._logException("Failed to import 'TaskManagerPlugin' %s: %s" % (plugin, e))
      return S_ERROR()
    try:
      plugin_o = getattr(plugModule, 'TaskManagerPlugin')('%s' % plugin, operationsHelper=self.opsH)
      return S_OK(plugin_o)
    except AttributeError as e:
      self._logException("Failed to create %s(): %s." % (plugin, e))
      return S_ERROR()

  #############################################################################

  def getOutputData(self, paramDict):
    """ Get the list of job output LFNs from the provided plugin
    """
    if not self.outputDataModule_o:
      # Create the module object
      moduleFactory = ModuleFactory()

      moduleInstance = moduleFactory.getModule(self.outputDataModule, None)
      if not moduleInstance['OK']:
        return moduleInstance
      self.outputDataModule_o = moduleInstance['Value']
    # This is the "argument" to the module, set it and then execute
    self.outputDataModule_o.paramDict = paramDict
    return self.outputDataModule_o.execute()

  def submitTransformationTasks(self, taskDict):
    """ Submit the tasks
    """
    if 'BulkJobObject' in taskDict:
      return self.__submitTransformationTasksBulk(taskDict)
    return self.__submitTransformationTasks(taskDict)

  def __submitTransformationTasksBulk(self, taskDict):
    """ Submit jobs in one go with one parametric job
    """
    if not taskDict:
      return S_OK(taskDict)
    startTime = time.time()

    oJob = taskDict.pop('BulkJobObject')
    # we can only do this, once the job has been popped, or we _might_ crash
    transID = taskDict.values()[0]['TransformationID']
    if oJob is None:
      self._logError('no bulk Job object found', transID=transID, method='submitTransformationTasksBulk')
      return S_ERROR(ETSUKN, 'No bulk job object provided for submission')

    result = self.submitTaskToExternal(oJob)
    if not result['OK']:
      return result

    jobIDList = result['Value']
    if len(jobIDList) != len(taskDict):
      for task in taskDict.values():
        task['Success'] = False
      return S_ERROR(ETSUKN, 'Submitted less number of jobs than requested tasks')
    # Get back correspondance with tasks sorted by ID
    for jobID, taskID in zip(jobIDList, sorted(taskDict)):
      taskDict[taskID]['ExternalID'] = jobID
      taskDict[taskID]['Success'] = True

    submitted = len(jobIDList)
    self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                  transID=transID, method='submitTransformationTasksBulk')
    return S_OK(taskDict)

  def __submitTransformationTasks(self, taskDict):
    """ Submit jobs one by one
    """
    method = 'submitTransformationTasks'
    submitted = 0
    failed = 0
    startTime = time.time()
    for task in taskDict.itervalues():
      transID = task['TransformationID']
      if not task['TaskObject']:
        task['Success'] = False
        failed += 1
        continue
      res = self.submitTaskToExternal(task['TaskObject'])
      if res['OK']:
        task['ExternalID'] = res['Value']
        task['Success'] = True
        submitted += 1
      else:
        self._logError("Failed to submit task to WMS", res['Message'],
                       transID=transID, method=method)
        task['Success'] = False
        failed += 1
    if submitted:
      self._logInfo('Submitted %d tasks to WMS in %.1f seconds' % (submitted, time.time() - startTime),
                    transID=transID, method=method)
    if failed:
      self._logError('Failed to submit %d tasks to WMS.' % (failed),
                     transID=transID, method=method)
    return S_OK(taskDict)

  def submitTaskToExternal(self, job):
    """ Submits a single job to the WMS.
    """
    if isinstance(job, basestring):
      try:
        oJob = self.jobClass(job)
      except Exception as x:  # pylint: disable=broad-except
        self._logException("Failed to create job object", '', x)
        return S_ERROR("Failed to create job object")
    elif isinstance(job, self.jobClass):
      oJob = job
    else:
      self._logError("No valid job description found")
      return S_ERROR("No valid job description found")

    workflowFileObject = StringIO.StringIO(oJob._toXML())
    jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
    return self.submissionClient.submitJob(jdl, workflowFileObject)

  def updateTransformationReservedTasks(self, taskDicts):
    transID = None
    jobNames = [self._transTaskName(taskDict['TransformationID'], taskDict['TaskID']) for taskDict in taskDicts]
    res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
    if not res['OK']:
      self._logError("Failed to get task from WMS", res['Message'],
                     transID=transID, method='updateTransformationReservedTasks')
      return res
    jobNameIDs = {}
    for wmsID in res['Value']:
      res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
      if not res['OK']:
        self._logWarn("Failed to get task summary from WMS", res['Message'],
                      transID=transID, method='updateTransformationReservedTasks')
      else:
        jobNameIDs[res['Value']['JobName']] = int(wmsID)

    noTask = list(set(jobNames) - set(jobNameIDs))
    return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

  def getSubmittedTaskStatus(self, taskDicts):
    """
    Check the status of a list of tasks and return lists of taskIDs for each new status
    """
    if taskDicts:
      wmsIDs = [int(taskDict['ExternalID']) for taskDict in taskDicts if int(taskDict['ExternalID'])]
      transID = taskDicts[0]['TransformationID']
    else:
      return S_OK({})
    res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID)
      return res
    statusDict = res['Value']
    updateDict = {}
    for taskDict in taskDicts:
      taskID = taskDict['TaskID']
      wmsID = int(taskDict['ExternalID'])
      if not wmsID:
        continue
      oldStatus = taskDict['ExternalStatus']
      newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
      if oldStatus != newStatus:
        if newStatus == "Removed":
          self._logVerbose('Production/Job %d/%d removed from WMS while it is in %s status' %
                           (transID, taskID, oldStatus), transID=transID)
          newStatus = "Failed"
        self._logVerbose('Setting job status for Production/Job %d/%d to %s' % (transID, taskID, newStatus),
                         transID=transID)
        updateDict.setdefault(newStatus, []).append(taskID)
    return S_OK(updateDict)

  def getSubmittedFileStatus(self, fileDicts):
    """
    Check the status of a list of files and return the new status of each LFN
    """
    if not fileDicts:
      return S_OK({})
    # All files are from the same transformation
    transID = fileDicts[0]['TransformationID']
    taskFiles = {}
    for fileDict in fileDicts:
      jobName = self._transTaskName(transID, fileDict['TaskID'])
      taskFiles.setdefault(jobName, {})[fileDict['LFN']] = fileDict['Status']

    res = self.updateTransformationReservedTasks(fileDicts)
    if not res['OK']:
      self._logWarn("Failed to obtain taskIDs for files",
                    transID=transID)
      return res
    noTasks = res['Value']['NoTasks']
    taskNameIDs = res['Value']['TaskNameIDs']

    updateDict = {}
    for jobName in noTasks:
      for lfn, oldStatus in taskFiles[jobName].iteritems():
        if oldStatus != 'Unused':
          updateDict[lfn] = 'Unused'

    res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
    if not res['OK']:
      self._logWarn("Failed to get job status from the WMS system",
                    transID=transID)
      return res
    statusDict = res['Value']
    for jobName, wmsID in taskNameIDs.iteritems():
      jobStatus = statusDict.get(wmsID, {}).get('Status')
      newFileStatus = {'Done': 'Processed',
                       'Completed': 'Processed',
                       'Failed': 'Unused'}.get(jobStatus)
      if newFileStatus:
        for lfn, oldStatus in taskFiles[jobName].iteritems():
          if newFileStatus != oldStatus:
            updateDict[lfn] = newFileStatus
    return S_OK(updateDict)
    jobs = []
    for arg in Script.getPositionalArgs():
        try:
            jobs += [int(job) for job in arg.split(',')]
        except ValueError:
            gLogger.fatal("Invalid list of jobIDs")
            DIRAC.exit(2)

    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient
    from DIRAC.Core.Utilities.SiteSEMapping import getSEsForSite
    dm = DataManager()
    bk = BookkeepingClient()

    monitoring = JobMonitoringClient()

    if not jobs:
        conditions = {
            'Status': 'Failed',
            'MinorStatus': 'Maximum of reschedulings reached',
            'ApplicationStatus': 'Failed Input Data Resolution '
        }
        prStr = 'all jobs'
        if production:
            prStr = 'production %s' % ' '.join(production)
            if len(production) == 1:
                production = production[0]
            conditions['JobGroup'] = production
        if userName:
            prStr = 'user %s' % userName
Example #30
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination('DIRAC.Jenkins.ch')
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'},
                        msg="Got %s" % res['Value'])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(types),
                         msg="Got %s" % str(sorted(res['Value'])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(['Unknown']),
                         msg="Got %s" % sorted(str(res['Value'])))

        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_empty = res['Value']
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanNow = res['Value']
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanOneYear = res['Value']
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(
                                         ['Job accepted', 'Job Rescheduled'])])
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'], res.get('Message'))
        try:
            self.assertTrue(
                res['Value'].get('Received') +
                res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'], res.get('Message'))

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'], res.get('Message'))

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
Example #31
0
class MonitorAgents(AgentModule):
  """MonitorAgents class."""

  def __init__(self, *args, **kwargs):
    """Initialize the agent, clients, default values."""
    AgentModule.__init__(self, *args, **kwargs)
    self.name = 'MonitorAgents'
    self.setup = "Production"
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False
    self.diracLocation = "/opt/dirac/pro"

    self.sysAdminClient = SystemAdministratorClient(socket.gethostname())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None
    self.agents = dict()
    self.executors = dict()
    self.services = dict()
    self.errors = list()
    self.accounting = defaultdict(dict)

    self.addressTo = ["*****@*****.**"]
    self.addressFrom = "*****@*****.**"
    self.emailSubject = "MonitorAgents on %s" % socket.gethostname()

  def logError(self, errStr, varMsg=''):
    """Append errors to a list, which is sent in email notification."""
    self.log.error(errStr, varMsg)
    self.errors.append(errStr + " " + varMsg)

  def beginExecution(self):
    """Reload the configurations before every cycle."""
    self.setup = self.am_getOption("Setup", self.setup)
    self.enabled = self.am_getOption("EnableFlag", self.enabled)
    self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
    self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
    self.restartServices = self.am_getOption("RestartServices", self.restartServices)
    self.diracLocation = os.environ.get("DIRAC", self.diracLocation)
    self.addressTo = self.am_getOption('MailTo', self.addressTo)
    self.addressFrom = self.am_getOption('MailFrom', self.addressFrom)
    self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents)
    self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs)

    self.csAPI = CSAPI()

    res = self.getRunningInstances(instanceType='Agents')
    if not res["OK"]:
      return S_ERROR("Failure to get running agents")
    self.agents = res["Value"]

    res = self.getRunningInstances(instanceType='Executors')
    if not res["OK"]:
      return S_ERROR("Failure to get running executors")
    self.executors = res["Value"]

    res = self.getRunningInstances(instanceType='Services')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]

    self.accounting.clear()
    return S_OK()

  def sendNotification(self):
    """Send email notification about changes done in the last cycle."""
    if not(self.errors or self.accounting):
      return S_OK()

    emailBody = ""
    rows = []
    for instanceName, val in self.accounting.iteritems():
      rows.append([[instanceName],
                   [val.get('Treatment', 'No Treatment')],
                   [str(val.get('LogAge', 'Not Relevant'))]])

    if rows:
      columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
      emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ')

    if self.errors:
      emailBody += "\n\nErrors:"
      emailBody += "\n".join(self.errors)

    self.log.notice("Sending Email:\n" + emailBody)
    for address in self.addressTo:
      res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
      if not res['OK']:
        self.log.error("Failure to send Email notification to ", address)
        continue

    self.errors = []
    self.accounting.clear()

    return S_OK()

  def getRunningInstances(self, instanceType='Agents', runitStatus='Run'):
    """Return a dict of running agents, executors or services.

    Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation

    :param str instanceType: 'Agents', 'Executors', 'Services'
    :param str runitStatus: Return only those instances with given RunitStatus or 'All'
    :returns: Dictionary of running instances
    """
    res = self.sysAdminClient.getOverallStatus()
    if not res["OK"]:
      self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
      return res

    val = res['Value'][instanceType]
    runningAgents = defaultdict(dict)
    for system, agents in val.iteritems():
      for agentName, agentInfo in agents.iteritems():
        if agentInfo['Setup'] and agentInfo['Installed']:
          if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus:
            continue
          confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName)
          for option, default in (('PollingTime', HOUR), ('Port', None)):
            optPath = os.path.join(confPath, option)
            runningAgents[agentName][option] = gConfig.getValue(optPath, default)
          runningAgents[agentName]["LogFileLocation"] = \
              os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current')
          runningAgents[agentName]["PID"] = agentInfo["PID"]
          runningAgents[agentName]['Module'] = agentInfo['Module']
          runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus']
          runningAgents[agentName]['System'] = system

    return S_OK(runningAgents)

  def on_terminate(self, agentName, process):
    """Execute callback when a process terminates gracefully."""
    self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid))

  def execute(self):
    """Execute checks for agents, executors, services."""
    for instanceType in ('executor', 'agent', 'service'):
      for name, options in getattr(self, instanceType + 's').iteritems():
        # call checkAgent, checkExecutor, checkService
        res = getattr(self, 'check' + instanceType.capitalize())(name, options)
        if not res['OK']:
          self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res['Message']))

    res = self.componentControl()
    if not res['OK']:
      if "Stopped does not exist" not in res['Message'] and \
         "Running does not exist" not in res['Message']:
        self.logError("Failure to control components", res['Message'])

    if not self.errors:
      res = self.checkURLs()
      if not res['OK']:
        self.logError("Failure to check URLs", res['Message'])
    else:
      self.logError('Something was wrong before, not checking URLs this time')

    self.sendNotification()

    if self.errors:
      return S_ERROR("Error during this cycle, check log")

    return S_OK()

  @staticmethod
  def getLastAccessTime(logFileLocation):
    """Return the age of log file."""
    lastAccessTime = 0
    try:
      lastAccessTime = os.path.getmtime(logFileLocation)
      lastAccessTime = datetime.fromtimestamp(lastAccessTime)
    except OSError as e:
      return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e))

    now = datetime.now()
    age = now - lastAccessTime
    return S_OK(age)

  def restartInstance(self, pid, instanceName, enabled):
    """Kill a process which is then restarted automatically."""
    if not (self.enabled and enabled):
      self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
      self.accounting[instanceName]["Treatment"] = "Please restart it manually"
      return S_OK(NO_RESTART)

    try:
      agentProc = psutil.Process(int(pid))
      processesToTerminate = agentProc.children(recursive=True)
      processesToTerminate.append(agentProc)

      for proc in processesToTerminate:
        proc.terminate()

      _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5,
                                       callback=partial(self.on_terminate, instanceName))
      for proc in alive:
        self.log.info("Forcefully killing process %s" % proc.pid)
        proc.kill()

      return S_OK()

    except psutil.Error as err:
      self.logError("Exception occurred in terminating processes", "%s" % err)
      return S_ERROR()

  def checkService(self, serviceName, options):
    """Ping the service, restart if the ping does not respond."""
    url = self._getURL(serviceName, options)
    self.log.info("Pinging service", url)
    pingRes = Client().ping(url=url)
    if not pingRes['OK']:
      self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message']))
      res = self.restartInstance(int(options['PID']), serviceName, self.restartServices)
      if not res["OK"]:
        return res
      elif res['OK'] and res['Value'] != NO_RESTART:
        self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
        self.log.info("Agent %s has been successfully restarted" % serviceName)
    self.log.info("Service responded OK")
    return S_OK()

  def checkAgent(self, agentName, options):
    """Check the age of agent's log file, if it is too old then restart the agent."""
    pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID']
    self.log.info("Checking Agent: %s" % agentName)
    self.log.info("Polling Time: %s" % pollingTime)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES)))

    maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
    if age.seconds < maxLogAge:
      return S_OK()

    self.log.info("Current log file is too old for Agent %s" % agentName)
    self.accounting[agentName]["LogAge"] = age.seconds / MINUTES

    res = self.restartInstance(int(pid), agentName, self.restartAgents)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[agentName]["Treatment"] = "Successfully Restarted"
      self.log.info("Agent %s has been successfully restarted" % agentName)

    return S_OK()

  def checkExecutor(self, executor, options):
    """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors."""
    currentLogLocation = options['LogFileLocation']
    pid = options['PID']
    self.log.info("Checking executor: %s" % executor)
    self.log.info("Current Log File location: %s" % currentLogLocation)

    res = self.getLastAccessTime(currentLogLocation)
    if not res["OK"]:
      return res

    age = res["Value"]
    self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES)))

    if age.seconds < 2 * HOUR:
      return S_OK()

    self.log.info("Current log file is too old for Executor %s" % executor)
    self.accounting[executor]["LogAge"] = age.seconds / MINUTES

    res = self.checkForCheckingJobs(executor)
    if not res['OK']:
      return res
    if res['OK'] and res['Value'] == NO_CHECKING_JOBS:
      self.accounting.pop(executor, None)
      return S_OK(NO_RESTART)

    res = self.restartInstance(int(pid), executor, self.restartExecutors)
    if not res["OK"]:
      return res
    elif res['OK'] and res['Value'] != NO_RESTART:
      self.accounting[executor]["Treatment"] = "Successfully Restarted"
      self.log.info("Executor %s has been successfully restarted" % executor)

    return S_OK()

  def checkForCheckingJobs(self, executorName):
    """Check if there are checking jobs with the **executorName** as current MinorStatus."""
    attrDict = {'Status': 'Checking', 'MinorStatus': executorName}

    # returns list of jobs IDs
    resJobs = self.jobMonClient.getJobs(attrDict)
    if not resJobs['OK']:
      self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message']))
      return resJobs
    if resJobs['Value']:
      self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName))
      return S_OK(CHECKING_JOBS)
    self.log.info("Found no jobs in 'Checking' status for %s" % executorName)
    return S_OK(NO_CHECKING_JOBS)

  def componentControl(self):
    """Monitor and control component status as defined in the CS.

    Check for running and stopped components and ensure they have the proper status as defined in the CS
    Registry/Hosts/_HOST_/[Running|Stopped] sections

    :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
       :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
    """
    # get the current status of the components

    resCurrent = self._getCurrentComponentStatus()
    if not resCurrent['OK']:
      return resCurrent
    currentStatus = resCurrent['Value']

    resDefault = self._getDefaultComponentStatus()
    if not resDefault['OK']:
      return resDefault
    defaultStatus = resDefault['Value']

    # ensure instances are in the right state
    shouldBe = {}
    shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down'])
    shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run'])
    shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All'])

    self._ensureComponentRunning(shouldBe['Run'])
    self._ensureComponentDown(shouldBe['Down'])

    for instance in shouldBe['Unknown']:
      self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

    return S_OK()

  def _getCurrentComponentStatus(self):
    """Get current status for components."""
    resOverall = self.sysAdminClient.getOverallStatus()
    if not resOverall['OK']:
      return resOverall
    currentStatus = {'Down': set(), 'Run': set(), 'All': set()}
    informationDict = resOverall['Value']
    for systemsDict in informationDict.values():
      for system, instancesDict in systemsDict.items():
        for instanceName, instanceInfoDict in instancesDict.items():
          identifier = '%s__%s' % (system, instanceName)
          runitStatus = instanceInfoDict.get('RunitStatus')
          if runitStatus in ('Run', 'Down'):
            currentStatus[runitStatus].add(identifier)

    currentStatus['All'] = currentStatus['Run'] | currentStatus['Down']
    return S_OK(currentStatus)

  def _getDefaultComponentStatus(self):
    """Get the configured status of the components."""
    host = socket.gethostname()
    defaultStatus = {'Down': set(), 'Run': set(), 'All': set()}
    resRunning = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running'))
    resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped'))
    if not resRunning['OK']:
      return resRunning
    if not resStopped['OK']:
      return resStopped
    defaultStatus['Run'] = set(resRunning['Value'].keys())
    defaultStatus['Down'] = set(resStopped['Value'].keys())
    defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down']

    if defaultStatus['Run'].intersection(defaultStatus['Down']):
      self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down'])))
      return S_ERROR("Bad host configuration")

    return S_OK(defaultStatus)

  def _ensureComponentRunning(self, shouldBeRunning):
    """Ensure the correct components are running."""
    for instance in shouldBeRunning:
      self.log.info("Starting instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.startComponent(system, name)
        if not res['OK']:
          self.logError("Failed to start component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was down, started instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is down, should be started"

  def _ensureComponentDown(self, shouldBeDown):
    """Ensure the correct components are not running."""
    for instance in shouldBeDown:
      self.log.info("Stopping instance %s" % instance)
      system, name = instance.split('__')
      if self.controlComponents:
        res = self.sysAdminClient.stopComponent(system, name)
        if not res['OK']:
          self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message']))
        else:
          self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
      else:
        self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

  def checkURLs(self):
    """Ensure that the running services have their URL in the Config."""
    self.log.info("Checking URLs")
    # get services again, in case they were started/stop in controlComponents
    gConfig.forceRefresh(fromMaster=True)
    res = self.getRunningInstances(instanceType='Services', runitStatus='All')
    if not res["OK"]:
      return S_ERROR("Failure to get running services")
    self.services = res["Value"]
    for service, options in self.services.iteritems():
      self.log.debug("Checking URL for %s with options %s" % (service, options))
      # ignore SystemAdministrator, does not have URLs
      if 'SystemAdministrator' in service:
        continue
      self._checkServiceURL(service, options)

    if self.csAPI.csModified and self.commitURLs:
      self.log.info("Commiting changes to the CS")
      result = self.csAPI.commit()
      if not result['OK']:
        self.logError('Commit to CS failed', result['Message'])
        return S_ERROR("Failed to commit to CS")
    return S_OK()

  def _checkServiceURL(self, serviceName, options):
    """Ensure service URL is properly configured in the CS."""
    url = self._getURL(serviceName, options)
    system = options['System']
    module = options['Module']
    self.log.info("Checking URLs for %s/%s" % (system, module))
    urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module)
    urls = gConfig.getValue(urlsConfigPath, [])
    self.log.debug("Found configured URLs for %s: %s" % (module, urls))
    self.log.debug("This URL is %s" % url)
    runitStatus = options['RunitStatus']
    wouldHave = 'Would have ' if not self.commitURLs else ''
    if runitStatus == 'Run' and url not in urls:
      urls.append(url)
      message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
    if runitStatus == 'Down' and url in urls:
      urls.remove(url)
      message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
      self.log.info(message)
      self.accounting[serviceName + "/URL"]["Treatment"] = message
      self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

  @staticmethod
  def _getURL(serviceName, options):
    """Return URL for the service."""
    system = options['System']
    port = options['Port']
    host = socket.gethostname()
    url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName)
    return url
Example #32
0
    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job
    """
        try:
            accountingReport = Job()
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, 'CPUNormalizationFactor')
            if not result['OK'] or not result['Value']:
                self.log.error(
                    'Error getting Job Parameter CPUNormalizationFactor, setting 0',
                    result.get('Message', 'No such value'))
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(
                    result['Value'].get('CPUNormalizationFactor'))

        except Exception as e:
            self.log.exception(
                "Exception in __sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e)
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData['ExecTime']:
            acData['ExecTime'] = acData['CPUTime']
        elif acData['ExecTime'] < acData['CPUTime']:
            acData['ExecTime'] = acData['CPUTime']

        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result
Example #33
0
    def finalizeRequest(self, requestID, jobID, useCertificates=True):
        """check request status and perform finalization if necessary
            update the request status and the corresponding job parameter

        :param self: self reference
        :param str requestID: request id
        :param int jobID: job id
        """

        stateServer = JobStateUpdateClient(useCertificates=useCertificates)

        # Checking if to update the job status - we should fail here, so it will be re-tried later
        # Checking the state, first
        res = self.getRequestStatus(requestID)
        if not res["OK"]:
            self.log.error(
                "finalizeRequest: failed to get request",
                "request: %s status: %s" % (requestID, res["Message"]))
            return res
        if res["Value"] != "Done":
            return S_ERROR(
                "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                % (requestID, res["Value"]))

        # The request is 'Done', let's update the job status. If we fail, we should re-try later

        monitorServer = JobMonitoringClient(useCertificates=useCertificates)
        res = monitorServer.getJobSummary(int(jobID))
        if not res["OK"]:
            self.log.error("finalizeRequest: Failed to get job status",
                           "JobID: %d" % jobID)
            return res
        elif not res["Value"]:
            self.log.info(
                "finalizeRequest: job %d does not exist (anymore): finalizing"
                % jobID)
            return S_OK()
        else:
            jobStatus = res["Value"]["Status"]
            jobMinorStatus = res["Value"]["MinorStatus"]
            jobAppStatus = ""
            newJobStatus = ""
            if jobStatus == JobStatus.STALLED:
                # If job is stalled, find the previous status from the logging info
                res = monitorServer.getJobLoggingInfo(int(jobID))
                if not res["OK"]:
                    self.log.error(
                        "finalizeRequest: Failed to get job logging info",
                        "JobID: %d" % jobID)
                    return res
                # Check the last status was Stalled and get the one before
                if len(res["Value"]
                       ) >= 2 and res["Value"][-1][0] == JobStatus.STALLED:
                    jobStatus, jobMinorStatus, jobAppStatus = res["Value"][
                        -2][:3]
                    newJobStatus = jobStatus

            # update the job pending request digest in any case since it is modified
            self.log.info(
                "finalizeRequest: Updating request digest for job %d" % jobID)

            digest = self.getDigest(requestID)
            if digest["OK"]:
                digest = digest["Value"]
                self.log.verbose(digest)
                res = stateServer.setJobParameter(jobID, "PendingRequest",
                                                  digest)
                if not res["OK"]:
                    self.log.info(
                        "finalizeRequest: Failed to set job %d parameter: %s" %
                        (jobID, res["Message"]))
                    return res
            else:
                self.log.error(
                    "finalizeRequest: Failed to get request digest for %s: %s"
                    % (requestID, digest["Message"]))
            if jobStatus == JobStatus.COMPLETED:
                # What to do? Depends on what we have in the minorStatus
                if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
                    newJobStatus = JobStatus.DONE
                elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
                    newJobStatus = JobStatus.FAILED
                elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
                    # If the job has been Killed, set it Killed
                    newJobStatus = JobStatus.KILLED
                else:
                    self.log.error(
                        "finalizeRequest: Unexpected jobMinorStatus",
                        "for %d (got %s)" % (jobID, jobMinorStatus))
                    return S_ERROR("Unexpected jobMinorStatus")

            if newJobStatus:
                self.log.info(
                    "finalizeRequest: Updating job status",
                    "for %d to '%s/%s'" %
                    (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
                )
            else:
                self.log.info(
                    "finalizeRequest: Updating job minor status",
                    "for %d to '%s' (current status is %s)" %
                    (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
                )
            stateUpdate = stateServer.setJobStatus(
                jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
            if jobAppStatus and stateUpdate["OK"]:
                stateUpdate = stateServer.setJobApplicationStatus(
                    jobID, jobAppStatus, "RMS")
            if not stateUpdate["OK"]:
                self.log.error(
                    "finalizeRequest: Failed to set job status",
                    "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
                )
                return stateUpdate

        return S_OK(newJobStatus)
Example #34
0
class WorkflowTasks(TaskBase):
    """ Handles jobs
  """
    def __init__(self,
                 transClient=None,
                 logger=None,
                 submissionClient=None,
                 jobMonitoringClient=None,
                 outputDataModule=None,
                 jobClass=None,
                 opsH=None,
                 destinationPlugin=None,
                 ownerDN=None,
                 ownerGroup=None):
        """ Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
    """

        if not logger:
            logger = gLogger.getSubLogger('WorkflowTasks')

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = True if (bool(ownerDN)
                                   and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                'Transformations/DestinationPlugin', 'BySE')
        else:
            self.destinationPlugin = destinationPlugin

        self.destinationPlugin_o = None

        self.outputDataModule_o = None

    def prepareTransformationTasks(self,
                                   transBody,
                                   taskDict,
                                   owner='',
                                   ownerGroup='',
                                   ownerDN='',
                                   bulkSubmissionFlag=False):
        """ Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.


    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation
    :param bool bulkSubmissionFlag: flag for using bulk submission or not

    :return: S_OK/S_ERROR with updated taskDict
    """

        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res['OK']:
                return res
            proxyInfo = res['Value']
            owner = proxyInfo['username']
            ownerGroup = proxyInfo['group']

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res['OK']:
                return res
            ownerDN = res['Value'][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner,
                                           ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup,
                                   ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup,
                           ownerDN):
        """ Prepare transformation tasks with a single job object for bulk submission

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param str owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return: S_OK/S_ERROR with updated taskDict
    """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasksBulk'
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose('Setting job owner:group to %s:%s' %
                         (owner, ownerGroup),
                         transID=transID,
                         method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        jobType = oJob.workflow.findParameter('JobType').getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter('JOB_ID'):
            oJob._addParameter(oJob.workflow, 'JOB_ID', 'string', '00000000',
                               "Initial JOB_ID")

        if oJob.workflow.findParameter('PRODUCTION_ID'):
            oJob._setParamValue('PRODUCTION_ID', str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                'PRODUCTION_ID',
                'string',
                str(transID).zfill(8),
                "Production ID")
        oJob.setType(jobType)
        self._logVerbose('Adding default transformation group of %s' %
                         (transGroup),
                         transID=transID,
                         method=method)
        oJob.setJobGroup(transGroup)

        if int(transID) in [
                int(x)
                for x in self.opsH.getValue("Hospital/Transformations", [])
        ]:
            self._handleHospital(oJob)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            paramsDict['JobType'] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID,
                               method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose('Setting Site: ',
                                 str(sites),
                                 transID=transID,
                                 method=method)
                seqDict['Site'] = sites

            seqDict['JobName'] = self._transTaskName(transID, taskID)
            seqDict['JOB_ID'] = str(taskID).zfill(8)

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' %
                           (transID, taskID, str(paramsDict)),
                           transID=transID,
                           method=method)

            # Handle Input Data
            inputData = paramsDict.get('InputData')
            if inputData:
                if isinstance(inputData, basestring):
                    inputData = inputData.replace(' ', '').split(';')
                self._logVerbose('Setting input data to %s' % inputData,
                                 transID=transID,
                                 method=method)
                seqDict['InputData'] = inputData
            elif paramSeqDict.get('InputData') is not None:
                self._logError(
                    "Invalid mixture of jobs with and without input data")
                return S_ERROR(
                    ETSDATA,
                    "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.iteritems():
                if paramName not in ('InputData', 'Site', 'TargetSE'):
                    if paramValue:
                        self._logVerbose('Setting %s to %s' %
                                         (paramName, paramValue),
                                         transID=transID,
                                         method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({
                    'Job': oJob._toXML(),
                    'TransformationID': transID,  # pylint: disable=protected-access
                    'TaskID': taskID,
                    'InputData': inputData
                })
                if not res['OK']:
                    self._logError("Failed to generate output data",
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow,  # pylint: disable=protected-access
                            name,
                            'JDL',
                            "%%(%s)s" % name,
                            name)

            for pName, seq in seqDict.iteritems():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.iteritems():
            if paramName in ['JOB_ID', 'PRODUCTION_ID', 'InputData'
                             ] + outputParameterList:
                res = oJob.setParameterSequence(paramName,
                                                paramSeq,
                                                addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res['OK']:
                return res

        if taskDict:
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)

        taskDict['BulkJobObject'] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """ Prepare transformation tasks with a job object per task

    :param str transBody: transformation job template
    :param dict taskDict: dictionary of per task parameters
    :param owner: owner of the transformation
    :param str ownerGroup: group of the owner of the transformation
    :param str ownerDN: DN of the owner of the transformation

    :return:  S_OK/S_ERROR with updated taskDict
    """
        if taskDict:
            transID = taskDict.values()[0]['TransformationID']
        else:
            return S_OK({})

        method = '__prepareTasks'
        startTime = time.time()
        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)
        try:
            site = oJobTemplate.workflow.findParameter('Site').getValue()
        except AttributeError:
            site = None
        jobType = oJobTemplate.workflow.findParameter('JobType').getValue()
        templateOK = False
        getOutputDataTiming = 0.
        for taskID, paramsDict in taskDict.iteritems():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose('Job owner:group to %s:%s' %
                                 (owner, ownerGroup),
                                 transID=transID,
                                 method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose('Adding default transformation group of %s' %
                                 (transGroup),
                                 transID=transID,
                                 method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter('PRODUCTION_ID'):
                    oJobTemplate._setParamValue('PRODUCTION_ID',
                                                str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               'PRODUCTION_ID', 'string',
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter('JOB_ID'):
                    oJobTemplate._addParameter(oJobTemplate.workflow, 'JOB_ID',
                                               'string', '00000000',
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict['Site'] = site
            paramsDict['JobType'] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose('Setting task name to %s' % constructedName,
                             transID=transID,
                             method=method)
            oJob.setName(constructedName)
            oJob._setParamValue('JOB_ID', str(taskID).zfill(8))
            inputData = None

            self._logDebug('TransID: %s, TaskID: %s, paramsDict: %s' %
                           (transID, taskID, str(paramsDict)),
                           transID=transID,
                           method=method)

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError('Could not get a list a sites',
                               transID=transID,
                               method=method)
                paramsDict['TaskObject'] = ''
                continue
            else:
                self._logDebug('Setting Site: ',
                               str(sites),
                               transID=transID,
                               method=method)
                res = oJob.setDestination(sites)
                if not res['OK']:
                    self._logError('Could not set the site: %s' %
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    paramsDict['TaskObject'] = ''
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            hospitalTrans = [
                int(x)
                for x in self.opsH.getValue("Hospital/Transformations", [])
            ]
            if int(transID) in hospitalTrans:
                self._handleHospital(oJob)

            paramsDict['TaskObject'] = ''
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({
                    'Job': oJob._toXML(),
                    'TransformationID': transID,
                    'TaskID': taskID,
                    'InputData': inputData
                })
                getOutputDataTiming += time.time()
                if not res['OK']:
                    self._logError("Failed to generate output data",
                                   res['Message'],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res['Value'].iteritems():
                    oJob._addJDLParameter(name, ';'.join(output))
            paramsDict['TaskObject'] = oJob
        if taskDict:
            self._logVerbose('Average getOutputData time: %.1f per task' %
                             (getOutputDataTiming / len(taskDict)),
                             transID=transID,
                             method=method)
            self._logInfo('Prepared %d tasks' % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """ Handle Sites and TargetSE in the parameters
    """

        try:
            sites = ['ANY']
            if paramsDict['Site']:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict['Site'], sepChar=';')
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res['OK']:
                self._logFatal(
                    "Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res['Value']
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ['ANY']:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """ set job inputs (+ metadata)
    """
        inputData = paramsDict.get('InputData')
        transID = paramsDict['TransformationID']
        if inputData:
            self._logVerbose('Setting input data to %s' % inputData,
                             transID=transID,
                             method='_handleInputs')
            res = oJob.setInputData(inputData)
            if not res['OK']:
                self._logError("Could not set the inputs: %s" % res['Message'],
                               transID=transID,
                               method='_handleInputs')

    def _handleRest(self, oJob, paramsDict):
        """ add as JDL parameters all the other parameters that are not for inputs or destination
    """
        transID = paramsDict['TransformationID']
        for paramName, paramValue in paramsDict.iteritems():
            if paramName not in ('InputData', 'Site', 'TargetSE'):
                if paramValue:
                    self._logDebug('Setting %s to %s' %
                                   (paramName, paramValue),
                                   transID=transID,
                                   method='_handleRest')
                    oJob._addJDLParameter(paramName, paramValue)

    def _handleHospital(self, oJob):
        """ Optional handle of hospital jobs
    """
        oJob.setType('Hospital')
        oJob.setInputDataPolicy('download', dataScheduling=False)
        hospitalSite = self.opsH.getValue("Hospital/HospitalSite",
                                          'DIRAC.JobDebugger.ch')
        oJob.setDestination(hospitalSite)
        hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])
        if hospitalCEs:
            oJob._addJDLParameter('GridCE', hospitalCEs)

    def __generatePluginObject(self, plugin):
        """ This simply instantiates the TaskManagerPlugin class with the relevant plugin name
    """
        method = '__generatePluginObject'
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ['TaskManagerPlugin'])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" %
                               (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule,
                               'TaskManagerPlugin')('%s' % plugin,
                                                    operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e),
                               method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """ Get the list of job output LFNs from the provided plugin
    """
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule,
                                                     None)
            if not moduleInstance['OK']:
                return moduleInstance
            self.outputDataModule_o = moduleInstance['Value']
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """ Submit the tasks
    """
        if 'BulkJobObject' in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """ Submit jobs in one go with one parametric job
    """
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = '__submitTransformationTasksBulk'

        oJob = taskDict.pop('BulkJobObject')
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = taskDict.values()[0]['TransformationID']
        if oJob is None:
            self._logError('no bulk Job object found',
                           transID=transID,
                           method=method)
            return S_ERROR(ETSUKN,
                           'No bulk job object provided for submission')

        result = self.submitTaskToExternal(oJob)
        if not result['OK']:
            self._logError('Failed to submit tasks to external',
                           transID=transID,
                           method=method)
            return result

        jobIDList = result['Value']
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task['Success'] = False
            return S_ERROR(
                ETSUKN, 'Submitted less number of jobs than requested tasks')
        # Get back correspondance with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]['ExternalID'] = jobID
            taskDict[taskID]['Success'] = True

        submitted = len(jobIDList)
        self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                      (submitted, time.time() - startTime),
                      transID=transID,
                      method=method)
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """ Submit jobs one by one
    """
        method = '__submitTransformationTasks'
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.itervalues():
            transID = task['TransformationID']
            if not task['TaskObject']:
                task['Success'] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task['TaskObject'])
            if res['OK']:
                task['ExternalID'] = res['Value']
                task['Success'] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS",
                               res['Message'],
                               transID=transID,
                               method=method)
                task['Success'] = False
                failed += 1
        if submitted:
            self._logInfo('Submitted %d tasks to WMS in %.1f seconds' %
                          (submitted, time.time() - startTime),
                          transID=transID,
                          method=method)
        if failed:
            self._logError('Failed to submit %d tasks to WMS.' % (failed),
                           transID=transID,
                           method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """ Submits a single job (which can be a bulk one) to the WMS.
    """
        if isinstance(job, basestring):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", '', x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO.StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        transID = None
        jobNames = [
            self._transTaskName(taskDict['TransformationID'],
                                taskDict['TaskID']) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({'JobName': jobNames})
        if not res['OK']:
            self._logError("Failed to get task from WMS",
                           res['Message'],
                           transID=transID,
                           method='updateTransformationReservedTasks')
            return res
        jobNameIDs = {}
        for wmsID in res['Value']:
            res = self.jobMonitoringClient.getJobPrimarySummary(int(wmsID))
            if not res['OK']:
                self._logWarn("Failed to get task summary from WMS",
                              res['Message'],
                              transID=transID,
                              method='updateTransformationReservedTasks')
            else:
                jobNameIDs[res['Value']['JobName']] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({'NoTasks': noTask, 'TaskNameIDs': jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
    Check the status of a list of tasks and return lists of taskIDs for each new status
    """
        method = 'getSubmittedTaskStatus'

        if taskDicts:
            wmsIDs = [
                int(taskDict['ExternalID']) for taskDict in taskDicts
                if int(taskDict['ExternalID'])
            ]
            transID = taskDicts[0]['TransformationID']
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res['Value']
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict['TaskID']
            wmsID = int(taskDict['ExternalID'])
            if not wmsID:
                continue
            oldStatus = taskDict['ExternalStatus']
            newStatus = statusDict.get(wmsID, {}).get('Status', 'Removed')
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        'Production/Job %d/%d removed from WMS while it is in %s status'
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method)
                    newStatus = "Failed"
                self._logVerbose(
                    'Setting job status for Production/Job %d/%d to %s' %
                    (transID, taskID, newStatus),
                    transID=transID,
                    method=method)
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
    Check the status of a list of files and return the new status of each LFN
    """
        if not fileDicts:
            return S_OK({})

        method = 'getSubmittedFileStatus'

        # All files are from the same transformation
        transID = fileDicts[0]['TransformationID']
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict['TaskID'])
            taskFiles.setdefault(jobName,
                                 {})[fileDict['LFN']] = fileDict['Status']

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res['OK']:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID,
                          method=method)
            return res
        noTasks = res['Value']['NoTasks']
        taskNameIDs = res['Value']['TaskNameIDs']

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].iteritems():
                if oldStatus != 'Unused':
                    updateDict[lfn] = 'Unused'

        res = self.jobMonitoringClient.getJobsStatus(taskNameIDs.values())
        if not res['OK']:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res['Value']
        for jobName, wmsID in taskNameIDs.iteritems():
            jobStatus = statusDict.get(wmsID, {}).get('Status')
            newFileStatus = {
                'Done': 'Processed',
                'Completed': 'Processed',
                'Failed': 'Unused'
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].iteritems():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
Example #35
0
  def test_JobStateUpdateAndJobMonitoring( self ):
    """ Verifying all JobStateUpdate and JobMonitoring functions
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile( job )

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )
    jobID = int ( res['Value'] )
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL( jobID, True )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobJDL( jobID, False )
    self.assert_( res['OK'] )

    # Adding stuff
    res = jobStateUpdate.setJobStatus( jobID, 'Matched', 'matching', 'source' )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobParameters( jobID, [( 'par1', 'par1Value' ), ( 'par2', 'par2Value' )] )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobApplicationStatus( jobID, 'app status', 'source' )
    self.assert_( res['OK'] )
#     res = jobStateUpdate.setJobFlag()
#     self.assert_( res['OK'] )
#     res = jobStateUpdate.unsetJobFlag()
#     self.assert_( res['OK'] )
    res = jobStateUpdate.setJobSite( jobID, 'Site' )
    self.assert_( res['OK'] )
#     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
#     self.assert_( res['OK'] )

    # now checking few things
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Running' )
    res = jobMonitor.getJobParameter( jobID, 'par1' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value'} )
    res = jobMonitor.getJobParameters( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'} )
    res = jobMonitor.getJobAttribute( jobID, 'Site' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Site' )
    res = jobMonitor.getJobAttributes( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['JobName'], 'helloWorld' )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['Status'], 'Running' )
    res = jobMonitor.getJobHeartBeatData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getInputData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getJobPrimarySummary( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getAtticJobParameters( jobID )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsStatus( [jobID], 'Done', 'MinorStatus', 'Unknown' )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['Status'], 'Done' )
    self.assertEqual( res['Value']['MinorStatus'], 'MinorStatus' )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    res = jobStateUpdate.sendHeartBeat( jobID, {'bih':'bih'}, {'boh':'boh'} )
    self.assert_( res['OK'] )


    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob( jobID )
Example #36
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """ Verifying all JobStateUpdate and JobMonitoring functions
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assert_(res['OK'])
        jobID = int(res['Value'])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assert_(res['OK'])
        res = jobMonitor.getJobJDL(jobID, False)
        self.assert_(res['OK'])
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {})
        res = jobMonitor.getJobsParameters([jobID], ['Owner'])
        self.assert_(res['OK'])

        # Adding stuff
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'),
                                                      ('par2', 'par2Value')])
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status',
                                                     'source')
        self.assert_(res['OK'])
        #     res = jobStateUpdate.setJobFlag()
        #     self.assert_( res['OK'] )
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assert_( res['OK'] )
        res = jobStateUpdate.setJobSite(jobID, 'Site')
        self.assert_(res['OK'])
        #     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
        #     self.assert_( res['OK'] )

        # now checking few things
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Running')
        res = jobMonitor.getJobParameter(jobID, 'par1')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {'par1': 'par1Value'})
        res = jobMonitor.getJobParameters(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {
            'par1': 'par1Value',
            'par2': 'par2Value'
        })
        res = jobMonitor.getJobAttribute(jobID, 'Site')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Site')
        res = jobMonitor.getJobAttributes(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['JobName'], 'helloWorld')
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['Status'], 'Running')
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getInputData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getJobPrimarySummary(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus',
                                           'Unknown')
        self.assert_(res['OK'])
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['Status'], 'Done')
        self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'},
                                           {'boh': 'boh'})
        self.assert_(res['OK'])

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)
Example #37
0
  def test_JobStateUpdateAndJobMonitoringMultuple( self ):
    """ # Now, let's submit some jobs. Different sites, types, inputs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
      for lfns in lfnss:
        for jobType in types:
          job = helloWorldJob()
          job.setDestination( dest )
          job.setInputData( lfns )
          job.setType( jobType )
          jobDescription = createFile( job )
          res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
          self.assert_( res['OK'] )
          jobID = res['Value']
          jobIDs.append( jobID )

    res = jobMonitor.getSites()
    self.assert_( res['OK'] )
    self.assert_( set( res['Value'] ) <= set( dests + ['ANY', 'DIRAC.Jenkins.org'] ) )
    res = jobMonitor.getJobTypes()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( types ) )
    res = jobMonitor.getApplicationStates()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( ['Unknown'] ) )

    res = jobMonitor.getOwners()
    self.assert_( res['OK'] )
    res = jobMonitor.getOwnerGroup()
    self.assert_( res['OK'] )
    res = jobMonitor.getProductionIds()
    self.assert_( res['OK'] )
    res = jobMonitor.getJobGroups()
    self.assert_( res['OK'] )
    res = jobMonitor.getStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Received'], sorted( ['Received', 'Waiting'] )] )
    res = jobMonitor.getMinorStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Job accepted'], sorted( ['Job accepted', 'matching'] ) ] )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobs()
    self.assert_( res['OK'] )
    self.assert_( set( [str( x ) for x in jobIDs] ) <= set( res['Value'] ) )
#     res = jobMonitor.getCounters(attrList)
#     self.assert_( res['OK'] )
    res = jobMonitor.getCurrentJobCounters()
    self.assert_( res['OK'] )
    try:
      self.assert_( res['Value'].get( 'Received' ) + res['Value'].get( 'Waiting' ) >= long( len( dests ) * len( lfnss ) * len( types ) ) )
    except TypeError:
      pass
    res = jobMonitor.getJobsSummary( jobIDs )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobPageSummaryWeb( {}, [], 0, 100 )
    self.assert_( res['OK'] )

    res = jobStateUpdate.setJobStatusBulk( jobID, {str( datetime.datetime.utcnow() ):{'Status': 'Running',
                                                                                      'MinorStatus': 'MinorStatus',
                                                                                      'ApplicationStatus': 'ApplicationStatus',
                                                                                      'Source': 'Unknown'}} )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsParameter( {jobID:['Status', 'Running']} )
    self.assert_( res['OK'] )

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob( jobIDs )
Example #38
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assert_(res['OK'])
        # self.assertEqual( type( res['Value'] ), int )
        # self.assertEqual( res['Value'], res['JobID'] )
        # jobID = res['JobID']
        jobID = res['Value']

        # updating the status
        jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp',
                                    'source')

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assert_(res['OK'])

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Received')

        # updating the status again
        jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Killed')

        # updating the status aaaagain
        jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'],
                         'Done')  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Deleted')
Example #39
0
  def test_FullChain( self ):
    """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    # create the job
    job = helloWorldJob()
    jobDescription = createFile( job )

    # submit the job
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )
  # self.assertEqual( type( res['Value'] ), int )
  # self.assertEqual( res['Value'], res['JobID'] )
  # jobID = res['JobID']
    jobID = res['Value']

    # updating the status
    jobStateUpdate.setJobStatus( jobID, 'Running', 'Executing Minchiapp', 'source' )

    # reset the job
    res = wmsClient.resetJob( jobID )
    self.assert_( res['OK'] )

    # reschedule the job
    res = wmsClient.rescheduleJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Received' )

    # updating the status again
    jobStateUpdate.setJobStatus( jobID, 'Matched', 'matching', 'source' )

    # kill the job
    res = wmsClient.killJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Killed' )

    # updating the status aaaagain
    jobStateUpdate.setJobStatus( jobID, 'Done', 'matching', 'source' )

    # kill the job
    res = wmsClient.killJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Done' )  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Deleted' )
import time

__RCSID__ = "$Id$"

from DIRAC.Core.Base import Script
Script.parseCommandLine()

from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient

# sut
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient

from DIRAC.tests.Integration.WorkloadManagementSystem.Test_Client_WMS import helloWorldJob, createFile

jobMonitoringClient = JobMonitoringClient()
jobStateUpdateClient = JobStateUpdateClient()


def createJob():

    job = helloWorldJob()
    jobDescription = createFile(job)

    wmsClient = WMSClient()
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    assert res['OK'], res['Message']
    jobID = int(res['Value'])
    return jobID

Example #41
0
 def __init__(self):
     TaskBase.__init__(self)
     self.submissionClient = WMSClient()
     self.jobMonitoringClient = JobMonitoringClient()