예제 #1
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """ Verifying all JobStateUpdate and JobMonitoring functions
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assert_(res['OK'])
        jobID = int(res['Value'])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assert_(res['OK'])
        res = jobMonitor.getJobJDL(jobID, False)
        self.assert_(res['OK'])
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {})
        res = jobMonitor.getJobsParameters([jobID], ['Owner'])
        self.assert_(res['OK'])

        # Adding stuff
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'),
                                                      ('par2', 'par2Value')])
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status',
                                                     'source')
        self.assert_(res['OK'])
        #     res = jobStateUpdate.setJobFlag()
        #     self.assert_( res['OK'] )
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assert_( res['OK'] )
        res = jobStateUpdate.setJobSite(jobID, 'Site')
        self.assert_(res['OK'])
        #     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
        #     self.assert_( res['OK'] )

        # now checking few things
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Running')
        res = jobMonitor.getJobParameter(jobID, 'par1')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {'par1': 'par1Value'})
        res = jobMonitor.getJobParameters(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {
            'par1': 'par1Value',
            'par2': 'par2Value'
        })
        res = jobMonitor.getJobAttribute(jobID, 'Site')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Site')
        res = jobMonitor.getJobAttributes(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['JobName'], 'helloWorld')
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['Status'], 'Running')
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getInputData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getJobPrimarySummary(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus',
                                           'Unknown')
        self.assert_(res['OK'])
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['Status'], 'Done')
        self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'},
                                           {'boh': 'boh'})
        self.assert_(res['OK'])

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)
예제 #2
0
    def finalizeRequest(self, requestID, jobID, useCertificates=True):
        """check request status and perform finalization if necessary
            update the request status and the corresponding job parameter

        :param self: self reference
        :param str requestID: request id
        :param int jobID: job id
        """

        stateServer = JobStateUpdateClient(useCertificates=useCertificates)

        # Checking if to update the job status - we should fail here, so it will be re-tried later
        # Checking the state, first
        res = self.getRequestStatus(requestID)
        if not res["OK"]:
            self.log.error(
                "finalizeRequest: failed to get request",
                "request: %s status: %s" % (requestID, res["Message"]))
            return res
        if res["Value"] != "Done":
            return S_ERROR(
                "The request %s isn't 'Done' but '%s', this should never happen, why are we here?"
                % (requestID, res["Value"]))

        # The request is 'Done', let's update the job status. If we fail, we should re-try later

        monitorServer = JobMonitoringClient(useCertificates=useCertificates)
        res = monitorServer.getJobSummary(int(jobID))
        if not res["OK"]:
            self.log.error("finalizeRequest: Failed to get job status",
                           "JobID: %d" % jobID)
            return res
        elif not res["Value"]:
            self.log.info(
                "finalizeRequest: job %d does not exist (anymore): finalizing"
                % jobID)
            return S_OK()
        else:
            jobStatus = res["Value"]["Status"]
            jobMinorStatus = res["Value"]["MinorStatus"]
            jobAppStatus = ""
            newJobStatus = ""
            if jobStatus == JobStatus.STALLED:
                # If job is stalled, find the previous status from the logging info
                res = monitorServer.getJobLoggingInfo(int(jobID))
                if not res["OK"]:
                    self.log.error(
                        "finalizeRequest: Failed to get job logging info",
                        "JobID: %d" % jobID)
                    return res
                # Check the last status was Stalled and get the one before
                if len(res["Value"]
                       ) >= 2 and res["Value"][-1][0] == JobStatus.STALLED:
                    jobStatus, jobMinorStatus, jobAppStatus = res["Value"][
                        -2][:3]
                    newJobStatus = jobStatus

            # update the job pending request digest in any case since it is modified
            self.log.info(
                "finalizeRequest: Updating request digest for job %d" % jobID)

            digest = self.getDigest(requestID)
            if digest["OK"]:
                digest = digest["Value"]
                self.log.verbose(digest)
                res = stateServer.setJobParameter(jobID, "PendingRequest",
                                                  digest)
                if not res["OK"]:
                    self.log.info(
                        "finalizeRequest: Failed to set job %d parameter: %s" %
                        (jobID, res["Message"]))
                    return res
            else:
                self.log.error(
                    "finalizeRequest: Failed to get request digest for %s: %s"
                    % (requestID, digest["Message"]))
            if jobStatus == JobStatus.COMPLETED:
                # What to do? Depends on what we have in the minorStatus
                if jobMinorStatus == JobMinorStatus.PENDING_REQUESTS:
                    newJobStatus = JobStatus.DONE
                elif jobMinorStatus == JobMinorStatus.APP_ERRORS:
                    newJobStatus = JobStatus.FAILED
                elif jobMinorStatus == JobMinorStatus.MARKED_FOR_TERMINATION:
                    # If the job has been Killed, set it Killed
                    newJobStatus = JobStatus.KILLED
                else:
                    self.log.error(
                        "finalizeRequest: Unexpected jobMinorStatus",
                        "for %d (got %s)" % (jobID, jobMinorStatus))
                    return S_ERROR("Unexpected jobMinorStatus")

            if newJobStatus:
                self.log.info(
                    "finalizeRequest: Updating job status",
                    "for %d to '%s/%s'" %
                    (jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE),
                )
            else:
                self.log.info(
                    "finalizeRequest: Updating job minor status",
                    "for %d to '%s' (current status is %s)" %
                    (jobID, JobMinorStatus.REQUESTS_DONE, jobStatus),
                )
            stateUpdate = stateServer.setJobStatus(
                jobID, newJobStatus, JobMinorStatus.REQUESTS_DONE, "RMS")
            if jobAppStatus and stateUpdate["OK"]:
                stateUpdate = stateServer.setJobApplicationStatus(
                    jobID, jobAppStatus, "RMS")
            if not stateUpdate["OK"]:
                self.log.error(
                    "finalizeRequest: Failed to set job status",
                    "JobID: %d, error: %s" % (jobID, stateUpdate["Message"]),
                )
                return stateUpdate

        return S_OK(newJobStatus)
예제 #3
0
  def test_JobStateUpdateAndJobMonitoring( self ):
    """ Verifying all JobStateUpdate and JobMonitoring functions
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile( job )

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )
    jobID = int ( res['Value'] )
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL( jobID, True )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobJDL( jobID, False )
    self.assert_( res['OK'] )

    # Adding stuff
    res = jobStateUpdate.setJobStatus( jobID, 'Matched', 'matching', 'source' )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobParameters( jobID, [( 'par1', 'par1Value' ), ( 'par2', 'par2Value' )] )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobApplicationStatus( jobID, 'app status', 'source' )
    self.assert_( res['OK'] )
#     res = jobStateUpdate.setJobFlag()
#     self.assert_( res['OK'] )
#     res = jobStateUpdate.unsetJobFlag()
#     self.assert_( res['OK'] )
    res = jobStateUpdate.setJobSite( jobID, 'Site' )
    self.assert_( res['OK'] )
#     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
#     self.assert_( res['OK'] )

    # now checking few things
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Running' )
    res = jobMonitor.getJobParameter( jobID, 'par1' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value'} )
    res = jobMonitor.getJobParameters( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'} )
    res = jobMonitor.getJobAttribute( jobID, 'Site' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Site' )
    res = jobMonitor.getJobAttributes( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['JobName'], 'helloWorld' )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['Status'], 'Running' )
    res = jobMonitor.getJobHeartBeatData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getInputData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getJobPrimarySummary( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getAtticJobParameters( jobID )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsStatus( [jobID], 'Done', 'MinorStatus', 'Unknown' )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['Status'], 'Done' )
    self.assertEqual( res['Value']['MinorStatus'], 'MinorStatus' )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    res = jobStateUpdate.sendHeartBeat( jobID, {'bih':'bih'}, {'boh':'boh'} )
    self.assert_( res['OK'] )


    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob( jobID )
예제 #4
0
class WorkflowTasks(TaskBase):
    """Handles jobs"""
    def __init__(
        self,
        transClient=None,
        logger=None,
        submissionClient=None,
        jobMonitoringClient=None,
        outputDataModule=None,
        jobClass=None,
        opsH=None,
        destinationPlugin=None,
        ownerDN=None,
        ownerGroup=None,
    ):
        """Generates some default objects.
        jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works:
        VOs can pass in their job class extension, if present
        """

        if not logger:
            logger = gLogger.getSubLogger(self.__class__.__name__)

        super(WorkflowTasks, self).__init__(transClient, logger)

        useCertificates = True if (bool(ownerDN)
                                   and bool(ownerGroup)) else False
        if not submissionClient:
            self.submissionClient = WMSClient(useCertificates=useCertificates,
                                              delegatedDN=ownerDN,
                                              delegatedGroup=ownerGroup)
        else:
            self.submissionClient = submissionClient

        if not jobMonitoringClient:
            self.jobMonitoringClient = JobMonitoringClient()
        else:
            self.jobMonitoringClient = jobMonitoringClient

        if not jobClass:
            self.jobClass = Job
        else:
            self.jobClass = jobClass

        if not opsH:
            self.opsH = Operations()
        else:
            self.opsH = opsH

        if not outputDataModule:
            self.outputDataModule = self.opsH.getValue(
                "Transformations/OutputDataModule", "")
        else:
            self.outputDataModule = outputDataModule

        if not destinationPlugin:
            self.destinationPlugin = self.opsH.getValue(
                "Transformations/DestinationPlugin", "BySE")
        else:
            self.destinationPlugin = destinationPlugin

        self.destinationPlugin_o = None

        self.outputDataModule_o = None

    def prepareTransformationTasks(self,
                                   transBody,
                                   taskDict,
                                   owner="",
                                   ownerGroup="",
                                   ownerDN="",
                                   bulkSubmissionFlag=False):
        """Prepare tasks, given a taskDict, that is created (with some manipulation) by the DB
            jobClass is by default "DIRAC.Interfaces.API.Job.Job". An extension of it also works.


        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation
        :param bool bulkSubmissionFlag: flag for using bulk submission or not

        :return: S_OK/S_ERROR with updated taskDict
        """

        if (not owner) or (not ownerGroup):
            res = getProxyInfo(False, False)
            if not res["OK"]:
                return res
            proxyInfo = res["Value"]
            owner = proxyInfo["username"]
            ownerGroup = proxyInfo["group"]

        if not ownerDN:
            res = getDNForUsername(owner)
            if not res["OK"]:
                return res
            ownerDN = res["Value"][0]

        if bulkSubmissionFlag:
            return self.__prepareTasksBulk(transBody, taskDict, owner,
                                           ownerGroup, ownerDN)
        # not a bulk submission
        return self.__prepareTasks(transBody, taskDict, owner, ownerGroup,
                                   ownerDN)

    def __prepareTasksBulk(self, transBody, taskDict, owner, ownerGroup,
                           ownerDN):
        """Prepare transformation tasks with a single job object for bulk submission

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param str owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return: S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasksBulk"
        startTime = time.time()

        # Prepare the bulk Job object with common parameters
        oJob = self.jobClass(transBody)
        self._logVerbose("Setting job owner:group to %s:%s" %
                         (owner, ownerGroup),
                         transID=transID,
                         method=method)
        oJob.setOwner(owner)
        oJob.setOwnerGroup(ownerGroup)
        oJob.setOwnerDN(ownerDN)

        try:
            site = oJob.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJob.workflow.findParameter("JobType").getValue()
        transGroup = str(transID).zfill(8)

        # Verify that the JOB_ID parameter is added to the workflow
        if not oJob.workflow.findParameter("JOB_ID"):
            oJob._addParameter(oJob.workflow, "JOB_ID", "string", "00000000",
                               "Initial JOB_ID")

        if oJob.workflow.findParameter("PRODUCTION_ID"):
            oJob._setParamValue("PRODUCTION_ID", str(transID).zfill(8))  # pylint: disable=protected-access
        else:
            oJob._addParameter(
                oJob.workflow,  # pylint: disable=protected-access
                "PRODUCTION_ID",
                "string",
                str(transID).zfill(8),
                "Production ID",
            )
        oJob.setType(jobType)
        self._logVerbose("Adding default transformation group of %s" %
                         (transGroup),
                         transID=transID,
                         method=method)
        oJob.setJobGroup(transGroup)

        clinicPath = self._checkSickTransformations(transID)
        if clinicPath:
            self._handleHospital(oJob, clinicPath)

        # Collect per job parameters sequences
        paramSeqDict = {}
        # tasks must be sorted because we use bulk submission and we must find the correspondance
        for taskID in sorted(taskDict):
            paramsDict = taskDict[taskID]
            seqDict = {}

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType

            # Handle destination site
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites",
                               transID=transID,
                               method=method)
                return S_ERROR(ETSUKN, "Can not evaluate destination site")
            else:
                self._logVerbose("Setting Site: ",
                                 str(sites),
                                 transID=transID,
                                 method=method)
                seqDict["Site"] = sites

            seqDict["JobName"] = self._transTaskName(transID, taskID)
            seqDict["JOB_ID"] = str(taskID).zfill(8)

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" %
                (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # Handle Input Data
            inputData = paramsDict.get("InputData")
            if inputData:
                if isinstance(inputData, six.string_types):
                    inputData = inputData.replace(" ", "").split(";")
                self._logVerbose("Setting input data to %s" % inputData,
                                 transID=transID,
                                 method=method)
                seqDict["InputData"] = inputData
            elif paramSeqDict.get("InputData") is not None:
                self._logError(
                    "Invalid mixture of jobs with and without input data")
                return S_ERROR(
                    ETSDATA,
                    "Invalid mixture of jobs with and without input data")

            for paramName, paramValue in paramsDict.items():
                if paramName not in ("InputData", "Site", "TargetSE"):
                    if paramValue:
                        self._logVerbose("Setting %s to %s" %
                                         (paramName, paramValue),
                                         transID=transID,
                                         method=method)
                        seqDict[paramName] = paramValue

            outputParameterList = []
            if self.outputDataModule:
                res = self.getOutputData({
                    "Job": oJob._toXML(),  # pylint: disable=protected-access
                    "TransformationID": transID,
                    "TaskID": taskID,
                    "InputData": inputData,
                })
                if not res["OK"]:
                    self._logError("Failed to generate output data",
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res["Value"].items():
                    seqDict[name] = output
                    outputParameterList.append(name)
                    if oJob.workflow.findParameter(name):
                        oJob._setParamValue(name, "%%(%s)s" % name)  # pylint: disable=protected-access
                    else:
                        oJob._addParameter(
                            oJob.workflow,
                            name,
                            "JDL",
                            "%%(%s)s" % name,
                            name  # pylint: disable=protected-access
                        )

            for pName, seq in seqDict.items():
                paramSeqDict.setdefault(pName, []).append(seq)

        for paramName, paramSeq in paramSeqDict.items():
            if paramName in ["JOB_ID", "PRODUCTION_ID", "InputData"
                             ] + outputParameterList:
                res = oJob.setParameterSequence(paramName,
                                                paramSeq,
                                                addToWorkflow=paramName)
            else:
                res = oJob.setParameterSequence(paramName, paramSeq)
            if not res["OK"]:
                return res

        if taskDict:
            self._logInfo("Prepared %d tasks" % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)

        taskDict["BulkJobObject"] = oJob
        return S_OK(taskDict)

    def __prepareTasks(self, transBody, taskDict, owner, ownerGroup, ownerDN):
        """Prepare transformation tasks with a job object per task

        :param str transBody: transformation job template
        :param dict taskDict: dictionary of per task parameters
        :param owner: owner of the transformation
        :param str ownerGroup: group of the owner of the transformation
        :param str ownerDN: DN of the owner of the transformation

        :return:  S_OK/S_ERROR with updated taskDict
        """
        if taskDict:
            transID = list(taskDict.values())[0]["TransformationID"]
        else:
            return S_OK({})

        method = "__prepareTasks"
        startTime = time.time()

        oJobTemplate = self.jobClass(transBody)
        oJobTemplate.setOwner(owner)
        oJobTemplate.setOwnerGroup(ownerGroup)
        oJobTemplate.setOwnerDN(ownerDN)

        try:
            site = oJobTemplate.workflow.findParameter("Site").getValue()
        except AttributeError:
            site = None
        jobType = oJobTemplate.workflow.findParameter("JobType").getValue()
        templateOK = False
        getOutputDataTiming = 0.0

        for taskID, paramsDict in taskDict.items():
            # Create a job for each task and add it to the taskDict
            if not templateOK:
                templateOK = True
                # Update the template with common information
                self._logVerbose("Job owner:group to %s:%s" %
                                 (owner, ownerGroup),
                                 transID=transID,
                                 method=method)
                transGroup = str(transID).zfill(8)
                self._logVerbose("Adding default transformation group of %s" %
                                 (transGroup),
                                 transID=transID,
                                 method=method)
                oJobTemplate.setJobGroup(transGroup)
                if oJobTemplate.workflow.findParameter("PRODUCTION_ID"):
                    oJobTemplate._setParamValue("PRODUCTION_ID",
                                                str(transID).zfill(8))
                else:
                    oJobTemplate._addParameter(oJobTemplate.workflow,
                                               "PRODUCTION_ID", "string",
                                               str(transID).zfill(8),
                                               "Production ID")
                if not oJobTemplate.workflow.findParameter("JOB_ID"):
                    oJobTemplate._addParameter(oJobTemplate.workflow, "JOB_ID",
                                               "string", "00000000",
                                               "Initial JOB_ID")

            if site is not None:
                paramsDict["Site"] = site
            paramsDict["JobType"] = jobType
            # Now create the job from the template
            oJob = copy.deepcopy(oJobTemplate)
            constructedName = self._transTaskName(transID, taskID)
            self._logVerbose("Setting task name to %s" % constructedName,
                             transID=transID,
                             method=method)
            oJob.setName(constructedName)
            oJob._setParamValue("JOB_ID", str(taskID).zfill(8))
            inputData = None

            self._logDebug(
                "TransID: %s, TaskID: %s, paramsDict: %s" %
                (transID, taskID, str(paramsDict)),
                transID=transID,
                method=method,
            )

            # These helper functions do the real job
            sites = self._handleDestination(paramsDict)
            if not sites:
                self._logError("Could not get a list a sites",
                               transID=transID,
                               method=method)
                paramsDict["TaskObject"] = ""
                continue
            else:
                self._logDebug("Setting Site: ",
                               str(sites),
                               transID=transID,
                               method=method)
                res = oJob.setDestination(sites)
                if not res["OK"]:
                    self._logError("Could not set the site: %s" %
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    paramsDict["TaskObject"] = ""
                    continue

            self._handleInputs(oJob, paramsDict)
            self._handleRest(oJob, paramsDict)

            clinicPath = self._checkSickTransformations(transID)
            if clinicPath:
                self._handleHospital(oJob, clinicPath)

            paramsDict["TaskObject"] = ""
            if self.outputDataModule:
                getOutputDataTiming -= time.time()
                res = self.getOutputData({
                    "Job": oJob._toXML(),
                    "TransformationID": transID,
                    "TaskID": taskID,
                    "InputData": inputData
                })
                getOutputDataTiming += time.time()
                if not res["OK"]:
                    self._logError("Failed to generate output data",
                                   res["Message"],
                                   transID=transID,
                                   method=method)
                    continue
                for name, output in res["Value"].items():
                    oJob._addJDLParameter(name, ";".join(output))
            paramsDict["TaskObject"] = oJob
        if taskDict:
            self._logVerbose(
                "Average getOutputData time: %.1f per task" %
                (getOutputDataTiming / len(taskDict)),
                transID=transID,
                method=method,
            )
            self._logInfo("Prepared %d tasks" % len(taskDict),
                          transID=transID,
                          method=method,
                          reftime=startTime)
        return S_OK(taskDict)

    #############################################################################

    def _handleDestination(self, paramsDict):
        """Handle Sites and TargetSE in the parameters"""

        try:
            sites = ["ANY"]
            if paramsDict["Site"]:
                # 'Site' comes from the XML and therefore is ; separated
                sites = fromChar(paramsDict["Site"], sepChar=";")
        except KeyError:
            pass

        if self.destinationPlugin_o:
            destinationPlugin_o = self.destinationPlugin_o
        else:
            res = self.__generatePluginObject(self.destinationPlugin)
            if not res["OK"]:
                self._logFatal(
                    "Could not generate a destination plugin object")
                return res
            destinationPlugin_o = res["Value"]
            self.destinationPlugin_o = destinationPlugin_o

        destinationPlugin_o.setParameters(paramsDict)
        destSites = destinationPlugin_o.run()
        if not destSites:
            return sites

        # Now we need to make the AND with the sites, if defined
        if sites != ["ANY"]:
            # Need to get the AND
            destSites &= set(sites)

        return list(destSites)

    def _handleInputs(self, oJob, paramsDict):
        """set job inputs (+ metadata)"""
        inputData = paramsDict.get("InputData")
        transID = paramsDict["TransformationID"]
        if inputData:
            self._logVerbose("Setting input data to %s" % inputData,
                             transID=transID,
                             method="_handleInputs")
            res = oJob.setInputData(inputData)
            if not res["OK"]:
                self._logError("Could not set the inputs: %s" % res["Message"],
                               transID=transID,
                               method="_handleInputs")

    def _handleRest(self, oJob, paramsDict):
        """add as JDL parameters all the other parameters that are not for inputs or destination"""
        transID = paramsDict["TransformationID"]
        for paramName, paramValue in paramsDict.items():
            if paramName not in ("InputData", "Site", "TargetSE"):
                if paramValue:
                    self._logDebug("Setting %s to %s" %
                                   (paramName, paramValue),
                                   transID=transID,
                                   method="_handleRest")
                    oJob._addJDLParameter(paramName, paramValue)

    def _checkSickTransformations(self, transID):
        """Check if the transformation is in the transformations to be processed at Hospital or Clinic"""
        transID = int(transID)
        clinicPath = "Hospital"
        if transID in set(
                int(x) for x in self.opsH.getValue(
                    os.path.join(clinicPath, "Transformations"), [])):
            return clinicPath
        if "Clinics" in self.opsH.getSections("Hospital").get("Value", []):
            basePath = os.path.join("Hospital", "Clinics")
            clinics = self.opsH.getSections(basePath)["Value"]
            for clinic in clinics:
                clinicPath = os.path.join(basePath, clinic)
                if transID in set(
                        int(x) for x in self.opsH.getValue(
                            os.path.join(clinicPath, "Transformations"), [])):
                    return clinicPath
        return None

    def _handleHospital(self, oJob, clinicPath):
        """Optional handle of hospital/clinic jobs"""
        if not clinicPath:
            return
        oJob.setInputDataPolicy("download", dataScheduling=False)

        # Check first for a clinic, if not it must be the general hospital
        hospitalSite = self.opsH.getValue(
            os.path.join(clinicPath, "ClinicSite"), "")
        hospitalCEs = self.opsH.getValue(os.path.join(clinicPath, "ClinicCE"),
                                         [])
        # If not found, get the hospital parameters
        if not hospitalSite:
            hospitalSite = self.opsH.getValue("Hospital/HospitalSite",
                                              "DIRAC.JobDebugger.ch")
        if not hospitalCEs:
            hospitalCEs = self.opsH.getValue("Hospital/HospitalCEs", [])

        oJob.setDestination(hospitalSite)
        if hospitalCEs:
            oJob._addJDLParameter("GridCE", hospitalCEs)

    def __generatePluginObject(self, plugin):
        """This simply instantiates the TaskManagerPlugin class with the relevant plugin name"""
        method = "__generatePluginObject"
        try:
            plugModule = __import__(self.pluginLocation, globals(), locals(),
                                    ["TaskManagerPlugin"])
        except ImportError as e:
            self._logException("Failed to import 'TaskManagerPlugin' %s: %s" %
                               (plugin, e),
                               method=method)
            return S_ERROR()
        try:
            plugin_o = getattr(plugModule,
                               "TaskManagerPlugin")("%s" % plugin,
                                                    operationsHelper=self.opsH)
            return S_OK(plugin_o)
        except AttributeError as e:
            self._logException("Failed to create %s(): %s." % (plugin, e),
                               method=method)
            return S_ERROR()

    #############################################################################

    def getOutputData(self, paramDict):
        """Get the list of job output LFNs from the provided plugin"""
        if not self.outputDataModule_o:
            # Create the module object
            moduleFactory = ModuleFactory()

            moduleInstance = moduleFactory.getModule(self.outputDataModule,
                                                     None)
            if not moduleInstance["OK"]:
                return moduleInstance
            self.outputDataModule_o = moduleInstance["Value"]
        # This is the "argument" to the module, set it and then execute
        self.outputDataModule_o.paramDict = paramDict
        return self.outputDataModule_o.execute()

    def submitTransformationTasks(self, taskDict):
        """Submit the tasks"""
        if "BulkJobObject" in taskDict:
            return self.__submitTransformationTasksBulk(taskDict)
        return self.__submitTransformationTasks(taskDict)

    def __submitTransformationTasksBulk(self, taskDict):
        """Submit jobs in one go with one parametric job"""
        if not taskDict:
            return S_OK(taskDict)
        startTime = time.time()

        method = "__submitTransformationTasksBulk"

        oJob = taskDict.pop("BulkJobObject")
        # we can only do this, once the job has been popped, or we _might_ crash
        transID = list(taskDict.values())[0]["TransformationID"]
        if oJob is None:
            self._logError("no bulk Job object found",
                           transID=transID,
                           method=method)
            return S_ERROR(ETSUKN,
                           "No bulk job object provided for submission")

        result = self.submitTaskToExternal(oJob)
        if not result["OK"]:
            self._logError("Failed to submit tasks to external",
                           transID=transID,
                           method=method)
            return result

        jobIDList = result["Value"]
        if len(jobIDList) != len(taskDict):
            for task in taskDict.values():
                task["Success"] = False
            return S_ERROR(
                ETSUKN, "Submitted less number of jobs than requested tasks")
        # Get back correspondence with tasks sorted by ID
        for jobID, taskID in zip(jobIDList, sorted(taskDict)):
            taskDict[taskID]["ExternalID"] = jobID
            taskDict[taskID]["Success"] = True

        submitted = len(jobIDList)
        self._logInfo(
            "Submitted %d tasks to WMS in %.1f seconds" %
            (submitted, time.time() - startTime),
            transID=transID,
            method=method,
        )
        return S_OK(taskDict)

    def __submitTransformationTasks(self, taskDict):
        """Submit jobs one by one"""
        method = "__submitTransformationTasks"
        submitted = 0
        failed = 0
        startTime = time.time()
        for task in taskDict.values():
            transID = task["TransformationID"]
            if not task["TaskObject"]:
                task["Success"] = False
                failed += 1
                continue
            res = self.submitTaskToExternal(task["TaskObject"])
            if res["OK"]:
                task["ExternalID"] = res["Value"]
                task["Success"] = True
                submitted += 1
            else:
                self._logError("Failed to submit task to WMS",
                               res["Message"],
                               transID=transID,
                               method=method)
                task["Success"] = False
                failed += 1
        if submitted:
            self._logInfo(
                "Submitted %d tasks to WMS in %.1f seconds" %
                (submitted, time.time() - startTime),
                transID=transID,
                method=method,
            )
        if failed:
            self._logError("Failed to submit %d tasks to WMS." % (failed),
                           transID=transID,
                           method=method)
        return S_OK(taskDict)

    def submitTaskToExternal(self, job):
        """Submits a single job (which can be a bulk one) to the WMS."""
        if isinstance(job, six.string_types):
            try:
                oJob = self.jobClass(job)
            except Exception as x:  # pylint: disable=broad-except
                self._logException("Failed to create job object", "", x)
                return S_ERROR("Failed to create job object")
        elif isinstance(job, self.jobClass):
            oJob = job
        else:
            self._logError("No valid job description found")
            return S_ERROR("No valid job description found")

        workflowFileObject = StringIO(oJob._toXML())
        jdl = oJob._toJDL(jobDescriptionObject=workflowFileObject)
        return self.submissionClient.submitJob(jdl, workflowFileObject)

    def updateTransformationReservedTasks(self, taskDicts):
        transID = None
        jobNames = [
            self._transTaskName(taskDict["TransformationID"],
                                taskDict["TaskID"]) for taskDict in taskDicts
        ]
        res = self.jobMonitoringClient.getJobs({"JobName": jobNames})
        if not res["OK"]:
            self._logError(
                "Failed to get task from WMS",
                res["Message"],
                transID=transID,
                method="updateTransformationReservedTasks",
            )
            return res
        jobNameIDs = {}
        for wmsID in res["Value"]:
            res = self.jobMonitoringClient.getJobSummary(int(wmsID))
            if not res["OK"]:
                self._logWarn(
                    "Failed to get task summary from WMS",
                    res["Message"],
                    transID=transID,
                    method="updateTransformationReservedTasks",
                )
            else:
                jobNameIDs[res["Value"]["JobName"]] = int(wmsID)

        noTask = list(set(jobNames) - set(jobNameIDs))
        return S_OK({"NoTasks": noTask, "TaskNameIDs": jobNameIDs})

    def getSubmittedTaskStatus(self, taskDicts):
        """
        Check the status of a list of tasks and return lists of taskIDs for each new status
        """
        method = "getSubmittedTaskStatus"

        if taskDicts:
            wmsIDs = [
                int(taskDict["ExternalID"]) for taskDict in taskDicts
                if int(taskDict["ExternalID"])
            ]
            transID = taskDicts[0]["TransformationID"]
        else:
            return S_OK({})
        res = self.jobMonitoringClient.getJobsStatus(wmsIDs)
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res["Value"]
        updateDict = {}
        for taskDict in taskDicts:
            taskID = taskDict["TaskID"]
            wmsID = int(taskDict["ExternalID"])
            if not wmsID:
                continue
            oldStatus = taskDict["ExternalStatus"]
            newStatus = statusDict.get(wmsID, {}).get("Status", "Removed")
            if oldStatus != newStatus:
                if newStatus == "Removed":
                    self._logVerbose(
                        "Production/Job %d/%d removed from WMS while it is in %s status"
                        % (transID, taskID, oldStatus),
                        transID=transID,
                        method=method,
                    )
                    newStatus = "Failed"
                self._logVerbose(
                    "Setting job status for Production/Job %d/%d to %s" %
                    (transID, taskID, newStatus),
                    transID=transID,
                    method=method,
                )
                updateDict.setdefault(newStatus, []).append(taskID)
        return S_OK(updateDict)

    def getSubmittedFileStatus(self, fileDicts):
        """
        Check the status of a list of files and return the new status of each LFN
        """
        if not fileDicts:
            return S_OK({})

        method = "getSubmittedFileStatus"

        # All files are from the same transformation
        transID = fileDicts[0]["TransformationID"]
        taskFiles = {}
        for fileDict in fileDicts:
            jobName = self._transTaskName(transID, fileDict["TaskID"])
            taskFiles.setdefault(jobName,
                                 {})[fileDict["LFN"]] = fileDict["Status"]

        res = self.updateTransformationReservedTasks(fileDicts)
        if not res["OK"]:
            self._logWarn("Failed to obtain taskIDs for files",
                          transID=transID,
                          method=method)
            return res
        noTasks = res["Value"]["NoTasks"]
        taskNameIDs = res["Value"]["TaskNameIDs"]

        updateDict = {}
        for jobName in noTasks:
            for lfn, oldStatus in taskFiles[jobName].items():
                if oldStatus != TransformationFilesStatus.UNUSED:
                    updateDict[lfn] = TransformationFilesStatus.UNUSED

        res = self.jobMonitoringClient.getJobsStatus(list(
            taskNameIDs.values()))
        if not res["OK"]:
            self._logWarn("Failed to get job status from the WMS system",
                          transID=transID,
                          method=method)
            return res
        statusDict = res["Value"]
        for jobName, wmsID in taskNameIDs.items():
            jobStatus = statusDict.get(wmsID, {}).get("Status")
            newFileStatus = {
                "Done": TransformationFilesStatus.PROCESSED,
                "Completed": TransformationFilesStatus.PROCESSED,
                "Failed": TransformationFilesStatus.UNUSED,
            }.get(jobStatus)
            if newFileStatus:
                for lfn, oldStatus in taskFiles[jobName].items():
                    if newFileStatus != oldStatus:
                        updateDict[lfn] = newFileStatus
        return S_OK(updateDict)
예제 #5
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination('DIRAC.Jenkins.ch')
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch', 'Site'},
            msg="Got %s" % res['Value'])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(types),
                         msg="Got %s" % str(sorted(res['Value'])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(['Unknown']),
                         msg="Got %s" % sorted(str(res['Value'])))

        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_empty = res['Value']
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanNow = res['Value']
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanOneYear = res['Value']
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(
                                         ['Job accepted', 'Job Rescheduled'])])
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'], res.get('Message'))
        try:
            self.assertTrue(
                res['Value'].get('Received') +
                res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'], res.get('Message'))

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Matched',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                },
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)):
                {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                },
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)):
                {
                    'Status': 'Completed',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'], res.get('Message'))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value']['Status'], 'Completed')
        self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
예제 #6
0
    def __call__(self):
        """request processing"""

        self.log.debug("about to execute request")
        if not self.rmsMonitoring:
            gMonitor.addMark("RequestAtt", 1)

        # # setup proxy for request owner
        setupProxy = self.setupProxy()
        if not setupProxy["OK"]:
            userSuspended = "User is currently suspended"
            self.request.Error = setupProxy["Message"]
            # In case the user does not have proxy
            if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
                self.log.error("Error setting proxy. Request set to Failed:",
                               setupProxy["Message"])
                # If user is no longer registered, fail the request
                for operation in self.request:
                    for opFile in operation:
                        opFile.Status = "Failed"
                    operation.Status = "Failed"
            elif userSuspended in setupProxy["Message"]:
                # If user is suspended, wait for a long time
                self.request.delayNextExecution(6 * 60)
                self.request.Error = userSuspended
                self.log.error("Error setting proxy: " + userSuspended,
                               self.request.OwnerDN)
            else:
                self.log.error("Error setting proxy", setupProxy["Message"])
            return S_OK(self.request)
        shifter = setupProxy["Value"]["Shifter"]

        error = None

        while self.request.Status == "Waiting":

            # # get waiting operation
            operation = self.request.getWaiting()
            if not operation["OK"]:
                self.log.error("Cannot get waiting operation",
                               operation["Message"])
                return operation
            operation = operation["Value"]
            self.log.info("executing operation", "%s" % operation.Type)

            # # and handler for it
            handler = self.getHandler(operation)
            if not handler["OK"]:
                self.log.error("Unable to process operation",
                               "%s: %s" % (operation.Type, handler["Message"]))
                # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
                operation.Error = handler["Message"]
                break

            handler = handler["Value"]
            # # set shifters list in the handler
            handler.shifter = shifter
            # set rmsMonitoring flag for the RequestOperation
            handler.rmsMonitoring = self.rmsMonitoring
            # # and execute
            pluginName = self.getPluginName(
                self.handlersDict.get(operation.Type))
            if self.standalone:
                useServerCertificate = gConfig.useServerCertificate()
            else:
                # Always use server certificates if executed within an agent
                useServerCertificate = True
            try:
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Attempted",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
                # Always use request owner proxy
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "false")
                exe = handler()
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                if not exe["OK"]:
                    self.log.error("unable to process operation",
                                   "%s: %s" % (operation.Type, exe["Message"]))
                    if pluginName:
                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord({
                                "timestamp":
                                int(Time.toEpoch()),
                                "host":
                                Network.getFQDN(),
                                "objectType":
                                "Operation",
                                "operationType":
                                pluginName,
                                "objectID":
                                operation.OperationID,
                                "parentID":
                                operation.RequestID,
                                "status":
                                "Failed",
                                "nbObject":
                                1,
                            })
                        else:
                            gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Request",
                            "objectID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("RequestFail", 1)

                    if self.request.JobID:
                        # Check if the job exists
                        monitorServer = JobMonitoringClient(
                            useCertificates=True)
                        res = monitorServer.getJobSummary(
                            int(self.request.JobID))
                        if not res["OK"]:
                            self.log.error(
                                "RequestTask: Failed to get job status",
                                "%d" % self.request.JobID)
                        elif not res["Value"]:
                            self.log.warn(
                                "RequestTask: job does not exist (anymore): failed request",
                                "JobID: %d" % self.request.JobID,
                            )
                            for opFile in operation:
                                opFile.Status = "Failed"
                            if operation.Status != "Failed":
                                operation.Status = "Failed"
                            self.request.Error = "Job no longer exists"
            except Exception as e:
                error = str(e)
                self.log.exception("hit by exception:", "%s" % error)
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Request",
                        "objectID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                break

            # # operation status check
            if operation.Status == "Done" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Successful",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
            elif operation.Status == "Failed" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            elif operation.Status in ("Waiting", "Scheduled"):
                # # no update for waiting or all files scheduled
                break

        if not self.rmsMonitoring:
            gMonitor.flush()

        if error:
            return S_ERROR(error)

        # # request done?
        if self.request.Status == "Done":
            # # update request to the RequestDB
            self.log.info("Updating request status:",
                          "%s" % self.request.Status)
            update = self.updateRequest()
            if not update["OK"]:
                self.log.error("Cannot update request status",
                               update["Message"])
                return update
            self.log.info("request is done", "%s" % self.request.RequestName)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp":
                    int(Time.toEpoch()),
                    "host":
                    Network.getFQDN(),
                    "objectType":
                    "Request",
                    "objectID":
                    getattr(self.request, "RequestID", 0),
                    "status":
                    "Successful",
                    "nbObject":
                    1,
                })
            else:
                gMonitor.addMark("RequestOK", 1)
            # # and there is a job waiting for it? finalize!
            if self.request.JobID:
                attempts = 0
                while True:
                    finalizeRequest = self.requestClient.finalizeRequest(
                        self.request.RequestID,
                        self.request.JobID  # pylint: disable=no-member
                    )
                    if not finalizeRequest["OK"]:
                        if not attempts:
                            self.log.error(
                                "unable to finalize request, will retry",
                                "ReqName %s:%s" % (self.request.RequestName,
                                                   finalizeRequest["Message"]),
                            )
                        self.log.debug("Waiting 10 seconds")
                        attempts += 1
                        if attempts == 10:
                            self.log.error("Giving up finalize request")
                            return S_ERROR("Could not finalize request")

                        time.sleep(10)

                    else:
                        self.log.info(
                            "request is finalized",
                            "ReqName %s %s" % (self.request.RequestName,
                                               (" after %d attempts" %
                                                attempts) if attempts else ""),
                        )
                        break

        # Commit all the data to the ES Backend
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()
        # Request will be updated by the callBack method
        self.log.verbose("RequestTasks exiting",
                         "request %s" % self.request.Status)
        return S_OK(self.request)
예제 #7
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """# Now, let's submit some jobs. Different sites, types, inputs"""
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [["/a/1.txt", "/a/2.txt"],
                 ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
        types = ["User", "Test"]
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination("DIRAC.Jenkins.ch")
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res["OK"], res.get("Message"))
                jobID = res["Value"]
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set(res["Value"]) <= {"ANY", "DIRAC.Jenkins.ch", "Site"},
            msg="Got %s" % res["Value"])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(sorted(res["Value"]),
                         sorted(types),
                         msg="Got %s" % str(sorted(res["Value"])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], ["app status", "Unknown"],
                         msg="Got %s" % str(res["Value"]))

        res = jobMonitor.getOwners()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_empty = res["Value"]
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanNow = res["Value"]
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanOneYear = res["Value"]
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)),
            resJG_olderThanOneYear)
        res = jobMonitor.getStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"])
            in [[JobStatus.RECEIVED],
                sorted([JobStatus.RECEIVED, JobStatus.KILLED])], res["Value"])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"]) in [
                ["Job accepted"],
                sorted(["Job accepted", "Job Rescheduled"]),
                sorted(["Job accepted", "Marked for termination"]),
            ],
            res["Value"],
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobs()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set([str(x) for x in jobIDs]) <= set(res["Value"]), res["Value"])
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow()): {
                    "Status": JobStatus.CHECKING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                }
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.CHECKING)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus")

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)):
                {
                    "Status": JobStatus.WAITING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                },
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)):
                {
                    "Status": JobStatus.MATCHED,
                    "MinorStatus": "MinorStatus-matched",
                    "Source": "Unknown",
                },
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]})
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobAttribute(jobID, "Status",
                                             JobStatus.RUNNING)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING)

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
예제 #8
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """Verifying all JobStateUpdate and JobMonitoring functions"""
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        jobID = int(res["Value"])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobJDL(jobID, False)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobOwner(jobID)
        self.assertTrue(res["OK"], res.get("Message"))

        # Adding stuff

        # forcing the update
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "running",
                                          "source", None, True)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobParameters(jobID, [("par1", "par1Value"),
                                                      ("par2", "par2Value")])
        time.sleep(5)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobApplicationStatus(jobID, "app status",
                                                     "source")
        self.assertTrue(res["OK"], res.get("Message"))
        #     res = jobStateUpdate.setJobFlag()
        #     self.assertTrue(res['OK'], res.get('Message'))
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobSite(jobID, "Site")
        self.assertTrue(res["OK"], res.get("Message"))

        # now checking few things
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"],
                         JobStatus.RUNNING,
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameter(jobID, "par1")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], {"par1": "par1Value"},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameters(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         {jobID: {
                             "par1": "par1Value",
                             "par2": "par2Value"
                         }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameters(jobID, "par1")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], {jobID: {
            "par1": "par1Value"
        }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobAttribute(jobID, "Site")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         "Site",
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobAttributes(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        self.assertEqual(res["Value"]["JobName"],
                         "helloWorld",
                         msg="Got %s" % str(res["Value"]["JobName"]))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        self.assertEqual(res["Value"]["Status"],
                         JobStatus.RUNNING,
                         msg="Got %s" % str(res["Value"]["Status"]))
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getInputData(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.DONE, "MinorStatus",
                                          "Unknown")
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"],
                         JobStatus.DONE,
                         msg="Got %s" % str(res["Value"]["Status"]))
        self.assertEqual(res["Value"]["MinorStatus"],
                         "MinorStatus",
                         msg="Got %s" % str(res["Value"]["MinorStatus"]))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        res = jobStateUpdate.sendHeartBeat(jobID, {"bih": "bih"},
                                           {"boh": "boh"})
        self.assertTrue(res["OK"], res.get("Message"))

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)