Exemple #1
0
    def _checkLoggingInfo(self, jobID, jobDict):
        """Work out the start and end time of a job from its attributes and JobLogging records.

        The start time is taken, in order of preference, from the ``StartExecTime``
        job attribute, from the first "Running" logging record, or from the
        ``SubmissionTime`` job attribute. The end time is the timestamp of the
        last "Stalled" logging record, or the current UTC time when there is none.
        A string timestamp that cannot be parsed is logged as an error and
        replaced by the current UTC time.

        :param jobID: job identifier passed to ``self.logDB.getJobLoggingInfo``
        :param dict jobDict: job attributes; ``StartExecTime`` and ``SubmissionTime`` are read
        :return: tuple ``(startTime, endTime)``
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result["OK"]:
            logList = result["Value"]

        startTime = jobDict["StartExecTime"]
        if not startTime or startTime == "None":
            # Each logging record is: status, minor, app, stime, source
            for items in logList:
                if items[0] == "Running":
                    startTime = items[3]
                    break
            if not startTime or startTime == "None":
                startTime = jobDict["SubmissionTime"]

        if isinstance(startTime, str):
            # Keep the raw value: it is what must be reported if parsing fails.
            # (Previously the loop variable was logged here, which is unbound
            # when no logging record was iterated.)
            rawStartTime = startTime
            startTime = fromString(rawStartTime)
            if startTime is None:
                self.log.error("Wrong timestamp in DB", rawStartTime)
                startTime = datetime.datetime.utcnow()

        endTime = datetime.datetime.utcnow()
        # Each logging record is: status, minor, app, stime, source
        stalledTimes = [items[3] for items in logList if items[0] == "Stalled"]
        if stalledTimes:
            # The last "Stalled" record wins, as before
            rawEndTime = stalledTimes[-1]
            endTime = fromString(rawEndTime)
            if endTime is None:
                self.log.error("Wrong timestamp in DB", rawEndTime)
                endTime = datetime.datetime.utcnow()

        return startTime, endTime
Exemple #2
0
    def __sendAccounting(self, ftsJob):
        """Hand the accounting record of an FTS job over to ``self.dataOpSender``.

        The record is sent with ``commitFlag`` and ``delayedCommit`` enabled, and
        the reporting window is delimited by the job's ``submitTime`` and
        ``lastUpdate`` (both converted with ``fromString``).

        :param ftsJob: object exposing ``accountingDict``, ``submitTime`` and ``lastUpdate``
        """
        sendData = self.dataOpSender.sendData
        record = ftsJob.accountingDict
        timeWindow = {
            "startTime": fromString(ftsJob.submitTime),
            "endTime": fromString(ftsJob.lastUpdate),
        }
        sendData(record, commitFlag=True, delayedCommit=True, **timeWindow)
Exemple #3
0
    def _getLatestUpdateTime(self, job):
        """Return the most recent of the job's HeartBeatTime and LastUpdateTime.

        Both attributes are fetched from ``self.jobDB``; each one that is set is
        converted with ``toEpoch(fromString(...))`` and the larger value is kept.

        :param job: job identifier passed to ``self.jobDB.getJobAttributes``
        :return: S_OK(latest update time as returned by ``toEpoch``), or S_ERROR when the
                 attributes cannot be retrieved or both timestamps are null
        """

        def isNull(value):
            # An attribute counts as unset when it is falsy or the literal string "None"
            return not value or value == "None"

        result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
        if not result["OK"] or not result["Value"]:
            reason = result["Message"] if "Message" in result else "empty"
            self.log.error("Failed to get job attributes", "for job %d: %s" % (job, reason))
            return S_ERROR("Could not get attributes for job")

        latestUpdate = 0

        if isNull(result["Value"]["HeartBeatTime"]):
            self.log.verbose("HeartBeatTime is null", "for job %s" % job)
        else:
            latestUpdate = toEpoch(fromString(result["Value"]["HeartBeatTime"]))

        if isNull(result["Value"]["LastUpdateTime"]):
            self.log.verbose("LastUpdateTime is null", "for job %s" % job)
        else:
            lastUpdateEpoch = toEpoch(fromString(result["Value"]["LastUpdateTime"]))
            latestUpdate = max(latestUpdate, lastUpdateEpoch)

        if latestUpdate:
            self.log.verbose("", "Latest update time from epoch for job %s is %s" % (job, latestUpdate))
            return S_OK(latestUpdate)

        return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
Exemple #4
0
    def optimizeJob(self, jid, jobState):
        """Decide where a job can run and either send it to the task queue, hold it,
        request staging of its input data, or fail it.

        Steps, in order:

        1. Reschedule delay: if ``RescheduleCounter`` is non-zero, the job is held until
           the delay selected from the ``RescheduleDelays`` option has elapsed since
           ``RescheduleTime``.
        2. The job manifest, the user site requirements, the job type and the list of
           banned sites are retrieved.
        3. User-requested sites are filtered against the usable sites (unless the job
           type is listed in ``ExcludedOnHoldJobTypes``); the job is held when nothing
           usable remains.
        4. If ``CheckPlatform`` is enabled, the job platform is validated and the
           user sites are filtered by platform.
        5. Jobs without input data are sent straight to the task queue.
        6. Jobs whose type is listed in ``Transformations/DataProcessing`` are sent to
           the task queue after checking whether staging is needed.
        7. Any other job with input data uses the site candidates stored by the
           input data optimizer to select sites and, if needed, a staging site.

        :param jid: job identifier (not referenced in this method body)
        :param jobState: job state object providing ``getAttributes``, ``getManifest``,
                         ``getAttribute`` and ``getInputData``
        :return: S_OK / S_ERROR structure, either built here or returned by the helper
                 that took the final decision
        """
        # Reschedule delay
        result = jobState.getAttributes(
            ["RescheduleCounter", "RescheduleTime", "ApplicationStatus"])
        if not result["OK"]:
            return result
        attDict = result["Value"]
        try:
            reschedules = int(attDict["RescheduleCounter"])
        except (ValueError, KeyError):
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption("RescheduleDelays", [60, 180, 300, 600])
            # Counters beyond the configured list reuse the last (largest index) delay
            delay = delays[min(reschedules, len(delays) - 1)]
            # Time elapsed since the last reschedule
            # NOTE(review): presumably in seconds, depends on toEpoch - confirm
            waited = toEpoch() - toEpoch(fromString(attDict["RescheduleTime"]))
            if waited < delay:
                return self.__holdJob(
                    jobState, "On Hold: after rescheduling %s" % reschedules,
                    delay)

        # Get the job manifest for the later checks
        result = jobState.getManifest()
        if not result["OK"]:
            self.jobLog.error("Could not retrieve job manifest",
                              result["Message"])
            return result
        jobManifest = result["Value"]

        # Get site requirements
        result = self.__getSitesRequired(jobManifest)
        if not result["OK"]:
            return result
        userSites, userBannedSites = result["Value"]

        # Get job type
        result = jobState.getAttribute("JobType")
        if not result["OK"]:
            self.jobLog.error("Could not retrieve job type", result["Message"])
            return result
        jobType = result["Value"]

        # Get banned sites from DIRAC
        result = self.siteClient.getSites("Banned")
        if not result["OK"]:
            self.jobLog.error("Cannot retrieve banned sites",
                              result["Message"])
            return result
        wmsBannedSites = result["Value"]

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            if jobType not in self.ex_getOption("ExcludedOnHoldJobTypes", []):

                result = self.siteClient.getUsableSites(userSites)
                if not result["OK"]:
                    self.jobLog.error(
                        "Problem checking userSites for tuple of active/banned/invalid sites",
                        result["Message"])
                    return result
                usableSites = set(result["Value"])
                # Classify every requested site that is not usable: either it is
                # banned, or it is reported as invalid
                bannedSites = []
                invalidSites = []
                for site in userSites:
                    if site in wmsBannedSites:
                        bannedSites.append(site)
                    elif site not in usableSites:
                        invalidSites.append(site)

                if invalidSites:
                    self.jobLog.debug("Invalid site(s) requested: %s" %
                                      ",".join(invalidSites))
                    # Invalid sites are tolerated unless AllowInvalidSites is disabled
                    if not self.ex_getOption("AllowInvalidSites", True):
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are invalid" %
                            ",".join(invalidSites))
                if bannedSites:
                    self.jobLog.debug("Banned site(s) %s ignored" %
                                      ",".join(bannedSites))
                    if not usableSites:
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are inactive" %
                            ",".join(bannedSites))

                if not usableSites:
                    return self.__holdJob(
                        jobState, "No requested site(s) are active/valid")
                # From here on only the usable sites are considered
                userSites = list(usableSites)

        checkPlatform = self.ex_getOption("CheckPlatform", False)
        jobPlatform = jobManifest.getOption("Platform", None)
        # First check that the platform is valid (in OSCompatibility list)
        if checkPlatform and jobPlatform:
            result = gConfig.getOptionsDict(
                "/Resources/Computing/OSCompatibility")
            if not result["OK"]:
                self.jobLog.error("Unable to get OSCompatibility list",
                                  result["Message"])
                return result
            allPlatforms = result["Value"]
            if jobPlatform not in allPlatforms:
                self.jobLog.error("Platform not supported", jobPlatform)
                return S_ERROR("Platform is not supported")

        # Filter the userSites by the platform selection (if there is one)
        if checkPlatform and userSites:
            if jobPlatform:
                result = self.__filterByPlatform(jobPlatform, userSites)
                if not result["OK"]:
                    self.jobLog.error("Failed to filter job sites by platform",
                                      result["Message"])
                    return result
                userSites = result["Value"]
                if not userSites:
                    # No sites left after filtering -> Invalid platform/sites combination
                    self.jobLog.error("No selected sites match platform",
                                      jobPlatform)
                    return S_ERROR("No selected sites match platform '%s'" %
                                   jobPlatform)

        # Check if there is input data
        result = jobState.getInputData()
        if not result["OK"]:
            self.jobLog.error("Failed to get input data from JobDB",
                              result["Message"])
            return result

        if not result["Value"]:
            # No input data? Just send to TQ
            return self.__sendToTQ(jobState, jobManifest, userSites,
                                   userBannedSites)

        self.jobLog.verbose("Has an input data requirement")
        inputData = result["Value"]

        # ===================================================================================
        # Production jobs are sent to TQ, but first we have to verify if staging is necessary
        # ===================================================================================
        if jobType in Operations().getValue("Transformations/DataProcessing",
                                            []):
            self.jobLog.info(
                "Production job: sending to TQ, but first checking if staging is requested"
            )

            res = getFilesToStage(
                inputData,
                jobState=jobState,
                checkOnlyTapeSEs=self.ex_getOption("CheckOnlyTapeSEs", True),
                jobLog=self.jobLog,
            )

            if not res["OK"]:
                return self.__holdJob(jobState, res["Message"])
            if res["Value"]["absentLFNs"]:
                # Some files do not exist at all... set the job Failed
                # Reverse errors: group the LFNs by failure reason
                reasons = {}
                for lfn, reason in res["Value"]["absentLFNs"].items():
                    reasons.setdefault(reason, []).append(lfn)
                for reason, lfns in reasons.items():
                    # Some files are missing in the FC or in SEs, fail the job
                    self.jobLog.error(reason, ",".join(lfns))
                # The returned error message is the comma-separated list of reasons
                error = ",".join(reasons)
                return S_ERROR(error)

            if res["Value"]["failedLFNs"]:
                return self.__holdJob(
                    jobState, "Couldn't get storage metadata of some files")
            stageLFNs = res["Value"]["offlineLFNs"]
            if stageLFNs:
                # "res" is reused here; this branch always returns, so the
                # getFilesToStage result is not needed afterwards
                res = self.__checkStageAllowed(jobState)
                if not res["OK"]:
                    return res
                if not res["Value"]:
                    return S_ERROR("Stage not allowed")
                # NOTE(review): the result of the staging request is not checked here,
                # unlike in the user-job branch below - confirm this is intended
                self.__requestStaging(jobState, stageLFNs)
                return S_OK()
            else:
                # No staging required
                onlineSites = res["Value"]["onlineSites"]
                if onlineSites:
                    # Set the online site(s) first
                    # NOTE(review): the in-place "&=" assumes onlineSites is a set - confirm
                    userSites = set(userSites)
                    onlineSites &= userSites
                    userSites = list(onlineSites) + list(userSites -
                                                         onlineSites)
                return self.__sendToTQ(jobState,
                                       jobManifest,
                                       userSites,
                                       userBannedSites,
                                       onlineSites=onlineSites)

        # ===================================================
        # From now on we know it's a user job with input data
        # ===================================================

        # Retrieve the data stored under the name of the input data optimizer
        idAgent = self.ex_getOption("InputDataAgent", "InputData")
        result = self.retrieveOptimizerParam(idAgent)
        if not result["OK"]:
            self.jobLog.error("Could not retrieve input data info",
                              result["Message"])
            return result
        opData = result["Value"]

        if "SiteCandidates" not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData["SiteCandidates"])
        self.jobLog.info("Site candidates are", siteCandidates)

        if userSites:
            siteCandidates = list(set(siteCandidates) & set(userSites))

        siteCandidates = self._applySiteFilter(siteCandidates,
                                               banned=userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        # Per-site replica information restricted to the remaining candidates
        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData["SiteCandidates"][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]["disk"] + idSites[site]["tape"]:
                self.jobLog.error(
                    "Site candidate does not have all the input data",
                    "(%s)" % site)
                errorSites.add(site)
        # Removal is done in a second pass so idSites is not modified while iterating it
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates,
                                           banned=wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, jobManifest, siteCandidates,
                                   userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            res = self.__checkStageAllowed(jobState)
            if not res["OK"]:
                return res
            if not res["Value"]:
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be", stageSite)
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData["disk"] += stageData["tape"]
        stageData["tape"] = 0
        # Set the site info back to the original dict to save afterwards
        opData["SiteCandidates"][stageSite] = stageData

        stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
        if not stageRequest["OK"]:
            return stageRequest
        stageLFNs = stageRequest["Value"]
        result = self.__requestStaging(jobState, stageLFNs)
        if not result["OK"]:
            return result
        # stageLFNs is replaced by the value returned from the staging request
        stageLFNs = result["Value"]
        self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose("Updating Optimizer Info",
                            ": %s for %s" % (idAgent, opData))
        result = self.storeOptimizerParam(idAgent, opData)
        if not result["OK"]:
            return result

        return self.__setJobSite(jobState, stageSites)
Exemple #5
0
    def _sendAccounting(self, jobID):
        """
        Build and commit the WMS accounting record of the given job.

        The record is filled with the job attributes, the times obtained from
        ``_checkLoggingInfo`` / ``_checkHeartBeat`` and fixed final statuses
        (``JobStatus.FAILED`` / ``JobMinorStatus.STALLED_PILOT_NOT_RUNNING``).
        When the commit succeeds the job's ``AccountedFlag`` attribute is set.

        Run inside thread.

        :param jobID: identifier of the job to account
        :return: result of the accounting commit, the failed ``getJobAttributes``
                 result, or S_ERROR("Exception") when data gathering raised
        """
        try:
            report = Job()
            # Placeholders so the exception handler below can always print them
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"

            attrResult = self.jobDB.getJobAttributes(jobID)
            if not attrResult["OK"]:
                return attrResult
            jobDict = attrResult["Value"]

            startTime, endTime = self._checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self._checkHeartBeat(jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # A heartbeat later than the computed end time extends the job's end
            if lastHeartBeatTime is not None:
                if lastHeartBeatTime > endTime:
                    endTime = lastHeartBeatTime

            paramResult = JobMonitoringClient().getJobParameter(jobID, "CPUNormalizationFactor")
            if paramResult["OK"] and paramResult["Value"]:
                cpuNormalization = float(paramResult["Value"].get("CPUNormalizationFactor"))
            else:
                self.log.error(
                    "Error getting Job Parameter CPUNormalizationFactor, setting 0",
                    paramResult.get("Message", "No such value"),
                )
                cpuNormalization = 0.0

        except Exception as e:
            details = "for job=%s: endTime=%s, lastHBTime=%s" % (
                str(jobID),
                str(endTime),
                str(lastHeartBeatTime),
            )
            self.log.exception("Exception in _sendAccounting", details, lException=e)
            return S_ERROR("Exception")
        processingType = self._getProcessingType(jobID)

        report.setStartTime(startTime)
        report.setEndTime(endTime)

        # Accounting fields copied from the job attributes: (accounting key, job attribute)
        attributeFields = (
            ("Site", "Site"),
            ("User", "Owner"),
            ("UserGroup", "OwnerGroup"),
            ("JobGroup", "JobGroup"),
            ("JobType", "JobType"),
            ("JobClass", "JobSplitType"),
        )
        acData = {acKey: jobDict[attribute] for acKey, attribute in attributeFields}
        acData.update(
            {
                "ProcessingType": processingType,
                "FinalMajorStatus": JobStatus.FAILED,
                "FinalMinorStatus": JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
                "CPUTime": lastCPUTime,
                "NormCPUTime": lastCPUTime * cpuNormalization,
                "ExecTime": lastWallTime,
                "InputDataSize": 0.0,
                "OutputDataSize": 0.0,
                "InputDataFiles": 0,
                "OutputDataFiles": 0,
                "DiskSpace": 0.0,
                "InputSandBoxSize": 0.0,
                "OutputSandBoxSize": 0.0,
                "ProcessedEvents": 0,
            }
        )

        # For accidentally stopped jobs ExecTime can be not set; it is also
        # never allowed to be smaller than the CPU time
        if not acData["ExecTime"] or acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        report.setValuesFromDict(acData)

        commitResult = report.commit()
        if not commitResult["OK"]:
            self.log.error(
                "Failed to send accounting report",
                "Job: %d, Error: %s" % (int(jobID), commitResult["Message"]),
            )
            return commitResult

        self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        return commitResult
Exemple #6
0
    def __getToken2(self):
        """Get the Keystone token for the version v2 of the keystone service

        Credentials are read from ``self.parameters``: either ``User`` and
        ``Password``, or ``Auth`` set to ``"voms"`` (optionally with a ``Proxy``
        passed to the request as client certificate). On success ``self.token``,
        ``self.expires`` and ``self.projectID`` are set, together with the
        compute/image/network URLs found in the returned service catalog.

        :return: S_OK(token) or S_ERROR
        """

        user = self.parameters.get("User")
        password = self.parameters.get("Password")
        authArgs = {}
        if user and password:
            authDict = {
                "auth": {
                    "passwordCredentials": {
                        "username": user,
                        "password": password
                    }
                }
            }
        elif self.parameters.get("Auth") == "voms":
            authDict = {"auth": {"voms": True}}
            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")
        else:
            # Without this branch authDict would be unbound and the failure would
            # be reported as a misleading request exception
            return S_ERROR("No valid credentials provided")

        if self.project:
            authDict["auth"]["tenantName"] = self.project

        try:
            result = requests.post(
                "%s/tokens" % self.url,
                headers={"Content-Type": "application/json"},
                json=authDict,
                verify=self.caPath,
                **authArgs,
            )
        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        try:
            output = result.json()
        except ValueError:
            # The body is not JSON (e.g. an error page from an intermediate server)
            return S_ERROR("Failed to get keystone token: %s" % result.text)

        if result.status_code in [400, 401]:
            message = "None"
            if "error" in output:
                message = output["error"].get("message")
            return S_ERROR("Authorization error: %s" % message)

        # Any other non-success status carries no token to parse
        if result.status_code not in [200, 201, 202, 203, 204]:
            return S_ERROR("Failed to get keystone token: %s" % result.text)

        tokenInfo = output["access"]["token"]
        self.token = str(tokenInfo["id"])
        # Timestamps are converted from "YYYY-MM-DDTHH:MM:SSZ" to the
        # space-separated form before being handed to fromString
        expires = fromString(str(tokenInfo["expires"]).replace("T", " ").replace("Z", ""))
        issued = fromString(str(tokenInfo["issued_at"]).replace("T", " ").replace("Z", ""))
        # Apply the token lifetime to the local clock rather than trusting the
        # absolute expiration time reported by the server
        self.expires = datetime.datetime.utcnow() + (expires - issued)

        self.projectID = tokenInfo["tenant"]["id"]

        for endpoint in output["access"]["serviceCatalog"]:
            if endpoint["type"] == "compute":
                self.computeURL = str(endpoint["endpoints"][0]["publicURL"])
            elif endpoint["type"] == "image":
                self.imageURL = str(endpoint["endpoints"][0]["publicURL"])
            elif endpoint["type"] == "network":
                self.networkURL = str(endpoint["endpoints"][0]["publicURL"])
        return S_OK(self.token)
Exemple #7
0
    def __getToken3(self):
        """Get the Keystone token for the version v3 of the keystone service

        Credentials are read from ``self.parameters``, in order of preference:
        ``User`` and ``Password``; ``Auth`` set to ``"voms"`` (optionally with a
        ``Proxy`` passed to the request as client certificate); or ``Appcred``,
        the path of a file holding an application credential as ``id secret``.
        On success ``self.token`` and ``self.expires`` are set, plus
        ``self.projectID`` and the compute/image/network URLs when the response
        provides them.

        :return: S_OK(token) or S_ERROR
        """

        domain = self.parameters.get("Domain", "Default")
        user = self.parameters.get("User")
        password = self.parameters.get("Password")
        appcred_file = self.parameters.get("Appcred")
        authDict = {}
        authArgs = {}
        if user and password:
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["password"],
                        "password": {
                            "user": {
                                "name": user,
                                "domain": {
                                    "name": domain
                                },
                                "password": password
                            }
                        },
                    }
                }
            }
        elif self.parameters.get("Auth") == "voms":
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["mapped"],
                        "mapped": {
                            "voms": True,
                            "identity_provider": "egi.eu",
                            "protocol": "mapped"
                        },
                    }
                }
            }
            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")
        elif appcred_file:
            # The application credentials are stored in a file of the format:
            # id secret
            try:
                # The context manager closes the file even if parsing fails
                with open(appcred_file, "r") as ac_fd:
                    auth_info = ac_fd.read().strip()
                ac_id, ac_secret = auth_info.split(" ", 1)
            except (IOError, ValueError) as exc:
                # Unreadable file, or content that is not "id secret"
                return S_ERROR("Failed to read application credentials from %s: %s" % (appcred_file, str(exc)))
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["application_credential"],
                        "application_credential": {
                            "id": ac_id,
                            "secret": ac_secret
                        },
                    }
                }
            }
        else:
            return S_ERROR("No valid credentials provided")

        # appcred includes the project scope binding in the credential itself
        if self.project and not appcred_file:
            authDict["auth"]["scope"] = {
                "project": {
                    "domain": {
                        "name": domain
                    },
                    "name": self.project
                }
            }

        # NOTE(review): this debug message contains the password / application
        # credential secret in clear text - consider redacting it
        gLogger.debug("Request token with auth arguments: %s and body %s" %
                      (str(authArgs), str(authDict)))

        url = "%s/auth/tokens" % self.url
        try:
            result = requests.post(
                url,
                headers={
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                },
                json=authDict,
                verify=self.caPath,
                **authArgs,
            )

        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        if result.status_code not in [200, 201, 202, 203, 204]:
            return S_ERROR("Failed to get keystone token: %s" % result.text)

        try:
            token = result.headers["X-Subject-Token"]
        except Exception as exc:
            return S_ERROR("Failed to get keystone token: %s" % str(exc))

        try:
            output = result.json()
        except ValueError as exc:
            # The body is not JSON; nothing below can be extracted from it
            return S_ERROR("Failed to get keystone token: %s" % str(exc))

        self.token = token

        # Timestamps are converted from the "T"/"Z" form to the space-separated
        # form before being handed to fromString
        expires = fromString(str(output["token"]["expires_at"]).replace("T", " ").replace("Z", ""))
        issued = fromString(str(output["token"]["issued_at"]).replace("T", " ").replace("Z", ""))
        # Apply the token lifetime to the local clock rather than trusting the
        # absolute expiration time reported by the server
        self.expires = datetime.datetime.utcnow() + (expires - issued)

        if "project" in output["token"]:
            if output["token"]["project"]["name"] == self.project:
                self.projectID = output["token"]["project"]["id"]

        # Service type in the catalog -> attribute receiving its public URL
        urlAttributes = {
            "compute": "computeURL",
            "image": "imageURL",
            "network": "networkURL",
        }
        if "catalog" in output["token"]:
            for service in output["token"]["catalog"]:
                attribute = urlAttributes.get(service["type"])
                if attribute is None:
                    continue
                for endpoint in service["endpoints"]:
                    if endpoint["interface"] == "public":
                        setattr(self, attribute, str(endpoint["url"]))

        return S_OK(self.token)
Exemple #8
0
    def export_checkComponentLog(self, component):
        """Count the errors found in the current log of one or more components.

        For every ``System/Component`` name the file ``<startDir>/<System>_<Component>/log/current``
        is scanned for lines containing ``ERROR:``. Lines whose timestamp is
        younger than ``hour`` / ``day`` are counted, and the text of the last
        relevant error is kept.

        :param component: ``"*"`` for all components of the setup, a single
                          ``System/Component`` string, or an iterable of such strings
        :return: S_OK(dict) mapping each component to ``ErrorsHour``, ``ErrorsDay``
                 and ``LastError`` (counts are -1 when the log cannot be read)
        """
        if "*" in component:
            componentList = []
            # Only the bare wildcard is expanded; any other value containing
            # "*" yields an empty list
            if component == "*":
                setupResult = gComponentInstaller.getSetupComponents()
                if setupResult["OK"]:
                    installed = setupResult["Value"]
                    componentList = [
                        "/".join([sname, cname])
                        for ctype in ["Services", "Agents", "Executors"]
                        if ctype in installed
                        for sname in installed[ctype]
                        for cname in installed[ctype][sname]
                    ]
        elif isinstance(component, str):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for comp in componentList:
            # Names without a system part cannot be mapped to a log directory
            if "/" not in comp:
                continue
            system, cname = comp.split("/")

            startDir = gComponentInstaller.startDir
            currentLog = "/".join([startDir, system + "_" + cname, "log", "current"])
            try:
                with open(currentLog, "r") as logFile:
                    logLines = logFile.readlines()
            except IOError as err:
                gLogger.error("File does not exists:", currentLog)
                resultDict[comp] = {
                    "ErrorsHour": -1,
                    "ErrorsDay": -1,
                    "LastError": currentLog + "::" + repr(err),
                }
                continue

            errors_1 = 0
            errors_24 = 0
            now = datetime.utcnow()
            lastError = ""
            for line in logLines:
                if "ERROR:" not in line:
                    continue
                errorText = line.split("ERROR:")[-1].strip()
                fields = line.split()
                # if the line contains only one word
                if len(fields) < 2:
                    lastError = errorText
                    continue
                timeStamp = fromString(" ".join(fields[:2]))
                # if the timestamp is missing in the log
                if not timeStamp:
                    lastError = errorText
                    continue
                age = now - timeStamp
                withinHour = age < hour
                withinDay = age < day
                if withinHour:
                    errors_1 += 1
                if withinDay:
                    errors_24 += 1
                # Older errors do not replace the last reported one
                if withinHour or withinDay:
                    lastError = errorText

            resultDict[comp] = {
                "ErrorsHour": errors_1,
                "ErrorsDay": errors_24,
                "LastError": lastError,
            }

        return S_OK(resultDict)