Example no. 1
    def printTaskInfo(self, crabDBInfo, username):
        """ Print general information like project directory, task name, scheduler, task status (in the database),
            dashboard URL, warnings and failure messages in the database.
        """
        schedd = getColumn(crabDBInfo, 'tm_schedd')
        status = getColumn(crabDBInfo, 'tm_task_status')
        command = getColumn(crabDBInfo, 'tm_task_command')
        warnings = literal_eval(getColumn(crabDBInfo, 'tm_task_warnings'))
        failure = getColumn(crabDBInfo, 'tm_task_failure')

        self.logger.info("CRAB project directory:\t\t%s" % (self.requestarea))
        self.logger.info("Task name:\t\t\t%s" % self.cachedinfo['RequestName'])
        if schedd:
            msg = "Grid scheduler:\t\t\t%s" % schedd
            self.logger.info(msg)
        msg = "Status on the CRAB server:\t"
        if 'FAILED' in status:
            msg += "%s%s%s" % (colors.RED, status, colors.NORMAL)
        else:
            if status in TASKDBSTATUSES_TMP:
                msg += "%s on command %s" % (status, command)
            else:
                msg += "%s" % (status)
        self.logger.info(msg)

        # Show server and dashboard URL for the task.
        taskname = urllib.quote(self.cachedinfo['RequestName'])

        ## CRAB Server UI URL for this task is always useful
        crabServerUIURL = "https://cmsweb.cern.ch/crabserver/ui/task/" + taskname
        msg = "%sTask URL to use for HELP:\t%s%s" % (
            colors.GREEN, crabServerUIURL, colors.NORMAL)
        self.logger.info(msg)

        ## Dashboard monitoring URL only makes sense if submitted to schedd
        if schedd:
            dashboardURL = "http://dashb-cms-job.cern.ch/dashboard/templates/task-analysis/#user="******"&refresh=0&table=Jobs&p=1&records=25&activemenu=2&status=&site=&tid=" + taskname
            self.logger.info("Dashboard monitoring URL:\t%s" % (dashboardURL))

        # Print the warning messages (these are the warnings in the Tasks DB,
        # and/or maybe some warning added by the REST Interface to the status result).
        if warnings:
            for warningMsg in warnings:
                self.logger.warning("%sWarning%s:\t\t\t%s" %
                                    (colors.RED, colors.NORMAL, warningMsg))
        if failure:  #TODO failure should be ignored if the task is not in failure state in the task db
            msg = "%sFailure message from the server%s:" % (colors.RED,
                                                            colors.NORMAL)
            msg += "\t\t%s" % (failure.replace('\n', '\n\t\t\t\t'))
            self.logger.error(msg)
Example no. 2
    def __call__(self):

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version = __version__)

        crabDBInfo, jobList = getMutedStatusInfo(self.logger)

        if not jobList:
            msg  = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Status information is unavailable, will not proceed with the resubmission."
            msg += " Try again a few minutes later if the task has just been submitted."
            self.logger.info(msg)
            return None

        publicationEnabled = getColumn(crabDBInfo, "tm_publication")
        jobsPerStatus = jobList['jobsPerStatus']

        if self.options.publication:
            if publicationEnabled == "F":
                msg = "Publication was disabled for this task. Therefore, "
                msg += "there are no publications to resubmit."
                self.logger.info(msg)
                return None
            else:
                if "finished" not in jobsPerStatus:
                    msg = "No files found to publish"
                    self.logger.info(msg)
                    return None

        self.jobids = self.processJobIds(jobList)

        configreq = self.getQueryParams()
        self.logger.info("Sending resubmit request to the server.")
        self.logger.debug("Submitting %s " % str(configreq))
        configreq_encoded = self._encodeRequest(configreq)
        self.logger.debug("Encoded resubmit request: %s" % (configreq_encoded))

        dictresult, _, _ = server.post(self.uri, data = configreq_encoded)
        self.logger.debug("Result: %s" % (dictresult))
        self.logger.info("Resubmit request sent to the server.")
        if dictresult['result'][0]['result'] != 'ok':
            msg = "Server responded with: '%s'" % (dictresult['result'][0]['result'])
            self.logger.info(msg)
            returndict = {'status': 'FAILED'}
        else:
            if not self.options.wait:
                msg  = "Please use 'crab status' to check how the resubmission process proceeds."
                msg += "\nNotice it may take a couple of minutes for the resubmission to get fully processed."
                self.logger.info(msg)
            else:
                targetTaskStatus = 'SUBMITTED'
                checkStatusLoop(self.logger, server, self.uri, self.cachedinfo['RequestName'], targetTaskStatus, self.name)
            returndict = {'status': 'SUCCESS'}

        return returndict
Example no. 3
    def printDAGStatus(self, crabDBInfo, statusCacheInfo):
        # Get dag status from the node_state/job_log summary
        dagman_codes = {
            1: 'SUBMITTED',
            2: 'SUBMITTED',
            3: 'SUBMITTED',
            4: 'SUBMITTED',
            5: 'COMPLETED',
            6: 'FAILED'
        }
        dagStatus = dagman_codes.get(statusCacheInfo['DagStatus']['DagStatus'])
        #Unfortunately the DAG code for a killed task is 6, just like for finished DAGs with failed jobs.
        #Relabeling the status from 'FAILED' to 'FAILED (KILLED)' if a successful kill command was issued.
        dbstatus = getColumn(crabDBInfo, 'tm_task_status')
        if dagStatus == 'FAILED' and dbstatus == 'KILLED':
            dagStatus = 'FAILED (KILLED)'

        msg = "Status on the scheduler:\t" + dagStatus
        self.logger.info(msg)
        return dagStatus
Example no. 4
    def validateOptions(self):
        """
        Check that the site list options are comma-separated lists of CMS site names,
        and store the strings to be passed to the server on self.
        """
        SubCommand.validateOptions(self)

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        self.server = serverFactory(self.serverurl,
                                    self.proxyfilename,
                                    self.proxyfilename,
                                    version=__version__)
        uri = getUrl(self.instance, resource='task')
        crabDBInfo, _, _ = self.server.get(uri,
                                           data={
                                               'subresource':
                                               'search',
                                               'workflow':
                                               self.cachedinfo['RequestName']
                                           })
        self.splitting = getColumn(crabDBInfo, 'tm_split_algo')

        if self.options.publication:
            if self.options.sitewhitelist is not None or self.options.siteblacklist is not None or \
               self.options.maxjobruntime is not None or self.options.maxmemory is not None or \
               self.options.numcores is not None or self.options.priority is not None:
                msg = "The options --sitewhitelist, --siteblacklist,"
                msg += " --maxjobruntime, --maxmemory, --numcores and  --priority"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit (failed) publications,"
                msg += " in which case all of the first options make no sense."
                raise ConfigurationException(msg)
            if self.options.jobids:
                msg = "The option --jobids"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit (failed) publications,"
                msg += " which does not allow yet filtering on job ids (ALL failed publications will be resubmitted)."
                raise ConfigurationException(msg)
            if self.options.force:
                msg = "The option --force"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit failed publications."
                msg += " Publications in a status other than 'failed' can not be resubmitted."
                raise ConfigurationException(msg)

        ## The --jobids option indicates which jobs have to be resubmitted. If it is not
        ## given, then all jobs in the task that are not running or successfully
        ## completed are resubmitted. If the user provides a list of job ids, then also
        ## successfully completed jobs can be resubmitted.

        ## Check the format of the jobids option.
        if self.options.jobids:
            jobidstuple = validateJobids(self.options.jobids,
                                         self.splitting != 'Automatic')
            self.jobids = [str(jobid) for (_, jobid) in jobidstuple]

        ## The --force option should not be accepted unless combined with a user-given
        ## list of job ids via --jobids.
        if self.options.force and not self.jobids:
            msg = "Option --force can only be used in combination with option --jobids."
            raise ConfigurationException(msg)

        ## Convention used for the job parameters that the user can set when doing job
        ## resubmission (i.e. siteblacklist, sitewhitelist, maxjobruntime, maxmemory,
        ## numcores and priority):
        ## - If the user doesn't set a parameter we don't pass it to the server, and the
        ##   server copies the original value the parameter had at task submission.
        ##   It copies it from the Task DB. Therefore we need to keep these parameters
        ##   in separate columns of the Task DB containing their original values.
        ## - For the site black- and whitelists, if the user passes an empty string,
        ##   e.g. --siteblacklist='', we pass to the server siteblacklist=empty and the
        ##   server interprets this as an empty list ([]). If the user passes a given
        ##   list of sites, this new list overwrites the original one.
        ## - The values of the parameters are used only for the resubmitted jobs (for
        ##   their first resubmission and all subsequent automatic resubmissions).
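        ## For example: --siteblacklist='' sends an empty siteblacklist and the server
        ## clears the stored list, while omitting the option altogether leaves the value
        ## recorded in the Task DB untouched.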

        #Checking if the sites provided by the user are valid cmsnames. Doing this because with only the
        #server error handling we get:
        #    Server answered with: Invalid input parameter
        #    Reason is: Incorrect 'siteblacklist' parameter
        #which is not really user friendly.
        #Moreover, I prefer to be independent of Lexicon, so I'll put the regex here.
        sn_re = "^T[1-3]_[A-Z]{2}(_[A-Za-z0-9]+)+$"  #sn_re => SiteName_RegularExpression
        sn_rec = re.compile(
            sn_re)  #sn_rec => SiteName_RegularExpressionCompiled
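        # Example: 'T2_IT_Bari' and 'T1_US_FNAL_Disk' match the pattern, while strings
        # like 'Bari' or 'T4_XX_Site' would be rejected.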
        for sitelist in ['sitewhitelist', 'siteblacklist']:
            if getattr(self.options, sitelist) is not None:
                if getattr(self.options, sitelist) != "":
                    for site_name in getattr(self.options,
                                             sitelist).split(','):
                        if '*' not in site_name and not sn_rec.match(
                                site_name):
                            msg = "The site name '%s' does not look like a valid CMS site name" % (
                                site_name)
                            msg += " (it is not matching the regular expression '%s')." % (
                                sn_re)
                            raise ConfigurationException(msg)
                    setattr(self, sitelist,
                            getattr(self.options, sitelist).split(','))
                else:
                    setattr(self, sitelist, [])

        ## Sanity checks for task sizes. Limits are purposely fairly generous to provide
        ## some level of future-proofing. The server may restrict further.
        if self.options.maxjobruntime is not None:
            if self.options.maxjobruntime < 60 or self.options.maxjobruntime > 336 * 60:
                msg = "The requested maximum job runtime (%d minutes) must be between 60 and 20160 minutes." % (
                    self.options.maxjobruntime)
                raise ConfigurationException(msg)

        if self.options.maxmemory is not None:
            if self.options.maxmemory < 30 or self.options.maxmemory > 1024 * 30:
                msg = "The requested per-job memory (%d MB) must be between 30 and 30720 MB." % (
                    self.options.maxmemory)
                raise ConfigurationException(msg)

        if self.options.numcores is not None:
            if self.options.numcores < 1 or self.options.numcores > 128:
                msg = "The requested number of cores (%d) must be between 1 and 128." % (
                    self.options.numcores)
                raise ConfigurationException(msg)

        if self.options.priority is not None:
            if self.options.priority < 1:
                msg = "The requested priority (%d) must be greater than 0." % (
                    self.options.priority)
                raise ConfigurationException(msg)
Example no. 5
    def validateOptions(self):
        """
        Check that the site list options are comma-separated lists of CMS site names,
        and store the strings to be passed to the server on self.
        """
        SubCommand.validateOptions(self)

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        self.server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version = __version__)
        uri = self.getUrl(self.instance, resource = 'task')
        crabDBInfo, _, _ = self.server.get(uri, data = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']})
        self.splitting = getColumn(crabDBInfo, 'tm_split_algo')

        if self.options.publication:
            if self.options.sitewhitelist is not None or self.options.siteblacklist is not None or \
               self.options.maxjobruntime is not None or self.options.maxmemory is not None or \
               self.options.numcores is not None or self.options.priority is not None:
                msg  = "The options --sitewhitelist, --siteblacklist,"
                msg += " --maxjobruntime, --maxmemory, --numcores and  --priority"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit (failed) publications,"
                msg += " in which case all of the first options make no sense."
                raise ConfigurationException(msg)
            if self.options.jobids:
                msg  = "The option --jobids"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit (failed) publications,"
                msg += " which does not allow yet filtering on job ids (ALL failed publications will be resubmitted)."
                raise ConfigurationException(msg)
            if self.options.force:
                msg  = "The option --force"
                msg += " can not be specified together with the option --publication."
                msg += " The last option is to only resubmit failed publications."
                msg += " Publications in a status other than 'failed' can not be resubmitted."
                raise ConfigurationException(msg)

        ## The --jobids option indicates which jobs have to be resubmitted. If it is not
        ## given, then all jobs in the task that are not running or successfully
        ## completed are resubmitted. If the user provides a list of job ids, then also
        ## successfully completed jobs can be resubmitted.

        ## Check the format of the jobids option.
        if self.options.jobids:
            jobidstuple = validateJobids(self.options.jobids, self.splitting != 'Automatic')
            self.jobids = [str(jobid) for (_, jobid) in jobidstuple]

        ## The --force option should not be accepted unless combined with a user-given
        ## list of job ids via --jobids.
        if self.options.force and not self.jobids:
            msg = "Option --force can only be used in combination with option --jobids."
            raise ConfigurationException(msg)

        ## Convention used for the job parameters that the user can set when doing job
        ## resubmission (i.e. siteblacklist, sitewhitelist, maxjobruntime, maxmemory,
        ## numcores and priority):
        ## - If the user doesn't set a parameter we don't pass it to the server, and the
        ##   server copies the original value the parameter had at task submission.
        ##   It copies it from the Task DB. Therefore we need to keep these parameters
        ##   in separate columns of the Task DB containing their original values.
        ## - For the site black- and whitelists, if the user passes an empty string,
        ##   e.g. --siteblacklist='', we pass to the server siteblacklist=empty and the
        ##   server interprets this as an empty list ([]). If the user passes a given
        ##   list of sites, this new list overwrites the original one.
        ## - The values of the parameters are used only for the resubmitted jobs (for
        ##   their first resubmission and all subsequent automatic resubmissions).
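        ## For example: --siteblacklist='' sends an empty siteblacklist and the server
        ## clears the stored list, while omitting the option altogether leaves the value
        ## recorded in the Task DB untouched.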

        #Checking if the sites provided by the user are valid cmsnames. Doing this because with only the
        #server error handling we get:
        #    Server answered with: Invalid input parameter
        #    Reason is: Incorrect 'siteblacklist' parameter
        #which is not really user friendly.
        #Moreover, I prefer to be independent of Lexicon, so I'll put the regex here.
        sn_re = "^T[1-3]_[A-Z]{2}(_[A-Za-z0-9]+)+$" #sn_re => SiteName_RegularExpression
        sn_rec = re.compile(sn_re) #sn_rec => SiteName_RegularExpressionCompiled
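        # Example: 'T2_IT_Bari' and 'T1_US_FNAL_Disk' match the pattern, while strings
        # like 'Bari' or 'T4_XX_Site' would be rejected.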
        for sitelist in ['sitewhitelist', 'siteblacklist']:
            if getattr(self.options, sitelist) is not None:
                if getattr(self.options, sitelist) != "":
                    for site_name in getattr(self.options, sitelist).split(','):
                        if '*' not in site_name and not sn_rec.match(site_name):
                            msg  = "The site name '%s' does not look like a valid CMS site name" % (site_name)
                            msg += " (it is not matching the regular expression '%s')." % (sn_re)
                            raise ConfigurationException(msg)
                    setattr(self, sitelist, getattr(self.options, sitelist).split(','))
                else:
                    setattr(self, sitelist, [])

        ## Sanity checks for task sizes. Limits are purposely fairly generous to provide
        ## some level of future-proofing. The server may restrict further.
        if self.options.maxjobruntime is not None:
            if self.options.maxjobruntime < 60 or self.options.maxjobruntime > 336*60:
                msg = "The requested maximum job runtime (%d minutes) must be between 60 and 20160 minutes." % (self.options.maxjobruntime)
                raise ConfigurationException(msg)

        if self.options.maxmemory is not None:
            if self.options.maxmemory < 30 or self.options.maxmemory > 1024*30:
                msg = "The requested per-job memory (%d MB) must be between 30 and 30720 MB." % (self.options.maxmemory)
                raise ConfigurationException(msg)

        if self.options.numcores is not None:
            if self.options.numcores < 1 or self.options.numcores > 128:
                msg = "The requested number of cores (%d) must be between 1 and 128." % (self.options.numcores)
                raise ConfigurationException(msg)

        if self.options.priority is not None:
            if self.options.priority < 1:
                msg = "The requested priority (%d) must be greater than 0." % (self.options.priority)
                raise ConfigurationException(msg)
Example no. 6
    def printPublication(self, publicationEnabled, jobsPerStatus, asourl,
                         asodb, taskname, user, crabDBInfo):
        """Print information about the publication of the output files in DBS.
        """
        # Collecting publication information
        pubStatus = {}
        if (publicationEnabled and 'finished' in jobsPerStatus):
            #let's default asodb to asynctransfer, for old tasks this is empty!
            asodb = asodb or 'asynctransfer'
            pubStatus = self.publicationStatus(taskname, asourl, asodb, user)
        elif not publicationEnabled:
            pubStatus['status'] = {'disabled': []}
        pubInfo = {}
        pubInfo['publication'] = pubStatus.get('status', {})
        pubInfo['publicationFailures'] = pubStatus.get('failure_reasons', {})

        ## The output datasets are written into the Task DB by the post-job
        ## when uploading the output files metadata.
        outdatasets = literal_eval(
            getColumn(crabDBInfo, 'tm_output_dataset') if getColumn(
                crabDBInfo, 'tm_output_dataset') else 'None')
        pubInfo['outdatasets'] = outdatasets
        pubInfo['jobsPerStatus'] = jobsPerStatus

        if 'publication' not in pubInfo:
            return pubStatus
        ## If publication was disabled, print a pertinent message and return.
        if 'disabled' in pubInfo['publication']:
            msg = "\nNo publication information (publication has been disabled in the CRAB configuration file)"
            self.logger.info(msg)
            return pubStatus
        ## List of output datasets that are going to be (or are already) published. This
        ## list is written into the Tasks DB by the post-job when it does the upload of
        ## the output files metadata. This means that the list will be empty until one
        ## of the post-jobs finishes executing.
        outputDatasets = pubInfo.get('outdatasets')

        ## If publication information is not available yet, print a pertinent message
        ## (print first the list of output datasets, without the DAS URL) and return.
        if not pubInfo['publication']:
            self.printOutputDatasets(outputDatasets)
            msg = "\nNo publication information available yet"
            self.logger.info(msg)
            return pubStatus
        ## Case in which there was an error in retrieving the publication status.
        if 'error' in pubInfo['publication']:
            msg = "\nPublication status:\t\t%s" % (
                pubInfo['publication']['error'])
            self.logger.info(msg)
            ## Print the output datasets with the corresponding DAS URL.
            self.printOutputDatasets(outputDatasets, includeDASURL=True)
            return pubStatus
        if pubInfo['publication'] and outputDatasets:
            states = pubInfo['publication']
            ## Don't consider publication states for which 0 files are in this state.
            states_tmp = states.copy()
            for status in states:
                if states[status] == 0:
                    del states_tmp[status]
            states = states_tmp.copy()
            ## Count the total number of files to publish. For this we count the number of
            ## jobs and the number of files to publish per job (which is equal to the number
            ## of output datasets produced by the task, because, when multiple EDM files are
            ## produced, each EDM file goes into a different output dataset).
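            ## For example (hypothetical numbers): a task with 100 jobs and 2 output
            ## datasets has 100 * 2 = 200 files to publish.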
            numJobs = sum(pubInfo['jobsPerStatus'].values())
            numOutputDatasets = len(outputDatasets)
            numFilesToPublish = numJobs * numOutputDatasets
            ## Count how many of these files have already started the publication process.
            numSubmittedFiles = sum(states.values())
            ## Subtract the above two numbers to obtain how many files have not yet been
            ## considered for publication.
            states['unsubmitted'] = numFilesToPublish - numSubmittedFiles
            ## Print the publication status.
            statesList = sorted(states)
            msg = "\nPublication status:\t\t{0} {1}".format(self._printState(statesList[0], 13), \
                                                            self._percentageString(statesList[0], states[statesList[0]], numFilesToPublish))
            for status in statesList[1:]:
                if states[status]:
                    msg += "\n\t\t\t\t{0} {1}".format(self._printState(status, 13), \
                                                      self._percentageString(status, states[status], numFilesToPublish))
            self.logger.info(msg)
            ## Print the publication errors.
            if pubInfo.get('publicationFailures'):
                msg = "\nPublication error summary:"
                if 'error' in pubInfo['publicationFailures']:
                    msg += "\t%s" % (pubInfo['publicationFailures']['error'])
                elif pubInfo['publicationFailures'].get('result'):
                    ndigits = int(
                        math.ceil(math.log(numFilesToPublish + 1, 10)))
                    for failureReason, numFailedFiles in pubInfo[
                            'publicationFailures']['result']:
                        msg += (
                            "\n\n\t%" + str(ndigits) +
                            "s files failed to publish with following error message:\n\n\t\t%s"
                        ) % (numFailedFiles, failureReason)
                self.logger.info(msg)
            ## Print the output datasets with the corresponding DAS URL.
            self.printOutputDatasets(outputDatasets, includeDASURL=True)

        return pubStatus
Example no. 7
    def __call__(self):
        # Get all of the columns from the database for a certain task
        taskname = self.cachedinfo['RequestName']
        uri = self.getUrl(self.instance, resource='task')
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl,
                               self.proxyfilename,
                               self.proxyfilename,
                               version=__version__)
        crabDBInfo, _, _ = server.get(uri,
                                      data={
                                          'subresource': 'search',
                                          'workflow': taskname
                                      })
        self.logger.debug("Got information from server oracle database: %s",
                          crabDBInfo)

        # Until the task lands on a schedd we'll show the status from the DB
        combinedStatus = getColumn(crabDBInfo, 'tm_task_status')

        user = getColumn(crabDBInfo, 'tm_username')
        webdir = getColumn(crabDBInfo, 'tm_user_webdir')
        rootDagId = getColumn(crabDBInfo,
                              'clusterid')  #that's the condor id from the TW
        asourl = getColumn(crabDBInfo, 'tm_asourl')
        asodb = getColumn(crabDBInfo, 'tm_asodb')
        publicationEnabled = True if getColumn(
            crabDBInfo, 'tm_publication') == 'T' else False

        #Print information from the database
        self.printTaskInfo(crabDBInfo, user)
        if not rootDagId:
            failureMsg = "The task has not been submitted to the Grid scheduler yet. Not printing job information."
            self.logger.debug(failureMsg)
            return self.makeStatusReturnDict(crabDBInfo,
                                             combinedStatus,
                                             statusFailureMsg=failureMsg)

        self.logger.debug(
            "The CRAB server submitted your task to the Grid scheduler (cluster ID: %s)"
            % rootDagId)

        if not webdir:
            # Query condor through the server for information about this task
            uri = self.getUrl(self.instance, resource='workflow')
            params = {'subresource': 'taskads', 'workflow': taskname}

            res = server.get(uri, data=params)[0]['result'][0]
            # JobStatus 5 = Held
            if res['JobStatus'] == '5' and 'DagmanHoldReason' in res:
                # If we didn't find a webdir in the DB and the DAG is held,
                # the task bootstrapping failed before or during the webdir
                # upload and the reason should be printed.
                failureMsg = "The task failed to bootstrap on the Grid scheduler."
                failureMsg += " Please send an e-mail to %s." % (FEEDBACKMAIL)
                failureMsg += "\nHold reason: %s" % (res['DagmanHoldReason'])
                self.logger.info(failureMsg)
                combinedStatus = "FAILED"
            else:
                # if the dag is submitted and the webdir is not there, we have to wait for AdjustSites
                # to run and upload the webdir location to the server
                self.logger.info(
                    "Waiting for the Grid scheduler to bootstrap your task")
                failureMsg = "Schedd has not reported back the webdir (yet)"
                self.logger.debug(failureMsg)
                combinedStatus = "UNKNOWN"
            return self.makeStatusReturnDict(crabDBInfo,
                                             combinedStatus,
                                             statusFailureMsg=failureMsg)

        self.logger.debug("Webdir is located at %s", webdir)

        proxiedWebDir = getProxiedWebDir(taskname, self.serverurl, uri,
                                         self.proxyfilename, self.logger.debug)
        if not proxiedWebDir:
            msg = "Failed to get the proxied webdir from CRABServer. "
            msg += "\nWill fall back to the regular webdir url for file downloads "
            msg += "but will likely fail if the client is located outside CERN."
            self.logger.debug(msg)
            proxiedWebDir = webdir
        self.logger.debug("Proxied webdir is located at %s", proxiedWebDir)

        # Download status_cache file
        url = proxiedWebDir + "/status_cache"
        self.logger.debug("Retrieving 'status_cache' file from %s", url)

        statusCacheInfo = None
        try:
            statusCacheData = getDataFromURL(url, self.proxyfilename)
        except HTTPException as ce:
            self.logger.info(
                "Waiting for the Grid scheduler to report back the status of your task"
            )
            failureMsg = "Cannot retrieve the status_cache file. Maybe the task process has not run yet?"
            failureMsg += " Got:\n%s" % ce
            self.logger.error(failureMsg)
            logging.getLogger("CRAB3").exception(ce)
            combinedStatus = "UNKNOWN"
            return self.makeStatusReturnDict(crabDBInfo,
                                             combinedStatus,
                                             statusFailureMsg=failureMsg)
        else:
            # We skip the first two lines of the file because they contain the checkpoint locations
            # for the job_log / fjr_parse_results files and are used by the status caching script.
            # Load the job_report summary
            statusCacheInfo = literal_eval(statusCacheData.split('\n')[2])
            self.logger.debug("Got information from status cache file: %s",
                              statusCacheInfo)

        # If the task is already on the grid, show the dagman status
        combinedStatus = dagStatus = self.printDAGStatus(
            crabDBInfo, statusCacheInfo)

        shortResult = self.printShort(statusCacheInfo)
        pubStatus = self.printPublication(publicationEnabled,
                                          shortResult['jobsPerStatus'], asourl,
                                          asodb, taskname, user, crabDBInfo)
        self.printErrors(statusCacheInfo)

        if self.options.summary:
            self.printSummary(statusCacheInfo)
        if self.options.long or self.options.sort:
            # If the user passed a valid CSV list of job ids to 'status --long', self.jobids
            # will be a list of strings already parsed from the input by validateOptions()
            if self.jobids:
                self.checkUserJobids(statusCacheInfo, self.jobids)
            sortdict = self.printLong(statusCacheInfo,
                                      self.jobids,
                                      quiet=(not self.options.long))
            if self.options.sort:
                self.printSort(sortdict, self.options.sort)
        if self.options.json:
            self.logger.info(json.dumps(statusCacheInfo))

        statusDict = self.makeStatusReturnDict(crabDBInfo, combinedStatus,
                                               dagStatus, '', shortResult,
                                               statusCacheInfo, pubStatus,
                                               proxiedWebDir)

        return statusDict
Example no. 8
    def makeStatusReturnDict(self,
                             crabDBInfo,
                             combinedStatus,
                             dagStatus='',
                             statusFailureMsg='',
                             shortResult={},
                             statusCacheInfo={},
                             pubStatus={},
                             proxiedWebDir=''):
        """ Create a dictionary which is mostly identical to the dictionary
            that was being returned by the old status (plus a few other keys
            needed by the other client commands). This is to ensure backward
            compatibility after the status2 transition for users relying on
            this dictionary in their scripts.
        """

        statusDict = {}
        statusDict['status'] = combinedStatus
        statusDict['dbStatus'] = getColumn(crabDBInfo, 'tm_task_status')
        statusDict['dagStatus'] = dagStatus
        statusDict['username'] = getColumn(crabDBInfo, 'tm_username')
        statusDict['taskFailureMsg'] = getColumn(crabDBInfo, 'tm_task_failure')
        statusDict['taskWarningMsg'] = getColumn(crabDBInfo,
                                                 'tm_task_warnings')
        statusDict['outdatasets'] = getColumn(crabDBInfo, 'tm_output_dataset')
        statusDict['schedd'] = getColumn(crabDBInfo, 'tm_schedd')
        statusDict['collector'] = getColumn(crabDBInfo, 'tm_collector')
        statusDict['ASOURL'] = getColumn(crabDBInfo, 'tm_asourl')
        statusDict['command'] = getColumn(crabDBInfo, 'tm_task_command')
        statusDict['publicationEnabled'] = True if getColumn(
            crabDBInfo, 'tm_publication') == 'T' else False
        statusDict['userWebDirURL'] = getColumn(crabDBInfo, 'tm_user_webdir')
        statusDict['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset')

        dbStartTime = getColumn(crabDBInfo, 'tm_start_time')
        statusDict['submissionTime'] = getEpochFromDBTime(
            datetime.strptime(dbStartTime, '%Y-%m-%d %H:%M:%S.%f'))

        statusDict['statusFailureMsg'] = statusFailureMsg
        statusDict['proxiedWebDir'] = proxiedWebDir
        statusDict['jobsPerStatus'] = shortResult.get('jobsPerStatus', {})
        statusDict['jobList'] = shortResult.get('jobList', {})
        statusDict['publication'] = pubStatus.get('status', {})
        statusDict['publicationFailures'] = pubStatus.get(
            'failure_reasons', {})
        statusDict['jobs'] = statusCacheInfo
        return statusDict
Example no. 9
    def __call__(self):

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl,
                               self.proxyfilename,
                               self.proxyfilename,
                               version=__version__)

        crabDBInfo, jobList = getMutedStatusInfo(self.logger)

        if not jobList:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Status information is unavailable, will not proceed with the resubmission."
            msg += " Try again a few minutes later if the task has just been submitted."
            self.logger.info(msg)
            return None

        publicationEnabled = getColumn(crabDBInfo, "tm_publication")
        jobsPerStatus = jobList['jobsPerStatus']

        if self.options.publication:
            if publicationEnabled == "F":
                msg = "Publication was disabled for this task. Therefore, "
                msg += "there are no publications to resubmit."
                self.logger.info(msg)
                return None
            else:
                if "finished" not in jobsPerStatus:
                    msg = "No files found to publish"
                    self.logger.info(msg)
                    return None

        self.jobids = self.processJobIds(jobList)

        configreq = self.getQueryParams()
        self.logger.info("Sending resubmit request to the server.")
        self.logger.debug("Submitting %s " % str(configreq))
        configreq_encoded = self._encodeRequest(configreq)
        self.logger.debug("Encoded resubmit request: %s" % (configreq_encoded))

        dictresult, _, _ = server.post(self.uri, data=configreq_encoded)
        self.logger.debug("Result: %s" % (dictresult))
        self.logger.info("Resubmit request sent to the server.")
        if dictresult['result'][0]['result'] != 'ok':
            msg = "Server responded with: '%s'" % (
                dictresult['result'][0]['result'])
            self.logger.info(msg)
            returndict = {'status': 'FAILED'}
        else:
            if not self.options.wait:
                msg = "Please use 'crab status' to check how the resubmission process proceeds."
                msg += "\nNotice it may take a couple of minutes for the resubmission to get fully processed."
                self.logger.info(msg)
            else:
                targetTaskStatus = 'SUBMITTED'
                checkStatusLoop(self.logger, server, self.uri,
                                self.cachedinfo['RequestName'],
                                targetTaskStatus, self.name)
            returndict = {'status': 'SUCCESS'}

        return returndict