Code example #1
    def query(self, obj, service='', objType='node'):
        """
        query status of jobs
        """

        from xml.sax import make_parser
        from CondorHandler import CondorHandler
        from xml.sax.handler import feature_external_ges

        # FUTURE:
        #  Use condor_history -attributes to limit the XML size. Faster on both ends
        # Convert Condor integer status to BossLite Status codes
        # Condor status is e.g. from http://pages.cs.wisc.edu/~adesmet/status.html#condor-jobstatus
        # 0	Unexpanded 	U
        # 1	Idle 	        I
        # 2	Running 	R
        # 3	Removed 	X
        # 4	Completed 	C
        # 5	Held 	        H
        # 6	Submission_err 	E

        statusCodes = {'0':'RE', '1':'S', '2':'R',
                       '3':'K',  '4':'SD', '5':'A'}
        textStatusCodes = {
                '0':'Ready',
                '1':'Submitted',
                '2':'Running',
                '3':'Cancelled',
                '4':'Done',
                '5':'Aborted'
        }


        if not type(obj) == Task:
            raise SchedulerError('Wrong argument type or object type',
                                  str(type(obj)) + ' ' + str(objType))

        taskId = obj['name']
        
        jobIds = {}
        bossStatus = {}
        schdId = {}

        somethingDone = False # note if some jobs became Done in this round

        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue
            
            # skip if the Job was created but never submitted
            if job.runningJob['status'] == 'C' :
                continue

            # skip if the Job is already Done, nothing more to ask glidein
            if job.runningJob['statusScheduler'] == 'Done' :
                continue
            
            schedulerId = job.runningJob['schedulerId']

            # Jobs are done if condor_q/history does not list them.
            # Queries to a condor schedd will only return cluster.job,
            # so we need to cross link the two via the schdId[condorId] map
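            # e.g. a (hypothetical) schedulerId
            #   'schedd01.example.com//20101123//1234.0'
            # maps to condorId 'schedd01.example.com//1234.0'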
            bossStatus[schedulerId] = {'status':'SD', 'statusScheduler':'Done'}
            schedd = schedulerId.split('//')[0]
            submissionDay = schedulerId.split('//')[1]
            jobNum = schedulerId.split('//')[2]
            condorId = schedd + '//' + jobNum
            schdId[condorId] = schedulerId

            # Fill dictionary of schedd and job #'s to check
            if schedd in jobIds.keys():
                jobIds[schedd].append(jobNum)
            else :
                jobIds[schedd] = [jobNum]

        if len(jobIds.keys()) > 0 :
            # there is something to check on remote condor host
            self.initializeGsissh(obj)

        for schedd in jobIds.keys() :
            if not schedd == self.remoteHost:
                self.logging.info("ERROR: found jobs for schedd %s in a task targetted for submission host %s" % (schedd,self.remoteHost))
                raise Exception("Mixing schedd's in same task is not supported")
            
            # to begin with, push a fresh proxy to the remote host
            command = '%s %s %s %s:%s' % \
                      (self.remoteCopyCommand, self.gsisshOptions, \
                           self.x509Proxy(), self.remoteUserHost, taskId)
            self.logging.debug("Execute command :\n%s" % command)
            (status, output) = commands.getstatusoutput(command)
            self.logging.debug("Status,output= %s,%s" %
                    (status, output))
            if (status) :
                self.logging.error("Failed to renew proxy on remote submission host")
                self.logging.error("Command: %s failed with output=\n%s"%(command,output))
                if "already exists" in output:
                    self.removeGsisshSocket()

            command = "%s %s %s " % \
                (self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
            command += ' "condor_history -userlog %s/condor.log' % taskId
            command += ' -xml"'

            self.logging.debug("Execute command :\n%s" % command)

            pObj = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE, close_fds=True)
            (inputFile, outputFp, errorFp) = (pObj.stdin, pObj.stdout, pObj.stderr)
            try:
                outputFile = cStringIO.StringIO(outputFp.read()) # >7.3 vers.
            except:
                raise SchedulerError('Problem reading output of command', command)

            # close our ends of the pipes, then check the command exit status
            # (close() on a subprocess pipe returns None, so use wait() instead)
            outputFp.close()
            try:    # close stderr from command ignoring errors
                errorFp.close()
            except:
                pass
            if pObj.wait():
                raise SchedulerError("condor_history command or cache file failed.")

            handler = CondorHandler('GlobalJobId',
                       ['JobStatus', 'GridJobId','ProcId','ClusterId',
                        'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper','GlobalJobId'])

            parser = make_parser()
            try:
                parser.setContentHandler(handler)
                parser.setFeature(feature_external_ges, False)
                parser.parse(outputFile)
            except:
                self.logging.info("Unexpected exception: %s" % sys.exc_info()[0])
                self.logging.debug("Error parsing condor_history output:\n%s" % outputFile.getvalue())
                raise SchedulerError('Problem parsing output of condor_history command', command)

            jobDicts = handler.getJobInfo()


            for globalJobId in jobDicts.keys():
                clusterId = jobDicts[globalJobId].get('ClusterId', None)
                procId    = jobDicts[globalJobId].get('ProcId',    None)
                jobId = str(clusterId) + '.' + str(procId)
                condorId = schedd + '//' + jobId
                jobStatus = jobDicts[globalJobId].get('JobStatus', None)

                # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
                execHost = None
                glideinHost = None
                jobGkpr = jobDicts[globalJobId].get('JOB_Gatekeeper', None)
                matGkpr = jobDicts[globalJobId].get('MATCH_GLIDEIN_Gatekeeper', None)
                if jobGkpr and not "Unknown" in jobGkpr:
                    glideinHost = jobGkpr
                else:
                    if matGkpr:
                        glideinHost = matGkpr

                if glideinHost:
                    execHost = glideinHost
                    # strip possible leading https's and leading/trailing extra words
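                    # e.g. a (hypothetical) gatekeeper string
                    #   'TEST https://ce.example.org:2119/jobmanager-condor extra'
                    # yields execHost 'ce.example.org:2119/jobmanager-condor'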
                    for token in glideinHost.replace("https://","").split(" ") :
                        if token.find("/") != -1 :
                            execHost = token

                # Don't mess with jobs we're not interested in,
                # put what we found into BossLite statusRecord
                
                if schdId.has_key(condorId):
                #if bossStatus.has_key(schedd+'//'+jobId):
                    schedulerId = schdId[condorId]
                    statusRecord = {}
                    statusRecord['status']          = statusCodes.get(jobStatus, 'UN')
                    statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                    statusRecord['statusReason']    = ''
                    statusRecord['service']         = service
                    if execHost:
                        statusRecord['destination'] = execHost
                    if statusRecord['status'] == 'SD' :
                        somethingDone = True

                    bossStatus[schedulerId] = statusRecord

        if somethingDone :
            # get ExitCodes from fjrs"
            command = "%s %s %s " % \
            (self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
            command += '"cd %s; ' % (taskId)
            # single quotes, double quotes and a tab (\t) need to survive
            # the shell command for gsissh, hence the horrible escaping here;
            # be very careful with changes
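            # the remote side should end up running, roughly:
            #   egrep -H 'WrapperExitCode|ExeExitCode' crab_fjr_*xml \
            #     | tr '_.\t"' ' ' | awk '{print $3" "$7" "$9}'
            # i.e. one line per fjr: <crab jobId> <exit code> <code type>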
            command += "egrep -H WrapperExitCode\|ExeExitCode crab_fjr_*xml"
            command += "|tr '_.\\t\\\"' ' '"   # change all delim to blank for awk
            command += "|awk '{print \\$3\\\" \\\"\\$7\\\" \\\"\\$9}'\""
            self.logging.debug("Execute command :\n%s" % command)
            (status, output) = commands.getstatusoutput(command)
            self.logging.debug("Status,output= %s\n%s" %
                               (status, output))
            if (status) :
                if "already exists" in output:
                    self.removeGsisshSocket()

            ExeCodes={}
            WrapperCodes={}
            for line in output.split('\n'):
                if not line.strip():
                    continue   # e.g. empty egrep output
                jid,code,kind=line.split(' ')
                if kind == 'ExeExitCode' :
                    ExeCodes[int(jid)]=code
                if kind == 'WrapperExitCode' :
                    WrapperCodes[int(jid)]=code
        
        for job in obj.jobs:      # loop on crab job id's
            jid=job.runningJob['jobId']
            schedulerId = job.runningJob['schedulerId']
            if bossStatus.has_key(schedulerId):  # there's an update from condor
                for key, value in bossStatus[schedulerId].items():
                    job.runningJob[key] = value
                # if this is a newly terminated job, get the exit code w/o waiting for crab -get
                if somethingDone :
                    if ExeCodes.has_key(jid):
                        job.runningJob['applicationReturnCode']=ExeCodes[jid]
                    if WrapperCodes.has_key(jid):
                        job.runningJob['wrapperReturnCode']=WrapperCodes[jid]

        return
Code example #2
    def query(self, obj, service='', objType='node'):
        """
        query status of jobs
        """

        from xml.sax import make_parser
        from CondorHandler import CondorHandler
        from xml.sax.handler import feature_external_ges

        jobIds = {}
        bossIds = {}

        # FUTURE:
        #  Remove Condor < 7.3 when OK
        #  Use condor_q -attributes to limit the XML size. Faster on both ends
        # Convert Condor integer status to BossLite Status codes
        statusCodes = {'0':'RE', '1':'S', '2':'R',
                       '3':'K',  '4':'D', '5':'A'}
        textStatusCodes = {
                '0':'Ready',
                '1':'Submitted',
                '2':'Running',
                '3':'Cancelled',
                '4':'Done',
                '5':'Aborted'
        }

        if type(obj) == Task:
            taskId = obj['name']

            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue

                schedulerId = job.runningJob['schedulerId']

                # fix: skip if the Job was created but never submitted
                if job.runningJob['status'] == 'C' :
                    continue

                # Jobs are done by default
                bossIds[schedulerId] = {'status':'SD', 'statusScheduler':'Done'}
                schedd = schedulerId.split('//')[0]
                jobNum = schedulerId.split('//')[1]

                # Fill dictionary of schedd and job #'s to check
                if schedd in jobIds.keys():
                    jobIds[schedd].append(jobNum)
                else :
                    jobIds[schedd] = [jobNum]
        else:
            raise SchedulerError('Wrong argument type or object type',
                                  str(type(obj)) + ' ' + str(objType))

        for schedd in jobIds.keys() :
            cmd = 'condor_q -xml '
            if schedd != self.hostname:
                cmd += '-name ' + schedd + ' '
            cmd += """-constraint 'BLTaskID=?="%s"'""" % taskId

            pObj = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE,
                         stderr=STDOUT, close_fds=True)
            (inputFile, outputFp) = (pObj.stdin, pObj.stdout)
            try:
                xmlLine = outputFp.readline()
                while xmlLine and xmlLine.find('<?xml') == -1:
                    # Throw away junk for condor < 7.3, remove when obsolete
                    # (stop at EOF so we never loop forever on bad output)
                    xmlLine = outputFp.readline()

                outputFile = cStringIO.StringIO(xmlLine+outputFp.read())
                #outputFile = cStringIO.StringIO(outputFp.read()) # >7.3 vers.
            except:
                raise SchedulerError('Problem reading output of command', cmd)

            # close our end of the pipe, then check the command exit status
            # (close() on a subprocess pipe returns None, so use wait() instead)
            outputFp.close()
            if pObj.wait():
                raise SchedulerError("condor_q command or cache file failed.")

            handler = CondorHandler('GlobalJobId',
                       ['JobStatus', 'GridJobId','ProcId','ClusterId',
                        'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'])
            parser = make_parser()
            try:
                parser.setContentHandler(handler)
                parser.setFeature(feature_external_ges, False)
                parser.parse(outputFile)
            except:
                raise SchedulerError('Problem parsing output of command', cmd)

            jobDicts = handler.getJobInfo()

            for globalJobId in jobDicts.keys():
                clusterId = jobDicts[globalJobId].get('ClusterId', None)
                procId    = jobDicts[globalJobId].get('ProcId',    None)
                jobId = str(clusterId) + '.' + str(procId)
                jobStatus = jobDicts[globalJobId].get('JobStatus', None)

                # Host can be either in GridJobId or Glidein match
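                # e.g. a (hypothetical) GridJobId of
                #   'gt2 ce.example.org:2119/jobmanager-condor ...'
                # yields execHost 'ce.example.org'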
                execHost = None
                gridJobId = jobDicts[globalJobId].get('GridJobId', None)
                if gridJobId:
                    uri = gridJobId.split(' ')[1]
                    execHost = uri.split(':')[0]
                glideinHost = jobDicts[globalJobId].get('MATCH_GLIDEIN_Gatekeeper', None)
                if glideinHost:
                    execHost = glideinHost

                # Don't mess with jobs we're not interested in,
                # put what we found into BossLite statusRecord
                if bossIds.has_key(schedd+'//'+jobId):
                    statusRecord = {}
                    statusRecord['status']          = statusCodes.get(jobStatus, 'UN')
                    statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                    statusRecord['statusReason']    = ''
                    statusRecord['service']         = service
                    if execHost:
                        statusRecord['destination'] = execHost

                    bossIds[schedd + '//' + jobId] = statusRecord

        for job in obj.jobs:
            schedulerId = job.runningJob['schedulerId']
            if bossIds.has_key(schedulerId):
                for key, value in bossIds[schedulerId].items():
                    job.runningJob[key] = value
        return
Code example #3
    def query(self, obj, service='', objType='node'):
        """
        query status of jobs
        """

        from xml.sax import make_parser
        from CondorHandler import CondorHandler
        from xml.sax.handler import feature_external_ges

        # FUTURE:
        #  Use condor_history -attributes to limit the XML size. Faster on both ends
        # Convert Condor integer status to BossLite Status codes
        # Condor status is e.g. from http://pages.cs.wisc.edu/~adesmet/status.html#condor-jobstatus
        # 0	Unexpanded 	U
        # 1	Idle 	        I
        # 2	Running 	R
        # 3	Removed 	X
        # 4	Completed 	C
        # 5	Held 	        H
        # 6	Submission_err 	E

        statusCodes = {
            '0': 'RE',
            '1': 'S',
            '2': 'R',
            '3': 'K',
            '4': 'SD',
            '5': 'A'
        }
        textStatusCodes = {
            '0': 'Ready',
            '1': 'Submitted',
            '2': 'Running',
            '3': 'Cancelled',
            '4': 'Done',
            '5': 'Aborted'
        }

        if not type(obj) == Task:
            raise SchedulerError('Wrong argument type or object type',
                                 str(type(obj)) + ' ' + str(objType))

        taskId = obj['name']

        jobIds = {}
        bossStatus = {}
        schdId = {}

        somethingDone = False  # note if some jobs became Done in this round

        for job in obj.jobs:
            if not self.valid(job.runningJob):
                continue

            # skip if the Job was created but never submitted
            if job.runningJob['status'] == 'C':
                continue

            # skip if the Job is already Done, nothing more to ask glidein
            if job.runningJob['statusScheduler'] == 'Done':
                continue

            schedulerId = job.runningJob['schedulerId']

            # Jobs are done if condor_q/history does not list them.
            # Queries to a condor schedd will only return cluster.job,
            # so we need to cross link the two via the schdId[condorId] map
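            # e.g. a (hypothetical) schedulerId
            #   'schedd01.example.com//20101123//1234.0'
            # maps to condorId 'schedd01.example.com//1234.0'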
            bossStatus[schedulerId] = {
                'status': 'SD',
                'statusScheduler': 'Done'
            }
            schedd = schedulerId.split('//')[0]
            submissionDay = schedulerId.split('//')[1]
            jobNum = schedulerId.split('//')[2]
            condorId = schedd + '//' + jobNum
            schdId[condorId] = schedulerId

            # Fill dictionary of schedd and job #'s to check
            if schedd in jobIds.keys():
                jobIds[schedd].append(jobNum)
            else:
                jobIds[schedd] = [jobNum]

        if len(jobIds.keys()) > 0:
            # there is something to check on remote condor host
            self.initializeGsissh(obj)

        for schedd in jobIds.keys():
            if not schedd == self.remoteHost:
                self.logging.info(
                    "ERROR: found jobs for schedd %s in a task targeted for submission host %s"
                    % (schedd, self.remoteHost))
                raise Exception(
                    "Mixing schedd's in same task is not supported")

            # to begin with, push a fresh proxy to the remote host
            command = '%s %s %s %s:%s' % \
                      (self.remoteCopyCommand, self.gsisshOptions, \
                           self.x509Proxy(), self.remoteUserHost, taskId)
            self.logging.debug("Execute command :\n%s" % command)
            (status, output) = commands.getstatusoutput(command)
            self.logging.debug("Status,output= %s,%s" % (status, output))
            if (status):
                self.logging.error(
                    "Failed to renew proxy on remote submission host")
                self.logging.error("Command: %s failed with output=\n%s" %
                                   (command, output))
                if "already exists" in output:
                    self.removeGsisshSocket()

            command = "%s %s %s %s " % \
                (self.unsetenvScram, self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
            command += ' "condor_history -userlog %s/condor.log' % taskId
            command += ' -xml"'

            self.logging.debug("Execute command :\n%s" % command)

            pObj = subprocess.Popen(command,
                                    shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    close_fds=True)
            (inputFile, outputFp, errorFp) = (pObj.stdin, pObj.stdout,
                                              pObj.stderr)
            try:
                outputFile = cStringIO.StringIO(outputFp.read())  # >7.3 vers.
            except:
                raise SchedulerError('Problem reading output of command',
                                     command)

            # close our ends of the pipes, then check the command exit status
            # (close() on a subprocess pipe returns None, so use wait() instead)
            outputFp.close()
            try:  # close stderr from command ignoring errors
                errorFp.close()
            except:
                pass
            if pObj.wait():
                raise SchedulerError(
                    "condor_history command or cache file failed.")

            handler = CondorHandler('GlobalJobId', [
                'JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
                'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'
            ])

            parser = make_parser()
            try:
                parser.setContentHandler(handler)
                parser.setFeature(feature_external_ges, False)
                parser.parse(outputFile)
            except:
                self.logging.info("Unexpected exception: %s" %
                                  sys.exc_info()[0])
                self.logging.debug("Error parsing condor_history output:\n%s" %
                                   outputFile.getvalue())
                raise SchedulerError(
                    'Problem parsing output of condor_history command',
                    command)

            jobDicts = handler.getJobInfo()

            for globalJobId in jobDicts.keys():
                clusterId = jobDicts[globalJobId].get('ClusterId', None)
                procId = jobDicts[globalJobId].get('ProcId', None)
                jobId = str(clusterId) + '.' + str(procId)
                condorId = schedd + '//' + jobId
                jobStatus = jobDicts[globalJobId].get('JobStatus', None)

                # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
                execHost = None
                glideinHost = None
                jobGkpr = jobDicts[globalJobId].get('JOB_Gatekeeper', None)
                matGkpr = jobDicts[globalJobId].get('MATCH_GLIDEIN_Gatekeeper',
                                                    None)
                if jobGkpr and not "Unknown" in jobGkpr:
                    glideinHost = jobGkpr
                else:
                    if matGkpr:
                        glideinHost = matGkpr

                if glideinHost:
                    execHost = glideinHost
                    # strip possible leading https's and leading/trailing extra words
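                    # e.g. a (hypothetical) gatekeeper string
                    #   'TEST https://ce.example.org:2119/jobmanager-condor extra'
                    # yields execHost 'ce.example.org:2119/jobmanager-condor'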
                    for token in glideinHost.replace("https://",
                                                     "").split(" "):
                        if token.find("/") != -1:
                            execHost = token

                # Don't mess with jobs we're not interested in,
                # put what we found into BossLite statusRecord

                if schdId.has_key(condorId):
                    #if bossStatus.has_key(schedd+'//'+jobId):
                    schedulerId = schdId[condorId]
                    statusRecord = {}
                    statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                    statusRecord['statusScheduler'] = textStatusCodes.get(
                        jobStatus, 'Undefined')
                    statusRecord['statusReason'] = ''
                    statusRecord['service'] = service
                    if execHost:
                        statusRecord['destination'] = execHost
                    if statusRecord['status'] == 'SD':
                        somethingDone = True

                    bossStatus[schedulerId] = statusRecord

        if somethingDone:
            # get ExitCodes from fjrs"
            command = "%s %s %s %s " % \
            (self.unsetenvScram, self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
            command += '"cd %s; ' % (taskId)
            # single quotes, double quotes and a tab (\t) need to survive
            # the shell command for gsissh, hence the horrible escaping here;
            # be very careful with changes
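            # the remote side should end up running, roughly:
            #   egrep -H 'WrapperExitCode|ExeExitCode' crab_fjr_*xml \
            #     | tr '_.\t"' ' ' | awk '{print $3" "$7" "$9}'
            # i.e. one line per fjr: <crab jobId> <exit code> <code type>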
            command += "egrep -H WrapperExitCode\|ExeExitCode crab_fjr_*xml"
            command += "|tr '_.\\t\\\"' ' '"  # change all delim to blank for awk
            command += "|awk '{print \\$3\\\" \\\"\\$7\\\" \\\"\\$9}'\""
            self.logging.debug("Execute command :\n%s" % command)
            (status, output) = commands.getstatusoutput(command)
            self.logging.debug("Status,output= %s\n%s" % (status, output))
            if (status):
                if "already exists" in output:
                    self.removeGsisshSocket()

            ExeCodes = {}
            WrapperCodes = {}
            for line in output.split('\n'):
                if not line.strip():
                    continue  # e.g. empty egrep output
                jid, code, kind = line.split(' ')
                if kind == 'ExeExitCode':
                    ExeCodes[int(jid)] = code
                if kind == 'WrapperExitCode':
                    WrapperCodes[int(jid)] = code

        for job in obj.jobs:  # loop on crab job id's
            jid = job.runningJob['jobId']
            schedulerId = job.runningJob['schedulerId']
            if bossStatus.has_key(schedulerId):  # there's an update from condor
                for key, value in bossStatus[schedulerId].items():
                    job.runningJob[key] = value
                # if this is a newly terminated job, get the exit code w/o waiting for crab -get
                if somethingDone:
                    if ExeCodes.has_key(jid):
                        job.runningJob['applicationReturnCode'] = ExeCodes[jid]
                    if WrapperCodes.has_key(jid):
                        job.runningJob['wrapperReturnCode'] = WrapperCodes[jid]

        return
Code example #4
    def query(self, obj, service='', objType='node'):
        """
        query status of jobs
        """

        from xml.sax import make_parser
        from CondorHandler import CondorHandler
        from xml.sax.handler import feature_external_ges

        jobIds = {}
        bossIds = {}

        # FUTURE:
        #  Use condor_q -attributes to limit the XML size. Faster on both ends
        # Convert Condor integer status to BossLite Status codes
        statusCodes = {'0':'RE', '1':'S', '2':'R',
                       '3':'K',  '4':'SD', '5':'A'}
        textStatusCodes = {
                '0':'Ready',
                '1':'Submitted',
                '2':'Running',
                '3':'Cancelled',
                '4':'Done',
                '5':'Aborted'
        }

        if type(obj) == Task:
            taskId = obj['name']

            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue

                schedulerId = job.runningJob['schedulerId']

                # fix: skip if the Job was created but never submitted
                if job.runningJob['status'] == 'C' :
                    continue

                # Jobs are done if condor_q does not list them
                bossIds[schedulerId] = {'status':'SD', 'statusScheduler':'Done'}
                schedd = schedulerId.split('//')[0]
                jobNum = schedulerId.split('//')[1]

                # Fill dictionary of schedd and job #'s to check
                if schedd in jobIds.keys():
                    jobIds[schedd].append(jobNum)
                else :
                    jobIds[schedd] = [jobNum]
        else:
            raise SchedulerError('Wrong argument type or object type',
                                  str(type(obj)) + ' ' + str(objType))

        for schedd in jobIds.keys() :
            submissionHost = schedd
            
            # to begin with, push a fresh proxy to the remote host
            
            command = 'gsiscp %s %s %s@%s:%s' % \
                      (self.gsisshOptions, self.x509Proxy(), self.rcondorUser, submissionHost, taskId)
            self.logging.debug("Execute command :\n%s" % command)
            (status, output) = commands.getstatusoutput(command)
            self.logging.debug("Status,output= %s,%s" %
                    (status, output))

            command = "gsissh %s %s@%s " % (self.gsisshOptions, self.rcondorUser, submissionHost)
            command += ' "condor_q -userlog %s/condor.log' % taskId
            command += ' -xml"'

            self.logging.debug("Execute command :\n%s" % command)

            pObj = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE, close_fds=True)
            (inputFile, outputFp, errorFp) = (pObj.stdin, pObj.stdout, pObj.stderr)
            try:
                outputFile = cStringIO.StringIO(outputFp.read()) # >7.3 vers.
            except:
                raise SchedulerError('Problem reading output of command', command)

            # close our ends of the pipes, then check the command exit status
            # (close() on a subprocess pipe returns None, so use wait() instead)
            outputFp.close()
            try:    # close stderr from command ignoring errors
                errorFp.close()
            except:
                pass
            if pObj.wait():
                raise SchedulerError("condor_q command or cache file failed.")

            handler = CondorHandler('GlobalJobId',
                       ['JobStatus', 'GridJobId','ProcId','ClusterId',
                        'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper','GlobalJobId'])

            parser = make_parser()
            try:
                parser.setContentHandler(handler)
                parser.setFeature(feature_external_ges, False)
                parser.parse(outputFile)
            except:
                self.logging.info("Unexpected exception: %s" % sys.exc_info()[0])
                self.logging.debug("Error parsing condor_q output:\n%s" % outputFile.getvalue())
                raise SchedulerError('Problem parsing output of condor_q command', command)

            jobDicts = handler.getJobInfo()


            for globalJobId in jobDicts.keys():
                clusterId = jobDicts[globalJobId].get('ClusterId', None)
                procId    = jobDicts[globalJobId].get('ProcId',    None)
                jobId = str(clusterId) + '.' + str(procId)
                jobStatus = jobDicts[globalJobId].get('JobStatus', None)

                # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
                execHost = None
                glideinHost = None
                jobGkpr = jobDicts[globalJobId].get('JOB_Gatekeeper', None)
                matGkpr = jobDicts[globalJobId].get('MATCH_GLIDEIN_Gatekeeper', None)
                if jobGkpr and not "Unknown" in jobGkpr:
                    glideinHost = jobGkpr
                else:
                    if matGkpr:
                        glideinHost = matGkpr

                if glideinHost:
                    execHost = glideinHost
                    # strip possible leading https's and leading/trailing extra words
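                    # e.g. a (hypothetical) gatekeeper string
                    #   'TEST https://ce.example.org:2119/jobmanager-condor extra'
                    # yields execHost 'ce.example.org:2119/jobmanager-condor'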
                    for token in glideinHost.replace("https://","").split(" ") :
                        if token.find("/") != -1 :
                            execHost = token

                # Don't mess with jobs we're not interested in,
                # put what we found into BossLite statusRecord
                if bossIds.has_key(schedd+'//'+jobId):
                    statusRecord = {}
                    statusRecord['status']          = statusCodes.get(jobStatus, 'UN')
                    statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                    statusRecord['statusReason']    = ''
                    statusRecord['service']         = service
                    if execHost:
                        statusRecord['destination'] = execHost

                    bossIds[schedd + '//' + jobId] = statusRecord


        for job in obj.jobs:
            schedulerId = job.runningJob['schedulerId']
            if bossIds.has_key(schedulerId):
                for key, value in bossIds[schedulerId].items():
                    job.runningJob[key] = value

        return
Code example #5
    def query(self, obj, service='', objType='node'):
        """
        query status of jobs
        """

        from xml.sax import make_parser
        from CondorHandler import CondorHandler
        from xml.sax.handler import feature_external_ges

        jobIds = {}
        bossStatus = {}
        schdId = {}

        # FUTURE:
        #  Remove Condor < 7.3 when OK
        #  Use condor_q -attributes to limit the XML size. Faster on both ends
        # Convert Condor integer status to BossLite Status codes
        statusCodes = {
            '0': 'RE',
            '1': 'S',
            '2': 'R',
            '3': 'K',
            '4': 'D',
            '5': 'A'
        }
        textStatusCodes = {
            '0': 'Ready',
            '1': 'Submitted',
            '2': 'Running',
            '3': 'Cancelled',
            '4': 'Done',
            '5': 'Aborted'
        }

        if type(obj) == Task:
            taskId = obj['name']

            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue

                schedulerId = job.runningJob['schedulerId']

                # fix: skip if the Job was created but never submitted
                if job.runningJob['status'] == 'C':
                    continue

                # Jobs are done by default (i.e. if not found in condor's schedd)
                # boss tracks jobs by schedulerId in the format schedd//submissionday//cluster.job;
                # queries to a condor schedd will only return cluster.job,
                # so we need to cross link the two via the schdId[condorId] map
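                # e.g. a (hypothetical) schedulerId
                #   'schedd01.example.com//20101123//1234.0'
                # maps to condorId 'schedd01.example.com//1234.0'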
                bossStatus[schedulerId] = {
                    'status': 'SD',
                    'statusScheduler': 'Done'
                }

                # for the transition phase, be ready to handle old format
                tokens = schedulerId.split('//')
                schedd = tokens[0]
                if len(tokens) == 2:
                    submitDay = None
                    jobNum = tokens[1]
                else:
                    submitDay = tokens[1]
                    jobNum = tokens[2]
                condorId = schedd + '//' + jobNum
                schdId[condorId] = schedulerId

                # Fill dictionary of schedd and job #'s to check
                if schedd in jobIds.keys():
                    jobIds[schedd].append(jobNum)
                else:
                    jobIds[schedd] = [jobNum]
        else:
            raise SchedulerError('Wrong argument type or object type',
                                 str(type(obj)) + ' ' + str(objType))

        for schedd in jobIds.keys():
            cmd = 'condor_q -xml '
            if schedd != self.hostname:
                cmd += '-name ' + schedd + ' '
            cmd += """-constraint 'BLTaskID=?="%s"'""" % taskId

            pObj = Popen(cmd,
                         shell=True,
                         stdin=PIPE,
                         stdout=PIPE,
                         stderr=STDOUT,
                         close_fds=True)
            (inputFile, outputFp) = (pObj.stdin, pObj.stdout)
            try:
                xmlLine = outputFp.readline()
                while xmlLine and xmlLine.find('<?xml') == -1:
                    # Throw away junk for condor < 7.3, remove when obsolete
                    # (stop at EOF so we never loop forever on bad output)
                    xmlLine = outputFp.readline()

                outputFile = cStringIO.StringIO(xmlLine + outputFp.read())
                #outputFile = cStringIO.StringIO(outputFp.read()) # >7.3 vers.
            except:
                raise SchedulerError('Problem reading output of command', cmd)

            # close our end of the pipe, then check the command exit status
            # (close() on a subprocess pipe returns None, so use wait() instead)
            outputFp.close()
            if pObj.wait():
                raise SchedulerError("condor_q command or cache file failed.")

            handler = CondorHandler('GlobalJobId', [
                'JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
                'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'
            ])
            parser = make_parser()
            try:
                parser.setContentHandler(handler)
                parser.setFeature(feature_external_ges, False)
                parser.parse(outputFile)
            except:
                raise SchedulerError('Problem parsing output of command', cmd)

            jobDicts = handler.getJobInfo()

            for globalJobId in jobDicts.keys():
                clusterId = jobDicts[globalJobId].get('ClusterId', None)
                procId = jobDicts[globalJobId].get('ProcId', None)
                jobId = str(clusterId) + '.' + str(procId)
                condorId = schedd + '//' + jobId
                jobStatus = jobDicts[globalJobId].get('JobStatus', None)

                # Host can be either in GridJobId or Glidein match
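                # e.g. a (hypothetical) GridJobId of
                #   'gt2 ce.example.org:2119/jobmanager-condor ...'
                # yields execHost 'ce.example.org'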
                execHost = None
                gridJobId = jobDicts[globalJobId].get('GridJobId', None)
                if gridJobId:
                    uri = gridJobId.split(' ')[1]
                    execHost = uri.split(':')[0]
                glideinHost = jobDicts[globalJobId].get(
                    'MATCH_GLIDEIN_Gatekeeper', None)
                if glideinHost:
                    execHost = glideinHost
                    # strip possible leading https's and leading/trailing extra words
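                    # e.g. a (hypothetical) gatekeeper string
                    #   'TEST https://ce.example.org:2119/jobmanager-condor extra'
                    # yields execHost 'ce.example.org:2119/jobmanager-condor'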
                    for token in glideinHost.replace("https://",
                                                     "").split(" "):
                        if token.find("/") != -1:
                            execHost = token

                # Don't mess with jobs we're not interested in,
                # put what we found into BossLite statusRecord
                if schdId.has_key(condorId):
                    #if bossStatus.has_key(schedd+'//'+jobId):
                    schedulerId = schdId[condorId]
                    statusRecord = {}
                    statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                    statusRecord['statusScheduler'] = textStatusCodes.get(
                        jobStatus, 'Undefined')
                    statusRecord['statusReason'] = ''
                    statusRecord['service'] = service
                    if execHost:
                        statusRecord['destination'] = execHost

                    bossStatus[schedulerId] = statusRecord

        for job in obj.jobs:
            schedulerId = job.runningJob['schedulerId']
            if bossStatus.has_key(schedulerId):
                for key, value in bossStatus[schedulerId].items():
                    job.runningJob[key] = value
        return