def query(self, obj, service='', objType='node'):
    """
    Query the status of the jobs of a task on the remote condor host.

    Runs ``condor_history -userlog <task>/condor.log -xml`` on the remote
    submission host, parses the XML and writes the resulting status
    fields into ``job.runningJob`` for every job of ``obj``.  Jobs that
    were queried but are not listed in the output are marked Done.
    When at least one job is reported as Done, the exit codes are
    extracted from the crab_fjr_*xml files in the remote task directory
    and stored as 'applicationReturnCode' / 'wrapperReturnCode'.

    obj     : the Task whose jobs must be checked
    service : value stored in the 'service' field of each status record
    objType : only used to build the error message on a wrong argument

    Raises SchedulerError if obj is not a Task, if the remote command
    fails or if its output can not be parsed.  Raises Exception if the
    jobs of the task refer to a schedd other than self.remoteHost.
    """

    from xml.sax import make_parser
    from CondorHandler import CondorHandler
    from xml.sax.handler import feature_external_ges

    # FUTURE:
    #  Use condor_history -attributes to limit the XML size. Faster on both ends
    # Convert Condor integer status to BossLite Status codes
    # Condor status is e.g. from http://pages.cs.wisc.edu/~adesmet/status.html#condor-jobstatus
    # 0 Unexpanded      U
    # 1 Idle            I
    # 2 Running         R
    # 3 Removed         X
    # 4 Completed       C
    # 5 Held            H
    # 6 Submission_err  E
    statusCodes = {'0': 'RE', '1': 'S', '2': 'R',
                   '3': 'K', '4': 'SD', '5': 'A'}
    textStatusCodes = {
        '0': 'Ready',
        '1': 'Submitted',
        '2': 'Running',
        '3': 'Cancelled',
        '4': 'Done',
        '5': 'Aborted'
    }

    if not isinstance(obj, Task):
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)) + ' ' + str(objType))

    taskId = obj['name']
    jobIds = {}
    bossStatus = {}
    schdId = {}

    somethingDone = False    # note if some jobs became Done in this round

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        # skip if the Job was created but never submitted
        if job.runningJob['status'] == 'C':
            continue

        # skip if the Job is already Done, nothing more to ask glidein
        if job.runningJob['statusScheduler'] == 'Done':
            continue

        schedulerId = job.runningJob['schedulerId']

        # Jobs are done if condor_q/history does not list them
        # queries to condor schedd's will only return cluster.job
        # so needs to cross link the two via the schdId[condorId] map
        bossStatus[schedulerId] = {'status': 'SD', 'statusScheduler': 'Done'}

        # schedulerId is split in three '//' separated fields: the first
        # one is the schedd, the third one the condor job number
        tokens = schedulerId.split('//')
        schedd = tokens[0]
        jobNum = tokens[2]
        schdId[schedd + '//' + jobNum] = schedulerId

        # Fill dictionary of schedd and job #'s to check
        jobIds.setdefault(schedd, []).append(jobNum)

    if jobIds:
        # there is something to check on remote condor host
        self.initializeGsissh(obj)

    for schedd in jobIds:
        if schedd != self.remoteHost:
            self.logging.info("ERROR: found jobs for schedd %s in a task targetted for submission host %s" % (schedd, self.remoteHost))
            raise Exception("Mixing schedd's in same task is not supported")

        # to begin with, push a fresh proxy to the remote host
        command = '%s %s %s %s:%s' % \
                  (self.remoteCopyCommand, self.gsisshOptions,
                   self.x509Proxy(), self.remoteUserHost, taskId)
        self.logging.debug("Execute command :\n%s" % command)
        (status, output) = commands.getstatusoutput(command)
        self.logging.debug("Status,output= %s,%s" % (status, output))
        if status:
            self.logging.error("Failed to renew proxy on remote submission host")
            self.logging.error("Command: %s failed with output=\n%s" % (command, output))
            if "already exists" in output:
                self.removeGsisshSocket()

        command = "%s %s %s " % \
                  (self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
        command += ' "condor_history -userlog %s/condor.log' % taskId
        command += ' -xml"'

        self.logging.debug("Execute command :\n%s" % command)

        pObj = subprocess.Popen(command, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, close_fds=True)
        # communicate() drains stdout and stderr together (reading only
        # stdout can block for ever once the stderr pipe fills up) and
        # waits for the child, so that its exit code becomes available.
        try:
            (stdoutData, stderrData) = pObj.communicate()
        except Exception:
            raise SchedulerError('Problem reading output of command', command)

        # close() on a subprocess pipe never returns the exit code,
        # the exit status has to be taken from the Popen object
        if pObj.returncode:
            self.logging.debug("Command stderr:\n%s" % stderrData)
            raise SchedulerError("condor_history command or cache file failed.")

        outputFile = cStringIO.StringIO(stdoutData)

        handler = CondorHandler('GlobalJobId',
                                ['JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
                                 'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper',
                                 'GlobalJobId'])

        parser = make_parser()
        try:
            parser.setContentHandler(handler)
            parser.setFeature(feature_external_ges, False)
            parser.parse(outputFile)
        except Exception:
            self.logging.info("Unexpected exception: %s" % sys.exc_info()[0])
            self.logging.debug("Error parsing condor_history output:\n%s" % outputFile.getvalue())
            raise SchedulerError('Problem parsing output of condor_history command', command)

        for jobInfo in handler.getJobInfo().values():
            clusterId = jobInfo.get('ClusterId', None)
            procId = jobInfo.get('ProcId', None)
            jobId = str(clusterId) + '.' + str(procId)
            condorId = schedd + '//' + jobId
            jobStatus = jobInfo.get('JobStatus', None)

            # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
            execHost = None
            glideinHost = None
            jobGkpr = jobInfo.get('JOB_Gatekeeper', None)
            matGkpr = jobInfo.get('MATCH_GLIDEIN_Gatekeeper', None)
            if jobGkpr and "Unknown" not in jobGkpr:
                glideinHost = jobGkpr
            elif matGkpr:
                glideinHost = matGkpr
            if glideinHost:
                execHost = glideinHost
                # strip possible leading https's and leading/trailing extra words
                for token in glideinHost.replace("https://", "").split(" "):
                    if token.find("/") != -1:
                        execHost = token

            # Don't mess with jobs we're not interested in,
            # put what we found into BossLite statusRecord
            if condorId in schdId:
                schedulerId = schdId[condorId]
                statusRecord = {}
                statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                statusRecord['statusReason'] = ''
                statusRecord['service'] = service
                if execHost:
                    statusRecord['destination'] = execHost
                if statusRecord['status'] == 'SD':
                    somethingDone = True

                bossStatus[schedulerId] = statusRecord

    ExeCodes = {}
    WrapperCodes = {}
    if somethingDone:
        # get ExitCodes from fjrs
        command = "%s %s %s " % \
                  (self.remoteCommand, self.gsisshOptions, self.remoteUserHost)
        command += '"cd %s; ' % (taskId)
        # need to put single and double quotes and tab (\t) in
        # shell command for gsissh. So get horrible escaping here
        # be very careful with changes
        command += "egrep -H WrapperExitCode\\|ExeExitCode crab_fjr_*xml"
        command += "|tr '_.\\t\\\"' ' '"   # change all delim to blank for awk
        command += "|awk '{print \\$3\\\" \\\"\\$7\\\" \\\"\\$9}'\""
        self.logging.debug("Execute command :\n%s" % command)
        (status, output) = commands.getstatusoutput(command)
        self.logging.debug("Status,output= %s\n%s" % (status, output))
        if status:
            if "already exists" in output:
                self.removeGsisshSocket()

        for line in output.split('\n'):
            fields = line.split(' ')
            if len(fields) != 3:
                # empty output or error text, not a "jid code kind" line
                continue
            (jid, code, kind) = fields
            if kind == 'ExeExitCode':
                codes = ExeCodes
            elif kind == 'WrapperExitCode':
                codes = WrapperCodes
            else:
                continue
            try:
                codes[int(jid)] = code
            except ValueError:
                # the first field is not a job number, ignore the line
                continue

    for job in obj.jobs:    # loop on crab job id's
        jid = job.runningJob['jobId']
        schedulerId = job.runningJob['schedulerId']
        if schedulerId in bossStatus:    # there's an update from condor
            for key, value in bossStatus[schedulerId].items():
                job.runningJob[key] = value
            # if this a newly terminated job get the exit code w/o waiting for crab -get
            if somethingDone:
                if jid in ExeCodes:
                    job.runningJob['applicationReturnCode'] = ExeCodes[jid]
                if jid in WrapperCodes:
                    job.runningJob['wrapperReturnCode'] = WrapperCodes[jid]
    return
def query(self, obj, service='', objType='node'):
    """
    Query the status of the jobs of a task with condor_q.

    For every schedd referenced by the jobs of ``obj`` runs
    ``condor_q -xml`` with a constraint on BLTaskID, parses the XML and
    writes the resulting status fields into ``job.runningJob``.  Jobs
    that were queried but are not listed by condor_q are marked Done.

    obj     : the Task whose jobs must be checked
    service : value stored in the 'service' field of each status record
    objType : only used to build the error message on a wrong argument

    Raises SchedulerError if obj is not a Task, if condor_q fails, if
    its output has no XML declaration or if the XML can not be parsed.
    """

    from xml.sax import make_parser
    from CondorHandler import CondorHandler
    from xml.sax.handler import feature_external_ges

    jobIds = {}
    bossIds = {}

    # FUTURE:
    #  Remove Condor < 7.3 when OK
    #  Use condor_q -attributes to limit the XML size. Faster on both ends
    # Convert Condor integer status to BossLite Status codes
    statusCodes = {'0': 'RE', '1': 'S', '2': 'R',
                   '3': 'K', '4': 'D', '5': 'A'}
    textStatusCodes = {
        '0': 'Ready',
        '1': 'Submitted',
        '2': 'Running',
        '3': 'Cancelled',
        '4': 'Done',
        '5': 'Aborted'
    }

    if not isinstance(obj, Task):
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)) + ' ' + str(objType))

    taskId = obj['name']

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        schedulerId = job.runningJob['schedulerId']

        # fix: skip if the Job was created but never submitted
        if job.runningJob['status'] == 'C':
            continue

        # Jobs are done by default
        bossIds[schedulerId] = {'status': 'SD', 'statusScheduler': 'Done'}

        # schedulerId is schedd//cluster.job
        tokens = schedulerId.split('//')
        schedd = tokens[0]
        jobNum = tokens[1]

        # Fill dictionary of schedd and job #'s to check
        jobIds.setdefault(schedd, []).append(jobNum)

    for schedd in jobIds:
        cmd = 'condor_q -xml '
        if schedd != self.hostname:
            cmd += '-name ' + schedd + ' '
        cmd += """-constraint 'BLTaskID=?="%s"'""" % taskId

        pObj = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE,
                     stderr=STDOUT, close_fds=True)
        # communicate() reads everything and waits for the child, so
        # that its exit code becomes available
        try:
            rawOutput = pObj.communicate()[0]
        except Exception:
            raise SchedulerError('Problem reading output of command', cmd)

        # close() on a subprocess pipe never returns the exit code,
        # the exit status has to be taken from the Popen object
        if pObj.returncode:
            raise SchedulerError("condor_q command or cache file failed.")

        # Throw away junk for condor < 7.3, remove when obsolete.
        # Everything before the line holding the XML declaration is
        # dropped; no declaration at all is an error (a readline() loop
        # waiting for it would never end at end of file).
        xmlStart = rawOutput.find('<?xml')
        if xmlStart == -1:
            raise SchedulerError('Problem reading output of command', cmd)
        lineStart = rawOutput.rfind('\n', 0, xmlStart) + 1
        outputFile = cStringIO.StringIO(rawOutput[lineStart:])

        handler = CondorHandler('GlobalJobId',
                                ['JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
                                 'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'])

        parser = make_parser()
        try:
            parser.setContentHandler(handler)
            parser.setFeature(feature_external_ges, False)
            parser.parse(outputFile)
        except Exception:
            raise SchedulerError('Problem parsing output of command', cmd)

        for jobInfo in handler.getJobInfo().values():
            clusterId = jobInfo.get('ClusterId', None)
            procId = jobInfo.get('ProcId', None)
            jobId = str(clusterId) + '.' + str(procId)
            jobStatus = jobInfo.get('JobStatus', None)

            # Host can be either in GridJobId or Glidein match
            execHost = None
            gridJobId = jobInfo.get('GridJobId', None)
            if gridJobId:
                # the second blank separated word is used as host:port,
                # a GridJobId made of a single word gives no host
                gridTokens = gridJobId.split(' ')
                if len(gridTokens) > 1:
                    execHost = gridTokens[1].split(':')[0]
            glideinHost = jobInfo.get('MATCH_GLIDEIN_Gatekeeper', None)
            if glideinHost:
                execHost = glideinHost

            # Don't mess with jobs we're not interested in,
            # put what we found into BossLite statusRecord
            bossKey = schedd + '//' + jobId
            if bossKey in bossIds:
                statusRecord = {}
                statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                statusRecord['statusReason'] = ''
                statusRecord['service'] = service
                if execHost:
                    statusRecord['destination'] = execHost

                bossIds[bossKey] = statusRecord

    for job in obj.jobs:
        schedulerId = job.runningJob['schedulerId']
        if schedulerId in bossIds:
            for key, value in bossIds[schedulerId].items():
                job.runningJob[key] = value
    return
def query(self, obj, service='', objType='node'):
    """
    Query the status of the jobs of a task on the remote condor host.

    Runs ``condor_history -userlog <task>/condor.log -xml`` on the remote
    submission host (the command is prefixed with self.unsetenvScram),
    parses the XML and writes the resulting status fields into
    ``job.runningJob`` for every job of ``obj``.  Jobs that were queried
    but are not listed in the output are marked Done.  When at least one
    job is reported as Done, the exit codes are extracted from the
    crab_fjr_*xml files in the remote task directory and stored as
    'applicationReturnCode' / 'wrapperReturnCode'.

    obj     : the Task whose jobs must be checked
    service : value stored in the 'service' field of each status record
    objType : only used to build the error message on a wrong argument

    Raises SchedulerError if obj is not a Task, if the remote command
    fails or if its output can not be parsed.  Raises Exception if the
    jobs of the task refer to a schedd other than self.remoteHost.
    """

    from xml.sax import make_parser
    from CondorHandler import CondorHandler
    from xml.sax.handler import feature_external_ges

    # FUTURE:
    #  Use condor_history -attributes to limit the XML size. Faster on both ends
    # Convert Condor integer status to BossLite Status codes
    # Condor status is e.g. from http://pages.cs.wisc.edu/~adesmet/status.html#condor-jobstatus
    # 0 Unexpanded      U
    # 1 Idle            I
    # 2 Running         R
    # 3 Removed         X
    # 4 Completed       C
    # 5 Held            H
    # 6 Submission_err  E
    statusCodes = {
        '0': 'RE',
        '1': 'S',
        '2': 'R',
        '3': 'K',
        '4': 'SD',
        '5': 'A'
    }
    textStatusCodes = {
        '0': 'Ready',
        '1': 'Submitted',
        '2': 'Running',
        '3': 'Cancelled',
        '4': 'Done',
        '5': 'Aborted'
    }

    if not isinstance(obj, Task):
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)) + ' ' + str(objType))

    taskId = obj['name']
    jobIds = {}
    bossStatus = {}
    schdId = {}

    somethingDone = False  # note if some jobs became Done in this round

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        # skip if the Job was created but never submitted
        if job.runningJob['status'] == 'C':
            continue

        # skip if the Job is already Done, nothing more to ask glidein
        if job.runningJob['statusScheduler'] == 'Done':
            continue

        schedulerId = job.runningJob['schedulerId']

        # Jobs are done if condor_q/history does not list them
        # queries to condor schedd's will only return cluster.job
        # so needs to cross link the two via the schdId[condorId] map
        bossStatus[schedulerId] = {
            'status': 'SD',
            'statusScheduler': 'Done'
        }

        # schedulerId is split in three '//' separated fields: the first
        # one is the schedd, the third one the condor job number
        tokens = schedulerId.split('//')
        schedd = tokens[0]
        jobNum = tokens[2]
        schdId[schedd + '//' + jobNum] = schedulerId

        # Fill dictionary of schedd and job #'s to check
        jobIds.setdefault(schedd, []).append(jobNum)

    if jobIds:
        # there is something to check on remote condor host
        self.initializeGsissh(obj)

    for schedd in jobIds:
        if schedd != self.remoteHost:
            self.logging.info(
                "ERROR: found jobs for schedd %s in a task targetted for submission host %s"
                % (schedd, self.remoteHost))
            raise Exception(
                "Mixing schedd's in same task is not supported")

        # to begin with, push a fresh proxy to the remote host
        command = '%s %s %s %s:%s' % \
                  (self.remoteCopyCommand, self.gsisshOptions,
                   self.x509Proxy(), self.remoteUserHost, taskId)
        self.logging.debug("Execute command :\n%s" % command)
        (status, output) = commands.getstatusoutput(command)
        self.logging.debug("Status,output= %s,%s" % (status, output))
        if status:
            self.logging.error(
                "Failed to renew proxy on remote submission host")
            self.logging.error("Command: %s failed with output=\n%s" %
                               (command, output))
            if "already exists" in output:
                self.removeGsisshSocket()

        command = "%s %s %s %s " % \
                  (self.unsetenvScram, self.remoteCommand,
                   self.gsisshOptions, self.remoteUserHost)
        command += ' "condor_history -userlog %s/condor.log' % taskId
        command += ' -xml"'

        self.logging.debug("Execute command :\n%s" % command)

        pObj = subprocess.Popen(command,
                                shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                close_fds=True)
        # communicate() drains stdout and stderr together (reading only
        # stdout can block for ever once the stderr pipe fills up) and
        # waits for the child, so that its exit code becomes available.
        try:
            (stdoutData, stderrData) = pObj.communicate()
        except Exception:
            raise SchedulerError('Problem reading output of command',
                                 command)

        # close() on a subprocess pipe never returns the exit code,
        # the exit status has to be taken from the Popen object
        if pObj.returncode:
            self.logging.debug("Command stderr:\n%s" % stderrData)
            raise SchedulerError(
                "condor_history command or cache file failed.")

        outputFile = cStringIO.StringIO(stdoutData)

        handler = CondorHandler('GlobalJobId', [
            'JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
            'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'
        ])

        parser = make_parser()
        try:
            parser.setContentHandler(handler)
            parser.setFeature(feature_external_ges, False)
            parser.parse(outputFile)
        except Exception:
            self.logging.info("Unexpected exception: %s" %
                              sys.exc_info()[0])
            self.logging.debug("Error parsing condor_history output:\n%s" %
                               outputFile.getvalue())
            raise SchedulerError(
                'Problem parsing output of condor_history command',
                command)

        for jobInfo in handler.getJobInfo().values():
            clusterId = jobInfo.get('ClusterId', None)
            procId = jobInfo.get('ProcId', None)
            jobId = str(clusterId) + '.' + str(procId)
            condorId = schedd + '//' + jobId
            jobStatus = jobInfo.get('JobStatus', None)

            # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
            execHost = None
            glideinHost = None
            jobGkpr = jobInfo.get('JOB_Gatekeeper', None)
            matGkpr = jobInfo.get('MATCH_GLIDEIN_Gatekeeper', None)
            if jobGkpr and "Unknown" not in jobGkpr:
                glideinHost = jobGkpr
            elif matGkpr:
                glideinHost = matGkpr
            if glideinHost:
                execHost = glideinHost
                # strip possible leading https's and leading/trailing extra words
                for token in glideinHost.replace("https://", "").split(" "):
                    if token.find("/") != -1:
                        execHost = token

            # Don't mess with jobs we're not interested in,
            # put what we found into BossLite statusRecord
            if condorId in schdId:
                schedulerId = schdId[condorId]
                statusRecord = {}
                statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                statusRecord['statusScheduler'] = textStatusCodes.get(
                    jobStatus, 'Undefined')
                statusRecord['statusReason'] = ''
                statusRecord['service'] = service
                if execHost:
                    statusRecord['destination'] = execHost
                if statusRecord['status'] == 'SD':
                    somethingDone = True

                bossStatus[schedulerId] = statusRecord

    ExeCodes = {}
    WrapperCodes = {}
    if somethingDone:
        # get ExitCodes from fjrs
        command = "%s %s %s %s " % \
                  (self.unsetenvScram, self.remoteCommand,
                   self.gsisshOptions, self.remoteUserHost)
        command += '"cd %s; ' % (taskId)
        # need to put single and double quotes and tab (\t) in
        # shell command for gsissh. So get horrible escaping here
        # be very careful with changes
        command += "egrep -H WrapperExitCode\\|ExeExitCode crab_fjr_*xml"
        command += "|tr '_.\\t\\\"' ' '"  # change all delim to blank for awk
        command += "|awk '{print \\$3\\\" \\\"\\$7\\\" \\\"\\$9}'\""
        self.logging.debug("Execute command :\n%s" % command)
        (status, output) = commands.getstatusoutput(command)
        self.logging.debug("Status,output= %s\n%s" % (status, output))
        if status:
            if "already exists" in output:
                self.removeGsisshSocket()

        for line in output.split('\n'):
            fields = line.split(' ')
            if len(fields) != 3:
                # empty output or error text, not a "jid code kind" line
                continue
            (jid, code, kind) = fields
            if kind == 'ExeExitCode':
                codes = ExeCodes
            elif kind == 'WrapperExitCode':
                codes = WrapperCodes
            else:
                continue
            try:
                codes[int(jid)] = code
            except ValueError:
                # the first field is not a job number, ignore the line
                continue

    for job in obj.jobs:  # loop on crab job id's
        jid = job.runningJob['jobId']
        schedulerId = job.runningJob['schedulerId']
        if schedulerId in bossStatus:  # there's an update from condor
            for key, value in bossStatus[schedulerId].items():
                job.runningJob[key] = value
            # if this a newly terminated job get the exit code w/o waiting for crab -get
            if somethingDone:
                if jid in ExeCodes:
                    job.runningJob['applicationReturnCode'] = ExeCodes[jid]
                if jid in WrapperCodes:
                    job.runningJob['wrapperReturnCode'] = WrapperCodes[jid]
    return
def query(self, obj, service='', objType='node'):
    """
    Query the status of the jobs of a task through a remote condor_q.

    For every schedd referenced by the jobs of ``obj`` copies the proxy
    to that host with gsiscp, runs
    ``condor_q -userlog <task>/condor.log -xml`` there via gsissh,
    parses the XML and writes the resulting status fields into
    ``job.runningJob``.  Jobs that were queried but are not listed in
    the output are marked Done.

    obj     : the Task whose jobs must be checked
    service : value stored in the 'service' field of each status record
    objType : only used to build the error message on a wrong argument

    Raises SchedulerError if obj is not a Task, if the remote command
    fails or if its output can not be parsed.
    """

    from xml.sax import make_parser
    from CondorHandler import CondorHandler
    from xml.sax.handler import feature_external_ges

    jobIds = {}
    bossIds = {}

    # FUTURE:
    #  Use condor_q -attributes to limit the XML size. Faster on both ends
    # Convert Condor integer status to BossLite Status codes
    statusCodes = {'0': 'RE', '1': 'S', '2': 'R',
                   '3': 'K', '4': 'SD', '5': 'A'}
    textStatusCodes = {
        '0': 'Ready',
        '1': 'Submitted',
        '2': 'Running',
        '3': 'Cancelled',
        '4': 'Done',
        '5': 'Aborted'
    }

    if not isinstance(obj, Task):
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)) + ' ' + str(objType))

    taskId = obj['name']

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        schedulerId = job.runningJob['schedulerId']

        # fix: skip if the Job was created but never submitted
        if job.runningJob['status'] == 'C':
            continue

        # Jobs are done if condor_q does not list them
        bossIds[schedulerId] = {'status': 'SD', 'statusScheduler': 'Done'}

        # schedulerId is schedd//cluster.job
        tokens = schedulerId.split('//')
        schedd = tokens[0]
        jobNum = tokens[1]

        # Fill dictionary of schedd and job #'s to check
        jobIds.setdefault(schedd, []).append(jobNum)

    for schedd in jobIds:
        submissionHost = schedd

        # to begin with, push a fresh proxy to the remote host
        command = 'gsiscp %s %s %s@%s:%s' % \
                  (self.gsisshOptions, self.x509Proxy(),
                   self.rcondorUser, submissionHost, taskId)
        self.logging.debug("Execute command :\n%s" % command)
        (status, output) = commands.getstatusoutput(command)
        self.logging.debug("Status,output= %s,%s" % (status, output))

        command = "gsissh %s %s@%s " % (self.gsisshOptions,
                                        self.rcondorUser, submissionHost)
        command += ' "condor_q -userlog %s/condor.log' % taskId
        command += ' -xml"'

        self.logging.debug("Execute command :\n%s" % command)

        pObj = subprocess.Popen(command, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, close_fds=True)
        # communicate() drains stdout and stderr together (reading only
        # stdout can block for ever once the stderr pipe fills up) and
        # waits for the child, so that its exit code becomes available.
        try:
            (stdoutData, stderrData) = pObj.communicate()
        except Exception:
            raise SchedulerError('Problem reading output of command', command)

        # close() on a subprocess pipe never returns the exit code,
        # the exit status has to be taken from the Popen object
        if pObj.returncode:
            self.logging.debug("Command stderr:\n%s" % stderrData)
            raise SchedulerError("condor_q command or cache file failed.")

        outputFile = cStringIO.StringIO(stdoutData)

        handler = CondorHandler('GlobalJobId',
                                ['JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
                                 'JOB_Gatekeeper', 'MATCH_GLIDEIN_Gatekeeper',
                                 'GlobalJobId'])

        parser = make_parser()
        try:
            parser.setContentHandler(handler)
            parser.setFeature(feature_external_ges, False)
            parser.parse(outputFile)
        except Exception:
            self.logging.info("Unexpected exception: %s" % sys.exc_info()[0])
            self.logging.debug("Error parsing condor_q output:\n%s" % outputFile.getvalue())
            raise SchedulerError('Problem parsing output of condor_q command', command)

        for jobInfo in handler.getJobInfo().values():
            clusterId = jobInfo.get('ClusterId', None)
            procId = jobInfo.get('ProcId', None)
            jobId = str(clusterId) + '.' + str(procId)
            jobStatus = jobInfo.get('JobStatus', None)

            # Host can be either in Job_Gatekeeper or MATCH_GLIDEIN_Gatekeeper
            execHost = None
            glideinHost = None
            jobGkpr = jobInfo.get('JOB_Gatekeeper', None)
            matGkpr = jobInfo.get('MATCH_GLIDEIN_Gatekeeper', None)
            if jobGkpr and "Unknown" not in jobGkpr:
                glideinHost = jobGkpr
            elif matGkpr:
                glideinHost = matGkpr
            if glideinHost:
                execHost = glideinHost
                # strip possible leading https's and leading/trailing extra words
                for token in glideinHost.replace("https://", "").split(" "):
                    if token.find("/") != -1:
                        execHost = token

            # Don't mess with jobs we're not interested in,
            # put what we found into BossLite statusRecord
            bossKey = schedd + '//' + jobId
            if bossKey in bossIds:
                statusRecord = {}
                statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                statusRecord['statusScheduler'] = textStatusCodes.get(jobStatus, 'Undefined')
                statusRecord['statusReason'] = ''
                statusRecord['service'] = service
                if execHost:
                    statusRecord['destination'] = execHost

                bossIds[bossKey] = statusRecord

    for job in obj.jobs:
        schedulerId = job.runningJob['schedulerId']
        if schedulerId in bossIds:
            for key, value in bossIds[schedulerId].items():
                job.runningJob[key] = value
    return
def query(self, obj, service='', objType='node'):
    """
    Query the status of the jobs of a task with condor_q.

    For every schedd referenced by the jobs of ``obj`` runs
    ``condor_q -xml`` with a constraint on BLTaskID, parses the XML and
    writes the resulting status fields into ``job.runningJob``.  Jobs
    that were queried but are not listed by condor_q are marked Done.
    Both schedulerId formats are accepted: schedd//cluster.job and
    schedd//submissionday//cluster.job.

    obj     : the Task whose jobs must be checked
    service : value stored in the 'service' field of each status record
    objType : only used to build the error message on a wrong argument

    Raises SchedulerError if obj is not a Task, if condor_q fails, if
    its output has no XML declaration or if the XML can not be parsed.
    """

    from xml.sax import make_parser
    from CondorHandler import CondorHandler
    from xml.sax.handler import feature_external_ges

    jobIds = {}
    bossStatus = {}
    schdId = {}

    # FUTURE:
    #  Remove Condor < 7.3 when OK
    #  Use condor_q -attributes to limit the XML size. Faster on both ends
    # Convert Condor integer status to BossLite Status codes
    statusCodes = {
        '0': 'RE',
        '1': 'S',
        '2': 'R',
        '3': 'K',
        '4': 'D',
        '5': 'A'
    }
    textStatusCodes = {
        '0': 'Ready',
        '1': 'Submitted',
        '2': 'Running',
        '3': 'Cancelled',
        '4': 'Done',
        '5': 'Aborted'
    }

    if not isinstance(obj, Task):
        raise SchedulerError('Wrong argument type or object type',
                             str(type(obj)) + ' ' + str(objType))

    taskId = obj['name']

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        schedulerId = job.runningJob['schedulerId']

        # fix: skip if the Job was created but never submitted
        if job.runningJob['status'] == 'C':
            continue

        # Jobs are done by default (i.e. if not found in condor's schedd)
        # boss tracks by schedulerId in the format schedd//submissionday//cluster.job
        # queries to condor schedd's will only return cluster.job
        # so needs to cross link the two via the schdId[condorId] map
        bossStatus[schedulerId] = {
            'status': 'SD',
            'statusScheduler': 'Done'
        }

        # for the transition phase, be ready to handle old format
        tokens = schedulerId.split('//')
        schedd = tokens[0]
        if len(tokens) == 2:
            jobNum = tokens[1]    # old format, no submission day
        else:
            jobNum = tokens[2]
        schdId[schedd + '//' + jobNum] = schedulerId

        # Fill dictionary of schedd and job #'s to check
        jobIds.setdefault(schedd, []).append(jobNum)

    for schedd in jobIds:
        cmd = 'condor_q -xml '
        if schedd != self.hostname:
            cmd += '-name ' + schedd + ' '
        cmd += """-constraint 'BLTaskID=?="%s"'""" % taskId

        pObj = Popen(cmd,
                     shell=True,
                     stdin=PIPE,
                     stdout=PIPE,
                     stderr=STDOUT,
                     close_fds=True)
        # communicate() reads everything and waits for the child, so
        # that its exit code becomes available
        try:
            rawOutput = pObj.communicate()[0]
        except Exception:
            raise SchedulerError('Problem reading output of command', cmd)

        # close() on a subprocess pipe never returns the exit code,
        # the exit status has to be taken from the Popen object
        if pObj.returncode:
            raise SchedulerError("condor_q command or cache file failed.")

        # Throw away junk for condor < 7.3, remove when obsolete.
        # Everything before the line holding the XML declaration is
        # dropped; no declaration at all is an error (a readline() loop
        # waiting for it would never end at end of file).
        xmlStart = rawOutput.find('<?xml')
        if xmlStart == -1:
            raise SchedulerError('Problem reading output of command', cmd)
        lineStart = rawOutput.rfind('\n', 0, xmlStart) + 1
        outputFile = cStringIO.StringIO(rawOutput[lineStart:])

        handler = CondorHandler('GlobalJobId', [
            'JobStatus', 'GridJobId', 'ProcId', 'ClusterId',
            'MATCH_GLIDEIN_Gatekeeper', 'GlobalJobId'
        ])

        parser = make_parser()
        try:
            parser.setContentHandler(handler)
            parser.setFeature(feature_external_ges, False)
            parser.parse(outputFile)
        except Exception:
            raise SchedulerError('Problem parsing output of command', cmd)

        for jobInfo in handler.getJobInfo().values():
            clusterId = jobInfo.get('ClusterId', None)
            procId = jobInfo.get('ProcId', None)
            jobId = str(clusterId) + '.' + str(procId)
            condorId = schedd + '//' + jobId
            jobStatus = jobInfo.get('JobStatus', None)

            # Host can be either in GridJobId or Glidein match
            execHost = None
            gridJobId = jobInfo.get('GridJobId', None)
            if gridJobId:
                # the second blank separated word is used as host:port,
                # a GridJobId made of a single word gives no host
                gridTokens = gridJobId.split(' ')
                if len(gridTokens) > 1:
                    execHost = gridTokens[1].split(':')[0]
            glideinHost = jobInfo.get('MATCH_GLIDEIN_Gatekeeper', None)
            if glideinHost:
                execHost = glideinHost
                # strip possible leading https's and leading/trailing extra words
                for token in glideinHost.replace("https://", "").split(" "):
                    if token.find("/") != -1:
                        execHost = token

            # Don't mess with jobs we're not interested in,
            # put what we found into BossLite statusRecord
            if condorId in schdId:
                schedulerId = schdId[condorId]
                statusRecord = {}
                statusRecord['status'] = statusCodes.get(jobStatus, 'UN')
                statusRecord['statusScheduler'] = textStatusCodes.get(
                    jobStatus, 'Undefined')
                statusRecord['statusReason'] = ''
                statusRecord['service'] = service
                if execHost:
                    statusRecord['destination'] = execHost

                bossStatus[schedulerId] = statusRecord

    for job in obj.jobs:
        schedulerId = job.runningJob['schedulerId']
        if schedulerId in bossStatus:
            for key, value in bossStatus[schedulerId].items():
                job.runningJob[key] = value
    return