def submitJob( self, executableFile, proxy, numberOfJobs = 1, processors = 1 ):
  """ Method to submit job
  """
  self.log.verbose( "Executable file path: %s" % executableFile )
  if not os.access( executableFile, 5 ):
    os.chmod( executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH )

  batchIDList = []
  stampDict = {}
  if numberOfJobs == 1:
    jdlName, diracStamp = self.__writeJDL( executableFile, processors = processors )
    cmd = ['glite-ce-job-submit', '-n', '-a', '-N', '-r',
           '%s/%s' % ( self.ceName, self.queue ),
           '%s' % jdlName ]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    os.unlink( jdlName )
    if result['OK']:
      if result['Value'][0]:
        # We have got a non-zero status code
        errorString = '\n'.join( result['Value'][1:] ).strip()
        return S_ERROR( 'Pilot submission failed with error: %s ' % errorString )
      pilotJobReference = result['Value'][1].strip()
      if not pilotJobReference:
        return S_ERROR( 'No pilot reference returned from the glite job submission command' )
      if not pilotJobReference.startswith( 'https' ):
        return S_ERROR( 'Invalid pilot reference %s' % pilotJobReference )
      batchIDList.append( pilotJobReference )
      stampDict[pilotJobReference] = diracStamp
  else:
    delegationID = makeGuid()
    cmd = [ 'glite-ce-delegate-proxy', '-e', '%s' % self.ceName, '%s' % delegationID ]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    if not result['OK']:
      self.log.error( 'Failed to delegate proxy', result['Message'] )
      return result
    for _i in range( numberOfJobs ):
      jdlName, diracStamp = self.__writeJDL( executableFile, processors = processors )
      cmd = ['glite-ce-job-submit', '-n', '-N', '-r',
             '%s/%s' % ( self.ceName, self.queue ),
             '-D', '%s' % delegationID,
             '%s' % jdlName ]
      result = executeGridCommand( self.proxy, cmd, self.gridEnv )
      os.unlink( jdlName )
      if not result['OK']:
        break
      if result['Value'][0] != 0:
        break
      pilotJobReference = result['Value'][1].strip()
      if pilotJobReference and pilotJobReference.startswith( 'https' ):
        batchIDList.append( pilotJobReference )
        stampDict[pilotJobReference] = diracStamp
      else:
        break

  if batchIDList:
    result = S_OK( batchIDList )
    result['PilotStampDict'] = stampDict
  else:
    result = S_ERROR( 'No pilot references obtained from the glite job submission' )
  return result
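# --- Note: every snippet in this section relies on the same DIRAC return-value
# convention. A minimal sketch of it is given below (illustrative, not the actual
# DIRAC source): S_OK/S_ERROR build the result dictionaries, and executeGridCommand,
# when the process could be spawned, wraps the ( exitStatus, stdout, stderr ) tuple
# of the grid CLI in S_OK -- hence the recurring result['Value'][0] / [1] / [2]
# indexing throughout.

def S_OK( value = None ):
  return { 'OK': True, 'Value': value }

def S_ERROR( message = '' ):
  return { 'OK': False, 'Message': message }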
def getJobStatus( self, jobIDList ):
  """ Get the status information for the given list of jobs
  """
  if self.proxyRenewal % 60 == 0:
    self.proxyRenewal += 1
    statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
    cmd = ['glite-ce-job-status', '-L', '2', '--all', '-e',
           '%s' % self.ceName, '-s', '%s' % ':'.join( statusList ) ]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    if result['OK']:
      delegationIDs = []
      for line in result['Value'][1].split( '\n' ):
        if line.find( 'Deleg Proxy ID' ) != -1:
          delegationID = line.split()[-1].replace( '[', '' ).replace( ']', '' )
          if delegationID not in delegationIDs:
            delegationIDs.append( delegationID )
      if delegationIDs:
        cmd = ['glite-ce-proxy-renew', '-e', self.ceName ]
        cmd.extend( delegationIDs )
        self.log.info( 'Refreshing proxy for:', ' '.join( delegationIDs ) )
        result = executeGridCommand( self.proxy, cmd, self.gridEnv )

  workingDirectory = self.ceParameters['WorkingDirectory']
  fd, idFileName = tempfile.mkstemp( suffix = '.ids', prefix = 'CREAM_', dir = workingDirectory )
  idFile = os.fdopen( fd, 'w' )
  idFile.write( '##CREAMJOBS##' )
  for id_ in jobIDList:
    if ":::" in id_:
      ref, stamp = id_.split( ':::' )
    else:
      ref = id_
    idFile.write( '\n' + ref )
  idFile.close()

  cmd = ['glite-ce-job-status', '-n', '-i', '%s' % idFileName ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  os.unlink( idFileName )
  resultDict = {}
  if not result['OK']:
    self.log.error( 'Failed to get job status', result['Message'] )
    return result
  if result['Value'][0]:
    if result['Value'][2]:
      return S_ERROR( result['Value'][2] )
    else:
      return S_ERROR( 'Error while interrogating job statuses' )
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )
  if not resultDict:
    return S_ERROR( 'No job statuses returned' )

  # If CE does not know about a job, set the status to Unknown
  for job in jobIDList:
    if not resultDict.has_key( job ):
      resultDict[job] = 'Unknown'
  return S_OK( resultDict )
def submitJob( self, executableFile, proxy, numberOfJobs = 1 ):
  """ Method to submit job
  """
  self.log.verbose( "Executable file path: %s" % executableFile )
  if not os.access( executableFile, 5 ):
    os.chmod( executableFile, 0755 )

  batchIDList = []
  stampDict = {}
  if numberOfJobs == 1:
    jdlName, diracStamp = self.__writeJDL( executableFile )
    cmd = ['glite-ce-job-submit', '-n', '-a', '-N', '-r',
           '%s/%s' % ( self.ceName, self.queue ),
           '%s' % jdlName ]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    if result['OK']:
      pilotJobReference = result['Value'][1].strip()
      batchIDList.append( pilotJobReference )
      stampDict[pilotJobReference] = diracStamp
    os.unlink( jdlName )
  else:
    delegationID = makeGuid()
    cmd = [ 'glite-ce-delegate-proxy', '-e', '%s' % self.ceName, '%s' % delegationID ]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    if not result['OK']:
      self.log.error( 'Failed to delegate proxy: %s' % result['Message'] )
      return result
    for i in range( numberOfJobs ):
      jdlName, diracStamp = self.__writeJDL( executableFile )
      cmd = ['glite-ce-job-submit', '-n', '-N', '-r',
             '%s/%s' % ( self.ceName, self.queue ),
             '-D', '%s' % delegationID,
             '%s' % jdlName ]
      result = executeGridCommand( self.proxy, cmd, self.gridEnv )
      if not result['OK']:
        break
      if result['Value'][0] != 0:
        break
      pilotJobReference = result['Value'][1].strip()
      if pilotJobReference:
        batchIDList.append( pilotJobReference )
        stampDict[pilotJobReference] = diracStamp
      else:
        break
      os.unlink( jdlName )
  os.unlink( executableFile )

  if batchIDList:
    result = S_OK( batchIDList )
    result['PilotStampDict'] = stampDict
  else:
    result = S_ERROR( 'No pilot references obtained from the glite job submission' )
  return result
def getJobOutput( self, jobID, localDir = None ):
  """ Get the specified job standard output and error files. If the localDir is provided,
      the output is returned as file in this directory. Otherwise, the output is returned
      as strings.
  """
  if jobID.find( ':::' ) != -1:
    pilotRef, stamp = jobID.split( ':::' )
  else:
    pilotRef = jobID
    stamp = ''
  if not stamp:
    return S_ERROR( 'Pilot stamp not defined for %s' % pilotRef )

  outURL = self.ceParameters.get( 'OutputURL', 'gsiftp://localhost' )
  if outURL == 'gsiftp://localhost':
    result = self.__resolveOutputURL( pilotRef )
    if not result['OK']:
      return result
    outURL = result['Value']

  outputURL = os.path.join( outURL, '%s.out' % stamp )
  errorURL = os.path.join( outURL, '%s.err' % stamp )
  workingDirectory = self.ceParameters['WorkingDirectory']
  outFileName = os.path.join( workingDirectory, os.path.basename( outputURL ) )
  errFileName = os.path.join( workingDirectory, os.path.basename( errorURL ) )

  cmd = ['globus-url-copy', '%s' % outputURL, 'file://%s' % outFileName ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  output = ''
  if result['OK']:
    if not result['Value'][0]:
      outFile = open( outFileName, 'r' )
      output = outFile.read()
      outFile.close()
      os.unlink( outFileName )
    else:
      error = '\n'.join( result['Value'][1:] )
      return S_ERROR( error )
  else:
    return S_ERROR( 'Failed to retrieve output for %s' % jobID )

  cmd = ['globus-url-copy', '%s' % errorURL, '%s' % errFileName ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  error = ''
  if result['OK']:
    if not result['Value'][0]:
      errFile = open( errFileName, 'r' )
      error = errFile.read()
      errFile.close()
      os.unlink( errFileName )
  else:
    return S_ERROR( 'Failed to retrieve error for %s' % jobID )

  return S_OK( ( output, error ) )
def getJobOutput(self, jobID, localDir=None):
    """ Get the specified job standard output and error files. If the localDir is provided,
        the output is returned as file in this directory. Otherwise, the output is returned
        as strings.
    """
    if jobID.find(":::") != -1:
        pilotRef, stamp = jobID.split(":::")
    else:
        pilotRef = jobID
        stamp = ""
    if not stamp:
        return S_ERROR("Pilot stamp not defined for %s" % pilotRef)

    outURL = self.ceParameters.get("OutputURL", "gsiftp://localhost")
    if outURL == "gsiftp://localhost":
        result = self.__resolveOutputURL(pilotRef)
        if not result["OK"]:
            return result
        outURL = result["Value"]

    outputURL = os.path.join(outURL, "%s.out" % stamp)
    errorURL = os.path.join(outURL, "%s.err" % stamp)
    workingDirectory = self.ceParameters["WorkingDirectory"]
    outFileName = os.path.join(workingDirectory, os.path.basename(outputURL))
    errFileName = os.path.join(workingDirectory, os.path.basename(errorURL))

    cmd = ["globus-url-copy", "%s" % outputURL, "file://%s" % outFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    output = ""
    if result["OK"]:
        if not result["Value"][0]:
            outFile = open(outFileName, "r")
            output = outFile.read()
            outFile.close()
            os.unlink(outFileName)
    else:
        return S_ERROR("Failed to retrieve output for %s" % jobID)

    cmd = ["globus-url-copy", "%s" % errorURL, "%s" % errFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    error = ""
    if result["OK"]:
        if not result["Value"][0]:
            errFile = open(errFileName, "r")
            error = errFile.read()
            errFile.close()
            os.unlink(errFileName)
    else:
        return S_ERROR("Failed to retrieve error for %s" % jobID)

    return S_OK((output, error))
def getPilotLoggingInfo( grid, pilotRef ):
  """ Get LoggingInfo of a GRID job
  """
  if grid == 'gLite':
    cmd = [ 'glite-wms-job-logging-info', '-v', '3', '--noint', pilotRef ]
  elif grid == 'CREAM':
    cmd = [ 'glite-ce-job-status', '-L', '2', '%s' % pilotRef ]
  elif grid == 'HTCondorCE':
    ## need to import here, otherwise import errors happen
    from DIRAC.Resources.Computing.HTCondorCEComputingElement import getCondorLogFile
    resLog = getCondorLogFile( pilotRef )
    if not resLog['OK']:
      return resLog
    logFile = resLog['Value']
    cmd = [ 'cat', " ".join( logFile ) ]
  else:
    return S_ERROR( 'Pilot logging not available for %s CEs' % grid )

  gridEnv = getGridEnv()
  ret = executeGridCommand( '', cmd, gridEnv )
  if not ret['OK']:
    return ret
  status, output, error = ret['Value']
  if status:
    return S_ERROR( error )
  return S_OK( output )
def __getSummary( self ):
  res = self.__isSummaryValid()
  if not res['OK']:
    return res
  comm = ['glite-transfer-status', '--verbose', '-s', self.ftsServer, self.ftsGUID]
  res = executeGridCommand( '', comm, self.gridEnv )
  if not res['OK']:
    return res
  returnCode, output, errStr = res['Value']
  # Returns a non zero status if error
  if not returnCode == 0:
    return S_ERROR( errStr )
  # Parse the output to get a summary dictionary
  lines = output.splitlines()
  summaryDict = {}
  for line in lines:
    line = line.split( ':\t' )
    key = line[0].replace( '\t', '' )
    value = line[1].replace( '\t', '' )
    summaryDict[key] = value
  self.requestStatus = summaryDict['Status']
  self.submitTime = summaryDict['Submit time']
  self.statusSummary = {}
  for status in self.fileStates:
    if summaryDict[status] != '0':
      self.statusSummary[status] = int( summaryDict[status] )
  return S_OK()
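# --- Illustration of the summary format __getSummary assumes: each line of the
# verbose glite-transfer-status output is a tab-separated "key:<tab>value" pair.
# The sample text below is purely illustrative, not captured tool output.

sampleSummary = "Status:\tActive\nSubmit time:\t2013-01-01 12:00:00\nActive:\t3\nFinished:\t0"
for line in sampleSummary.splitlines():
  key, value = line.split( ':\t' )
  print key.replace( '\t', '' ), '->', value.replace( '\t', '' )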
def getJobStatus( self, jobIDList ):
  """ Get the status information for the given list of jobs
  """
  workingDirectory = self.ceParameters['WorkingDirectory']
  fd, idFileName = tempfile.mkstemp( suffix = '.ids', prefix = 'CREAM_', dir = workingDirectory )
  idFile = os.fdopen( fd, 'w' )
  idFile.write( '##CREAMJOBS##' )
  for id in jobIDList:
    idFile.write( '\n' + id )
  idFile.close()

  cmd = ['glite-ce-job-status', '-n', '-i', '%s' % idFileName ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  os.unlink( idFileName )
  resultDict = {}
  if not result['OK']:
    self.log.error( 'Failed to get job status', result['Message'] )
    return result
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )

  # If CE does not know about a job, set the status to Unknown
  for job in jobIDList:
    if not resultDict.has_key( job ):
      resultDict[job] = 'Unknown'
  return S_OK( resultDict )
def getCEStatus(self):
    """ Method to return information on running and pending jobs.
    """
    statusList = ["REGISTERED", "PENDING", "IDLE", "RUNNING", "REALLY-RUNNING"]
    cmd = ["glite-ce-job-status", "-n", "-a", "-e",
           "%s" % self.ceName, "-s", "%s" % ":".join(statusList)]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    resultDict = {}
    if not result["OK"]:
        return result
    if result["Value"][0]:
        if result["Value"][2]:
            return S_ERROR(result["Value"][2])
        else:
            return S_ERROR("Error while interrogating CE status")
    if result["Value"][1]:
        resultDict = self.__parseJobStatus(result["Value"][1])

    running = 0
    waiting = 0
    for ref, status in resultDict.items():
        if status == "Scheduled":
            waiting += 1
        if status == "Running":
            running += 1

    result = S_OK()
    result["RunningJobs"] = running
    result["WaitingJobs"] = waiting
    result["SubmittedJobs"] = 0
    return result
def getCEStatus( self ):
  """ Method to return information on running and pending jobs.
  """
  statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
  cmd = ['glite-ce-job-status', '-n', '-a', '-e',
         '%s' % self.ceName, '-s', '%s' % ':'.join( statusList ) ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  resultDict = {}
  if not result['OK']:
    return result
  if result['Value'][0]:
    if result['Value'][2]:
      return S_ERROR( result['Value'][2] )
    else:
      return S_ERROR( 'Error while interrogating CE status' )
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )

  running = 0
  waiting = 0
  for ref, status in resultDict.items():
    if status == 'Scheduled':
      waiting += 1
    if status == 'Running':
      running += 1

  result = S_OK()
  result['RunningJobs'] = running
  result['WaitingJobs'] = waiting
  result['SubmittedJobs'] = 0
  return result
def submitFTS2(self, stageFiles=False):
    """ submit fts job using FTS2 client """
    if self.FTSGUID:
        return S_ERROR("FTSJob already has been submitted")
    surls = self._surlPairs()
    if not surls:
        return S_ERROR("No files to submit")
    fd, fileName = tempfile.mkstemp()
    surlFile = os.fdopen(fd, "w")
    surlFile.write(surls)
    surlFile.close()
    submitCommand = ["glite-transfer-submit",
                     "-s", self.FTSServer,
                     "-f", fileName,
                     "-o",
                     "--compare-checksums"]
    # options and their values must be separate argv elements,
    # otherwise the CLI receives e.g. a single "-t <token>" argument
    if self.TargetToken:
        submitCommand += ["-t", self.TargetToken]
    if self.SourceToken:
        submitCommand += ["-S", self.SourceToken]
    if stageFiles:
        submitCommand += ["--copy-pin-lifetime", "86400"]

    submit = executeGridCommand("", submitCommand)
    os.remove(fileName)
    if not submit["OK"]:
        return submit
    returnCode, output, errStr = submit["Value"]
    if returnCode != 0:
        return S_ERROR(errStr)
    self.FTSGUID = output.replace("\n", "")
    self.Status = "Submitted"
    for ftsFile in self:
        ftsFile.FTSGUID = self.FTSGUID
        ftsFile.Status = "Submitted"
    return S_OK()
def getPilotLoggingInfo( proxy, grid, pilotRef ):
  """ Get LoggingInfo of a GRID job
  """
  if grid == 'LCG':
    cmd = [ 'edg-job-get-logging-info', '-v', '2' ]
  elif grid == 'gLite':
    cmd = [ 'glite-wms-job-logging-info', '-v', '3' ]
  else:
    return S_ERROR( 'Unknown GRID %s' % grid )
  cmd.extend( ['--noint', pilotRef] )

  gridEnv = ''
  setup = gConfig.getValue( '/DIRAC/Setup', '' )
  if setup:
    instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
    if instance:
      gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

  ret = executeGridCommand( proxy, cmd, gridEnv )
  if not ret['OK']:
    return ret
  status, output, error = ret['Value']
  if status:
    return S_ERROR( error )
  return S_OK( output )
def getCEStatus( self ):
  """ Method to return information on running and pending jobs.
  """
  cmd = ['arcstat', '-c', self.ceHost, '-j', self.ceParameters['JobListFile'] ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  resultDict = {}
  if not result['OK']:
    return result
  if result['Value'][0]:
    if result['Value'][2]:
      return S_ERROR( result['Value'][2] )
    else:
      return S_ERROR( 'Error while interrogating CE status' )
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )

  running = 0
  waiting = 0
  for ref in resultDict:
    status = resultDict[ref]
    if status == 'Scheduled':
      waiting += 1
    if status == 'Running':
      running += 1

  result = S_OK()
  result['RunningJobs'] = running
  result['WaitingJobs'] = waiting
  result['SubmittedJobs'] = 0
  return result
def __submitFTSTransfer( self ):
  """ create and execute glite-transfer-submit CLI command

  :param self: self reference
  """
  comm = [ 'glite-transfer-submit', '-s', self.ftsServer, '-f', self.surlFile, '-o' ]
  if self.targetToken:
    comm += [ '-t', self.targetToken ]
  if self.sourceToken:
    comm += [ '-S', self.sourceToken ]
  if self.__cksmTest:
    comm.append( "--compare-checksums" )
  gLogger.verbose( 'Executing %s' % ' '.join( comm ) )
  res = executeGridCommand( '', comm )
  os.remove( self.surlFile )
  if not res['OK']:
    return res
  returnCode, output, errStr = res['Value']
  # Returns a non zero status if error
  if not returnCode == 0:
    return S_ERROR( errStr )
  guid = output.replace( '\n', '' )
  if not checkGuid( guid ):
    return S_ERROR( 'Wrong GUID format returned' )
  self.ftsGUID = guid
  # if self.priority != 3:
  #   comm = ['glite-transfer-setpriority','-s', self.ftsServer,self.ftsGUID,str(self.priority)]
  #   executeGridCommand('',comm)
  return res
def __getFullOutput( self ):
  comm = ['glite-transfer-status', '-s', self.ftsServer, '-l', self.ftsGUID]
  res = executeGridCommand( '', comm, self.gridEnv )
  if not res['OK']:
    return res
  returnCode, output, errStr = res['Value']
  # Returns a non zero status if error
  if not returnCode == 0:
    return S_ERROR( errStr )
  statusExp = re.compile( "^(\S+)" )
  self.requestStatus = re.search( statusExp, output ).group( 1 )
  output = output.replace( "%s\n" % self.requestStatus, "", 1 )
  toRemove = ["'", "<", ">"]
  for char in toRemove:
    output = output.replace( char, '' )
  regExp = re.compile( "[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Retries:[ ]+(\d+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)", re.S )
  fileInfo = re.findall( regExp, output )
  for source, target, status, retries, reason, duration in fileInfo:
    lfn = ''
    for candidate in sortList( self.fileDict.keys() ):
      if re.search( candidate, source ):
        lfn = candidate
    if not lfn:
      continue
    self.__setFileParameter( lfn, 'Source', source )
    self.__setFileParameter( lfn, 'Target', target )
    self.__setFileParameter( lfn, 'Status', status )
    if reason == '(null)':
      reason = ''
    self.__setFileParameter( lfn, 'Reason', reason.replace( "\n", " " ) )
    self.__setFileParameter( lfn, 'Duration', int( duration ) )
  return S_OK()
def getCEStatus( self, jobIDList = None ):
  """ Method to return information on running and pending jobs.

  :param list jobIDList: list of job IDs to be considered
  """
  statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
  cmd = ['glite-ce-job-status', '-n', '-a', '-e',
         '%s' % self.ceName, '-s', '%s' % ':'.join( statusList ) ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  resultDict = {}
  if not result['OK']:
    return result
  if result['Value'][0]:
    if result['Value'][0] == 11:
      return S_ERROR( 'Segmentation fault while calling glite-ce-job-status' )
    elif result['Value'][2]:
      return S_ERROR( result['Value'][2] )
    elif "Authorization error" in result['Value'][1]:
      return S_ERROR( "Authorization error" )
    elif "FaultString" in result['Value'][1]:
      res = re.search( 'FaultString=\[([\w\s]+)\]', result['Value'][1] )
      fault = ''
      if res:
        fault = res.group( 1 )
      detail = ''
      res = re.search( 'FaultDetail=\[([\w\s]+)\]', result['Value'][1] )
      if res:
        detail = res.group( 1 )
      return S_ERROR( "Error: %s:%s" % ( fault, detail ) )
    else:
      return S_ERROR( 'Error while interrogating CE status' )
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )

  running = 0
  waiting = 0
  statusDict = {}
  for ref, status in resultDict.items():
    if jobIDList is not None and not ref in jobIDList:
      continue
    if status == 'Scheduled':
      waiting += 1
    if status == 'Running':
      running += 1
    statusDict[ref] = status

  result = S_OK()
  result['RunningJobs'] = running
  result['WaitingJobs'] = waiting
  result['SubmittedJobs'] = 0
  result['JobStatusDict'] = statusDict
  return result
def __parseOutput( self, full = False ):
  """ execute glite-transfer-status command and parse its output

  :param self: self reference
  :param bool full: glite-transfer-status verbosity level, when set, collect information of files as well
  """
  if full:
    res = self.__isMonitorValid()
  else:
    res = self.__isSummaryValid()
  if not res['OK']:
    return res
  comm = [ 'glite-transfer-status', '--verbose', '-s', self.ftsServer, self.ftsGUID ]
  if full:
    comm.append( '-l' )
  res = executeGridCommand( '', comm )
  if not res['OK']:
    return res
  returnCode, output, errStr = res['Value']
  # Returns a non zero status if error
  if not returnCode == 0:
    return S_ERROR( errStr )
  toRemove = ["'", "<", ">"]
  for char in toRemove:
    output = output.replace( char, '' )
  regExp = re.compile( "Status:\s+(\S+)" )
  self.requestStatus = re.search( regExp, output ).group( 1 )
  regExp = re.compile( "Submit time:\s+(\S+ \S+)" )
  self.submitTime = re.search( regExp, output ).group( 1 )
  self.statusSummary = {}
  for state in self.fileStates:
    regExp = re.compile( "\s+%s:\s+(\d+)" % state )
    self.statusSummary[state] = int( re.search( regExp, output ).group( 1 ) )
  if not full:
    return S_OK()
  regExp = re.compile( "[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Retries:[ ]+(\d+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)", re.S )
  fileInfo = re.findall( regExp, output )
  for source, target, status, retries, reason, duration in fileInfo:
    lfn = ''
    for candidate in sorted( self.fileDict ):
      if re.search( candidate, source ):
        lfn = candidate
    if not lfn:
      continue
    self.__setFileParameter( lfn, 'Source', source )
    self.__setFileParameter( lfn, 'Target', target )
    self.__setFileParameter( lfn, 'Status', status )
    if reason == '(null)':
      reason = ''
    self.__setFileParameter( lfn, 'Reason', reason.replace( "\n", " " ) )
    self.__setFileParameter( lfn, 'Duration', int( duration ) )
  return S_OK()
def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :param list jobIDList: list of job IDs to be considered
    """
    statusList = ["REGISTERED", "PENDING", "IDLE", "RUNNING", "REALLY-RUNNING"]
    cmd = ["glite-ce-job-status", "-n", "-a", "-e",
           "%s" % self.ceName, "-s", "%s" % ":".join(statusList)]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    resultDict = {}
    if not result["OK"]:
        return result
    if result["Value"][0]:
        if result["Value"][0] == 11:
            return S_ERROR("Segmentation fault while calling glite-ce-job-status")
        elif result["Value"][2]:
            return S_ERROR(result["Value"][2])
        elif "Authorization error" in result["Value"][1]:
            return S_ERROR("Authorization error")
        elif "FaultString" in result["Value"][1]:
            res = re.search("FaultString=\[([\w\s]+)\]", result["Value"][1])
            fault = ""
            if res:
                fault = res.group(1)
            detail = ""
            res = re.search("FaultDetail=\[([\w\s]+)\]", result["Value"][1])
            if res:
                detail = res.group(1)
            return S_ERROR("Error: %s:%s" % (fault, detail))
        else:
            return S_ERROR("Error while interrogating CE status")
    if result["Value"][1]:
        resultDict = self.__parseJobStatus(result["Value"][1])

    running = 0
    waiting = 0
    statusDict = {}
    for ref, status in resultDict.items():
        if jobIDList is not None and not ref in jobIDList:
            continue
        if status == "Scheduled":
            waiting += 1
        if status == "Running":
            running += 1
        statusDict[ref] = status

    result = S_OK()
    result["RunningJobs"] = running
    result["WaitingJobs"] = waiting
    result["SubmittedJobs"] = 0
    result["JobStatusDict"] = statusDict
    return result
def getWMSPilotOutput( pilotRef ):
  """ Get Output of a GRID job
  """
  tmp_dir = mkdtemp()
  cmd = [ 'glite-wms-job-output', '--noint', '--dir', tmp_dir, pilotRef ]
  gridEnv = getGridEnv()
  ret = executeGridCommand( '', cmd, gridEnv )
  if not ret['OK']:
    shutil.rmtree( tmp_dir )
    return ret
  status, output, error = ret['Value']

  for errorString in [ 'already retrieved', 'Output not yet Ready', 'not yet ready',
                       'the status is ABORTED', 'No output files' ]:
    if errorString in error:
      shutil.rmtree( tmp_dir )
      return S_ERROR( error )
    if errorString in output:
      shutil.rmtree( tmp_dir )
      return S_ERROR( output )

  if status:
    shutil.rmtree( tmp_dir )
    return S_ERROR( error )

  # Get the list of files
  tmp_dir = os.path.join( tmp_dir, os.listdir( tmp_dir )[0] )
  result = S_OK()
  # outputSandboxFiles is expected to be defined at module level
  result['FileList'] = outputSandboxFiles
  for filename in outputSandboxFiles:
    tmpname = os.path.join( tmp_dir, filename )
    if os.path.exists( tmpname ):
      myfile = file( tmpname, 'r' )
      f = myfile.read()
      myfile.close()
    else:
      f = ''
    result[filename] = f
  shutil.rmtree( tmp_dir )
  return result
def killJob(self, jobIDList):
    """ Kill the specified jobs
    """
    jobList = list(jobIDList)
    if type(jobIDList) in StringTypes:
        jobList = [jobIDList]

    cmd = ["glite-ce-job-cancel", "-n", "-N"] + jobList
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    if not result["OK"]:
        return result
    if result["Value"][0] != 0:
        # result["Value"] is (status, stdout, stderr): report the command output
        errorString = "\n".join(result["Value"][1:]).strip()
        return S_ERROR("Failed kill job: %s" % errorString)

    return S_OK()
def killJob( self, jobIDList ):
  """ Kill the specified jobs
  """
  jobList = list( jobIDList )
  if type( jobIDList ) in StringTypes:
    jobList = [ jobIDList ]

  cmd = ['glite-ce-job-cancel', '-n', '-N'] + jobList
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  if not result['OK']:
    return result
  if result['Value'][0] != 0:
    # result['Value'] is (status, stdout, stderr): report the command output
    errorString = '\n'.join( result['Value'][1:] ).strip()
    return S_ERROR( 'Failed kill job: %s' % errorString )

  return S_OK()
def killJob( self, jobIDList ):
  """ Kill the specified jobs
  """
  jobList = list( jobIDList )
  if isinstance( jobIDList, basestring ):
    jobList = [ jobIDList ]

  for jobID in jobList:
    cmd = ['globus-job-clean', '-f', jobID]
    result = executeGridCommand( self.proxy, cmd, self.gridEnv )
    if not result['OK']:
      return result
    if result['Value'][0] != 0:
      return S_ERROR( 'Failed kill job: %s' % result['Value'][1].strip() )

  return S_OK()
def killJob(self, jobIDList):
    """ Kill the specified jobs
    """
    jobList = list(jobIDList)
    if isinstance(jobIDList, basestring):
        jobList = [jobIDList]

    cmd = ['glite-ce-job-cancel', '-n', '-N'] + jobList
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    if not result['OK']:
        return result
    if result['Value'][0] != 0:
        errorString = '\n'.join(result['Value'][1:]).strip()
        return S_ERROR('Failed kill job: %s' % errorString)

    return S_OK()
def killJob(self, jobIDList):
    """ Kill the specified jobs
        #FIXME: Needs to be tested
    """
    jobList = list(jobIDList)
    if isinstance(jobIDList, basestring):
        jobList = [jobIDList]

    for jobID in jobList:
        cmd = ['globus-job-clean', jobID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        if not result['OK']:
            return result
        if result['Value'][0] != 0:
            # result['Value'] is (status, stdout, stderr): report the command output
            return S_ERROR('Failed kill job: %s' % result['Value'][1].strip())

    return S_OK()
def killJob( self, jobIDList ):
  """ Kill the specified jobs
  """
  jobList = list( jobIDList )
  if isinstance( jobIDList, basestring ):
    jobList = [ jobIDList ]

  cmd = ['glite-ce-job-cancel', '-n', '-N'] + jobList
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  if not result['OK']:
    return result
  if result['Value'][0] != 0:
    errorString = '\n'.join( result['Value'][1:] ).strip()
    return S_ERROR( 'Failed kill job: %s' % errorString )

  return S_OK()
def killJob(self, jobIDList):
    """ Kill the specified jobs
    """
    jobList = list(jobIDList)
    if isinstance(jobIDList, six.string_types):
        jobList = [jobIDList]

    for jobID in jobList:
        cmd = ['globus-job-clean', '-f', jobID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        if not result['OK']:
            return result
        if result['Value'][0] != 0:
            return S_ERROR('Failed kill job: %s' % result['Value'][1].strip())

    return S_OK()
def getJobOutput(self, jobID, _localDir=None):
    """ TODO: condor can copy the output automatically back to the submission,
        so we just need to pick it up from the proper folder
    """
    self.log.verbose("Getting job output for jobID: %s " % jobID)
    _job, condorID = condorIDFromJobRef(jobID)
    # FIXME: the WMSAdministrator does not know about the
    # SiteDirector WorkingDirectory, it might not even run on the
    # same machine
    # workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY )

    if not self.useLocalSchedd:
        cmd = ['condor_transfer_data', '-pool', '%s:9619' % self.ceName, '-name', self.ceName, condorID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        self.log.verbose(result)
        if not result['OK']:
            self.log.error("Failed to get job output from htcondor", result['Message'])
            return result

    output = ''
    error = ''
    resOut = findFile(self.workingDirectory, '%s.out' % condorID)
    if not resOut['OK']:
        self.log.error("Failed to find output file for condor job", jobID)
        return resOut
    outputfilename = resOut['Value'][0]

    resErr = findFile(self.workingDirectory, '%s.err' % condorID)
    if not resErr['OK']:
        self.log.error("Failed to find error file for condor job", jobID)
        return resErr
    errorfilename = resErr['Value'][0]

    try:
        with open(outputfilename) as outputfile:
            output = outputfile.read()
    except IOError as e:
        self.log.error("Failed to open outputfile", str(e))
        return S_ERROR("Failed to get pilot output")
    try:
        with open(errorfilename) as errorfile:
            error = errorfile.read()
    except IOError as e:
        self.log.error("Failed to open errorfile", str(e))
        return S_ERROR("Failed to get pilot error")

    return S_OK((output, error))
def getJobStatus( self, jobIDList ):
  """ Get the status information for the given list of jobs
  """
  workingDirectory = self.ceParameters['WorkingDirectory']
  fd, name = tempfile.mkstemp( suffix = '.list', prefix = 'StatJobs_', dir = workingDirectory )
  jobListFile = os.fdopen( fd, 'w' )

  jobTmpList = list( jobIDList )
  if type( jobIDList ) in StringTypes:
    jobTmpList = [ jobIDList ]

  jobList = []
  for j in jobTmpList:
    if ":::" in j:
      job = j.split( ":::" )[0]
    else:
      job = j
    jobList.append( job )
    jobListFile.write( job + '\n' )

  cmd = ['arcstat', '-c', self.ceHost, '-i', name, '-j', self.ceParameters['JobListFile']]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  os.unlink( name )
  resultDict = {}
  if not result['OK']:
    self.log.error( 'Failed to get job status', result['Message'] )
    return result
  if result['Value'][0]:
    if result['Value'][2]:
      return S_ERROR( result['Value'][2] )
    else:
      return S_ERROR( 'Error while interrogating job statuses' )
  if result['Value'][1]:
    resultDict = self.__parseJobStatus( result['Value'][1] )
  if not resultDict:
    return S_ERROR( 'No job statuses returned' )

  # If CE does not know about a job, set the status to Unknown
  for job in jobList:
    if not resultDict.has_key( job ):
      resultDict[job] = 'Unknown'
  return S_OK( resultDict )
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        # combine the mode bits with "|", not "+"
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    batchIDList = []
    stampDict = {}

    i = 0
    while i < numberOfJobs:
        i += 1
        xrslName, diracStamp = self.__writeXRSL(executableFile)
        cmd = ['arcsub', '-j', self.ceParameters['JobListFile'],
               '-c', '%s' % self.ceHost, '%s' % xrslName]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        os.unlink(xrslName)
        if not result['OK']:
            break
        if result['Value'][0] != 0:
            break
        pilotJobReference = result['Value'][1].strip()
        if pilotJobReference and pilotJobReference.startswith('Job submitted with jobid:'):
            pilotJobReference = pilotJobReference.replace('Job submitted with jobid:', '').strip()
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
        else:
            break
    # os.unlink( executableFile )

    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the ARC job submission')
    return result
def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
    """ Parse Job Submit stdout to return pilot reference
    """
    start = time.time()
    self.log.verbose('Executing Job Submit for TaskQueue', taskQueueID)

    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret['OK']:
        self.log.error('Failed to execute Job Submit:', ret['Message'])
        self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
        return False
    if ret['Value'][0] != 0:
        self.log.error('Error executing Job Submit:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
        return False
    self.log.info('Job Submit Execution Time: %.2f for TaskQueue %d' %
                  ((time.time() - start), taskQueueID))

    stdout = ret['Value'][1]
    stderr = ret['Value'][2]

    submittedPilot = None
    failed = 1
    rb = ''
    for line in List.fromChar(stdout, '\n'):
        m = re.search("(https:\S+)", line)
        if (m):
            glite_id = m.group(1)
            submittedPilot = glite_id
            if not rb:
                m = re.search("https://(.+):.+", glite_id)
                rb = m.group(1)
            failed = 0
    if failed:
        self.log.error('Job Submit returns no Reference:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        return False

    self.log.info('Reference %s for TaskQueue %s' % (glite_id, taskQueueID))

    return glite_id, rb
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    workingDirectory = self.ceParameters["WorkingDirectory"]
    fd, name = tempfile.mkstemp(suffix=".list", prefix="StatJobs_", dir=workingDirectory)
    jobListFile = os.fdopen(fd, "w")

    jobTmpList = list(jobIDList)
    if type(jobIDList) in StringTypes:
        jobTmpList = [jobIDList]

    jobList = []
    for j in jobTmpList:
        if ":::" in j:
            job = j.split(":::")[0]
        else:
            job = j
        jobList.append(job)
        jobListFile.write(job + "\n")

    cmd = ["arcstat", "-c", self.ceHost, "-i", name, "-j", self.ceParameters["JobListFile"]]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    os.unlink(name)
    resultDict = {}
    if not result["OK"]:
        self.log.error("Failed to get job status", result["Message"])
        return result
    if result["Value"][0]:
        if result["Value"][2]:
            return S_ERROR(result["Value"][2])
        else:
            return S_ERROR("Error while interrogating job statuses")
    if result["Value"][1]:
        resultDict = self.__parseJobStatus(result["Value"][1])
    if not resultDict:
        return S_ERROR("No job statuses returned")

    # If CE does not know about a job, set the status to Unknown
    for job in jobList:
        if not resultDict.has_key(job):
            resultDict[job] = "Unknown"
    return S_OK(resultDict)
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile, 0o755)

    subName = self.__writeSub(executableFile, numberOfJobs)

    jobStamps = []
    for _i in range(numberOfJobs):
        jobStamps.append(makeGuid()[:8])

    cmd = ['condor_submit', '-terse', subName]
    # the options for submit to remote are different than the other remoteScheddOptions
    scheddOptions = [] if self.useLocalSchedd else ['-pool', '%s:9619' % self.ceName, '-remote', self.ceName]
    for op in scheddOptions:
        cmd.insert(-1, op)

    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    self.log.verbose(result)
    os.unlink(subName)
    if not result['OK']:
        self.log.error("Failed to submit jobs to htcondor", result['Message'])
        return result

    if result['Value'][0]:
        # We have got a non-zero status code
        errorString = result['Value'][2] if result['Value'][2] else result['Value'][1]
        return S_ERROR('Pilot submission failed with error: %s ' % errorString.strip())

    pilotJobReferences = self.__getPilotReferences(result['Value'][1].strip())
    if not pilotJobReferences['OK']:
        return pilotJobReferences
    pilotJobReferences = pilotJobReferences['Value']

    self.log.verbose("JobStamps: %s " % jobStamps)
    self.log.verbose("pilotRefs: %s " % pilotJobReferences)

    result = S_OK(pilotJobReferences)
    result['PilotStampDict'] = dict(zip(pilotJobReferences, jobStamps))
    self.log.verbose("Result for submission: %s " % result)
    return result
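# --- Sketch of parsing `condor_submit -terse` output, which __getPilotReferences
# presumably consumes: -terse prints the allocated cluster/proc range on stdout,
# e.g. "1234.0 - 1234.4". The helper and the reference scheme below are hypothetical
# stand-ins for illustration, not the DIRAC implementation.

def parseTerseOutput(stdout, ceName):
    refs = []
    first, last = stdout.strip().split(' - ')
    cluster = first.split('.')[0]
    for proc in range(int(first.split('.')[1]), int(last.split('.')[1]) + 1):
        refs.append('htcondorce://%s/%s.%s' % (ceName, cluster, proc))
    return refs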
def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
    """ Parse List Match stdout to return list of matched CE's
    """
    self.log.verbose('Executing List Match for TaskQueue', taskQueueID)

    start = time.time()
    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret['OK']:
        self.log.error('Failed to execute List Match:', ret['Message'])
        self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
        return False
    if ret['Value'][0] != 0:
        self.log.error('Error executing List Match:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
        return False
    self.log.info('List Match Execution Time: %.2f for TaskQueue %d' %
                  ((time.time() - start), taskQueueID))

    stdout = ret['Value'][1]
    stderr = ret['Value'][2]
    availableCEs = []
    # Parse std.out
    for line in List.fromChar(stdout, '\n'):
        if re.search('/jobmanager-', line) or re.search('/cream-', line):
            # TODO: the line has to be stripped from extra info
            availableCEs.append(line)

    if not availableCEs:
        self.log.info('List-Match failed to find CEs for TaskQueue', taskQueueID)
        self.log.info(stdout)
        self.log.info(stderr)
    else:
        self.log.debug('List-Match returns:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        self.log.info('List-Match found %s CEs for TaskQueue' % len(availableCEs), taskQueueID)
        self.log.verbose(', '.join(availableCEs))

    return availableCEs
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile, 0755)

    batchIDList = []
    stampDict = {}
    for _i in xrange(numberOfJobs):
        _jdlName, diracStamp = self.__writeRSL(executableFile)
        queueName = '%s/%s' % (self.ceName, self.queue)
        cmd = ['globus-job-submit', queueName, "-s", executableFile]
        # cmd = ['globus-job-submit', '-r %s' % queueName, '-f %s' % jdlName ]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        self.log.verbose(result)
        # os.unlink( jdlName )
        if result['OK']:
            if result['Value'][0]:
                # We have got a non-zero status code
                errorString = result['Value'][2] if result['Value'][2] else result['Value'][1]
                return S_ERROR('Pilot submission failed with error: %s ' % errorString.strip())
            pilotJobReference = result['Value'][1].strip()
            if not pilotJobReference:
                return S_ERROR('No pilot reference returned from the globus job submission command')
            if not pilotJobReference.startswith('https'):
                return S_ERROR('Invalid pilot reference %s' % pilotJobReference)
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp

    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the globus job submission')
    return result
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile, 0755)

    subName = self.__writeSub(executableFile, numberOfJobs)

    jobStamps = []
    for _i in range(numberOfJobs):
        jobStamps.append(makeGuid()[:8])

    cmd = ['condor_submit', '-terse', subName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    self.log.verbose(result)
    os.unlink(subName)
    if not result['OK']:
        self.log.error("Failed to submit jobs to htcondor", result['Message'])
        return result

    if result['Value'][0]:
        # We have got a non-zero status code
        errorString = result['Value'][2] if result['Value'][2] else result['Value'][1]
        return S_ERROR('Pilot submission failed with error: %s ' % errorString.strip())

    pilotJobReferences = self.__getPilotReferences(result['Value'][1].strip())
    if not pilotJobReferences['OK']:
        return pilotJobReferences
    pilotJobReferences = pilotJobReferences['Value']

    self.log.verbose("JobStamps: %s " % jobStamps)
    self.log.verbose("pilotRefs: %s " % pilotJobReferences)

    result = S_OK(pilotJobReferences)
    result['PilotStampDict'] = dict(zip(pilotJobReferences, jobStamps))
    self.log.verbose("Result for submission: %s " % result)
    return result
def getJobLog(self, jobID):
    """Get pilot job logging info

    :param str jobID: pilot job identifier
    :return: string representing the logging info of a given pilot job
    """
    # pilotRef may integrate the pilot stamp
    # it has to be removed before being passed in parameter
    jobID = jobID.split(":::")[0]
    cmd = ["glite-ce-job-status", "-L", "2", "%s" % jobID]
    ret = executeGridCommand("", cmd, self.gridEnv)
    if not ret["OK"]:
        return ret
    status, output, error = ret["Value"]
    if status:
        return S_ERROR(error)
    return S_OK(output)
def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
    """ Parse Job Submit stdout to return pilot reference
    """
    start = time.time()
    self.log.verbose("Executing Job Submit for TaskQueue", taskQueueID)

    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret["OK"]:
        self.log.error("Failed to execute Job Submit:", ret["Message"])
        self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
        return False
    if ret["Value"][0] != 0:
        self.log.error("Error executing Job Submit:",
                       str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        self.__sendErrorMail(rb, "Job Submit", cmd, ret, proxy)
        return False
    self.log.info("Job Submit Execution Time: %.2f for TaskQueue %d" %
                  ((time.time() - start), taskQueueID))

    stdout = ret["Value"][1]
    stderr = ret["Value"][2]

    submittedPilot = None
    failed = 1
    rb = ""
    for line in List.fromChar(stdout, "\n"):
        m = re.search("(https:\S+)", line)
        if m:
            glite_id = m.group(1)
            submittedPilot = glite_id
            if not rb:
                m = re.search("https://(.+):.+", glite_id)
                rb = m.group(1)
            failed = 0
    if failed:
        self.log.error("Job Submit returns no Reference:",
                       str(ret["Value"][0]) + "\n".join(ret["Value"][1:3]))
        return False

    self.log.info("Reference %s for TaskQueue %s" % (glite_id, taskQueueID))

    return glite_id, rb
def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
  """ Parse Job Submit stdout to return pilot reference
  """
  start = time.time()
  self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

  ret = executeGridCommand( proxy, cmd, self.gridEnv )
  if not ret['OK']:
    self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
    self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
    return False
  if ret['Value'][0] != 0:
    self.log.error( 'Error executing Job Submit:',
                    str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
    self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
    return False
  self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' %
                 ( ( time.time() - start ), taskQueueID ) )

  stdout = ret['Value'][1]
  stderr = ret['Value'][2]

  submittedPilot = None
  failed = 1
  rb = ''
  for line in List.fromChar( stdout, '\n' ):
    m = re.search( "(https:\S+)", line )
    if ( m ):
      glite_id = m.group( 1 )
      submittedPilot = glite_id
      if not rb:
        m = re.search( "https://(.+):.+", glite_id )
        rb = m.group( 1 )
      failed = 0
  if failed:
    self.log.error( 'Job Submit returns no Reference:',
                    str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
    return False

  self.log.info( 'Reference %s for TaskQueue %s' % ( glite_id, taskQueueID ) )

  return glite_id, rb
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    workingDirectory = self.ceParameters['WorkingDirectory']
    fd, idFileName = tempfile.mkstemp(suffix='.ids', prefix='CREAM_', dir=workingDirectory)
    idFile = os.fdopen(fd, 'w')
    idFile.write('##CREAMJOBS##')
    for id_ in jobIDList:
        if ":::" in id_:
            ref, stamp = id_.split(':::')
        else:
            ref = id_
        idFile.write('\n' + ref)
    idFile.close()

    cmd = ['glite-ce-job-status', '-n', '-i', '%s' % idFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    os.unlink(idFileName)
    resultDict = {}
    if not result['OK']:
        self.log.error('Failed to get job status', result['Message'])
        return result
    if result['Value'][0]:
        if result['Value'][2]:
            return S_ERROR(result['Value'][2])
        else:
            return S_ERROR('Error while interrogating job statuses')
    if result['Value'][1]:
        resultDict = self.__parseJobStatus(result['Value'][1])
    if not resultDict:
        return S_ERROR('No job statuses returned')

    # If CE does not know about a job, set the status to Unknown
    for job in jobIDList:
        if not resultDict.has_key(job):
            resultDict[job] = 'Unknown'
    return S_OK(resultDict)
def getJobOutput(self, jobID, localDir=None):
    """ Get the specified job standard output and error files. If the localDir is provided,
        the output is returned as file in this directory. Otherwise, the output is returned
        as strings.
    """
    if jobID.find(':::') != -1:
        pilotRef, stamp = jobID.split(':::')
    else:
        pilotRef = jobID
        stamp = ''
    if not stamp:
        return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

    arcID = os.path.basename(pilotRef)
    if "WorkingDirectory" in self.ceParameters:
        workingDirectory = os.path.join(self.ceParameters['WorkingDirectory'], arcID)
    else:
        workingDirectory = arcID
    outFileName = os.path.join(workingDirectory, '%s.out' % stamp)
    errFileName = os.path.join(workingDirectory, '%s.err' % stamp)

    cmd = ['arcget', '-j', self.ceParameters['JobListFile'], pilotRef]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    output = ''
    if result['OK']:
        if not result['Value'][0]:
            outFile = open(outFileName, 'r')
            output = outFile.read()
            outFile.close()
            os.unlink(outFileName)
            errFile = open(errFileName, 'r')
            error = errFile.read()
            errFile.close()
            os.unlink(errFileName)
        else:
            error = '\n'.join(result['Value'][1:])
            return S_ERROR(error)
    else:
        return S_ERROR('Failed to retrieve output for %s' % jobID)

    return S_OK((output, error))
def __resolveOutputURL(self, pilotRef):
    """Resolve the URL of the pilot output files"""
    # NOTE: the trailing "| grep -i osb" element is passed as a single argument,
    # not interpreted by a shell; the OSB filtering actually happens in the
    # Python loop below.
    cmd = ["glite-ce-job-status", "-L", "2", "%s" % pilotRef, "| grep -i osb"]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    url = ""
    if result["OK"]:
        if not result["Value"][0]:
            output = result["Value"][1]
            for line in output.split("\n"):
                line = line.strip()
                if line.find("OSB") != -1:
                    match = re.search(r"\[(.*)\]", line)
                    if match:
                        url = match.group(1)
        if url:
            return S_OK(url)
        return S_ERROR("output URL not found for %s" % pilotRef)
    else:
        return S_ERROR("Failed to retrieve long status for %s" % pilotRef)
def getJobOutput( self, jobID, localDir = None ):
  """ Get the specified job standard output and error files. If the localDir is provided,
      the output is returned as file in this directory. Otherwise, the output is returned
      as strings.
  """
  if jobID.find( ':::' ) != -1:
    pilotRef, stamp = jobID.split( ':::' )
  else:
    pilotRef = jobID
    stamp = ''
  if not stamp:
    return S_ERROR( 'Pilot stamp not defined for %s' % pilotRef )

  arcID = os.path.basename( pilotRef )
  if "WorkingDirectory" in self.ceParameters:
    workingDirectory = os.path.join( self.ceParameters['WorkingDirectory'], arcID )
  else:
    workingDirectory = arcID
  outFileName = os.path.join( workingDirectory, '%s.out' % stamp )
  errFileName = os.path.join( workingDirectory, '%s.err' % stamp )

  cmd = ['arcget', '-j', self.ceParameters['JobListFile'], pilotRef ]
  result = executeGridCommand( self.proxy, cmd, self.gridEnv )
  output = ''
  if result['OK']:
    if not result['Value'][0]:
      outFile = open( outFileName, 'r' )
      output = outFile.read()
      outFile.close()
      os.unlink( outFileName )
      errFile = open( errFileName, 'r' )
      error = errFile.read()
      errFile.close()
      os.unlink( errFileName )
    else:
      error = '\n'.join( result['Value'][1:] )
      return S_ERROR( error )
  else:
    return S_ERROR( 'Failed to retrieve output for %s' % jobID )

  return S_OK( ( output, error ) )
def getCEStatus(self):
    """ Method to return information on running and pending jobs.
    """
    cmd = ['arcstat', '-c', self.ceHost, '-j', self.ceParameters['JobListFile']]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    resultDict = {}
    if not result['OK']:
        return result
    if result['Value'][0] == 1 and result['Value'][1] == "No jobs\n":
        result = S_OK()
        result['RunningJobs'] = 0
        result['WaitingJobs'] = 0
        result['SubmittedJobs'] = 0
        return result
    if result['Value'][0]:
        if result['Value'][2]:
            return S_ERROR(result['Value'][2])
        else:
            return S_ERROR('Error while interrogating CE status')
    if result['Value'][1]:
        resultDict = self.__parseJobStatus(result['Value'][1])

    running = 0
    waiting = 0
    for ref in resultDict:
        status = resultDict[ref]
        if status == 'Scheduled':
            waiting += 1
        if status == 'Running':
            running += 1

    result = S_OK()
    result['RunningJobs'] = running
    result['WaitingJobs'] = waiting
    result['SubmittedJobs'] = 0
    return result
def submitFTS2(self, command='glite-transfer-submit', pinTime=False):
    """ submit fts job using FTS2 client """
    if self.FTSGUID:
        return S_ERROR("FTSJob has already been submitted")
    surls = self._surlPairs()
    if not surls:
        return S_ERROR("No files to submit")
    fd, fileName = tempfile.mkstemp()
    surlFile = os.fdopen(fd, 'w')
    surlFile.write(surls)
    surlFile.close()
    submitCommand = command.split() + \
        ["-s", self.FTSServer,
         "-f", fileName,
         "-o",
         "-K"]
    if self.TargetToken:
        submitCommand += ["-t", self.TargetToken]
    if self.SourceToken:
        submitCommand += ["-S", self.SourceToken]
    if pinTime:
        submitCommand += ["--copy-pin-lifetime", "%d" % pinTime, "--bring-online", '86400']

    submit = executeGridCommand("", submitCommand)
    os.remove(fileName)
    if not submit["OK"]:
        return submit
    returnCode, output, errStr = submit["Value"]
    if returnCode != 0:
        return S_ERROR(errStr if errStr else output)
    self.FTSGUID = output.replace("\n", "")
    self.Status = "Submitted"
    for ftsFile in self:
        ftsFile.FTSGUID = self.FTSGUID
        ftsFile.Status = "Submitted"
    return S_OK()
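# --- The "-f" file handed to glite-transfer-submit holds one transfer per line,
# "sourceSURL targetSURL" (optionally followed by a checksum when checksum
# comparison is requested). _surlPairs is assumed to serialize the job's files
# into that shape; an illustrative (hypothetical) pair:
#
#   srm://source.example.org/path/file.dat srm://target.example.org/path/file.dat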
def getPilotLoggingInfo(grid, pilotRef):
    """ Get LoggingInfo of a GRID job
    """
    if grid == 'gLite':
        cmd = ['glite-wms-job-logging-info', '-v', '3', '--noint', pilotRef]
    elif grid == 'CREAM':
        cmd = ['glite-ce-job-status', '-L', '2', '%s' % pilotRef]
    else:
        return S_ERROR('Pilot logging not available for %s CEs' % grid)

    gridEnv = getGridEnv()
    ret = executeGridCommand('', cmd, gridEnv)
    if not ret['OK']:
        return ret
    status, output, error = ret['Value']
    if status:
        return S_ERROR(error)
    return S_OK(output)
def _getChildrenReferences(self, proxy, parentReference, taskQueueID):
    """ Get reference for all Children
    """
    cmd = ['glite-wms-job-status', parentReference]

    start = time.time()
    self.log.verbose('Executing Job Status for TaskQueue', taskQueueID)

    ret = executeGridCommand(proxy, cmd, self.gridEnv)
    if not ret['OK']:
        self.log.error('Failed to execute Job Status', ret['Message'])
        return []
    if ret['Value'][0] != 0:
        self.log.error('Error executing Job Status:',
                       str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
        return []
    self.log.info('Job Status Execution Time: %.2f' % (time.time() - start))

    stdout = ret['Value'][1]
    # stderr = ret['Value'][2]

    references = []
    failed = 1
    for line in List.fromChar(stdout, '\n'):
        match = re.search("Status info for the Job : (https:\S+)", line)
        if (match):
            glite_id = match.group(1)
            if glite_id not in references and glite_id != parentReference:
                references.append(glite_id)
            failed = 0
    if failed:
        error = str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3])
        self.log.error('Job Status returns no Child Reference:', error)
        return [parentReference]

    return references
def __resolveOutputURL(self, pilotRef):
    """ Resolve the URL of the pilot output files """
    # NOTE: the trailing "| grep -i osb" element is passed as a single argument,
    # not interpreted by a shell; the OSB filtering actually happens in the
    # Python loop below.
    cmd = ['glite-ce-job-status', '-L', '2', '%s' % pilotRef, '| grep -i osb']
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    url = ''
    if result['OK']:
        if not result['Value'][0]:
            output = result['Value'][1]
            for line in output.split('\n'):
                line = line.strip()
                if line.find('OSB') != -1:
                    match = re.search(r'\[(.*)\]', line)
                    if match:
                        url = match.group(1)
        if url:
            return S_OK(url)
        return S_ERROR('output URL not found for %s' % pilotRef)
    else:
        return S_ERROR('Failed to retrieve long status for %s' % pilotRef)
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    resultDict = {}
    self.log.verbose("JobIDList: %s" % jobIDList)
    for jobInfo in jobIDList:
        jobID = jobInfo.split(":::")[0]
        # jobRef = jobInfo.split(":::")[1]
        cmd = ['globus-job-status', jobID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        self.log.info("Result from globus-job-status %s " % str(result))
        if not result['OK']:
            self.log.error('Failed to get job status for jobID', jobID)
            continue
        if result['Value'][0]:
            if result['Value'][2]:
                return S_ERROR(result['Value'][2])
            else:
                return S_ERROR('Error while interrogating job statuses')
        if result['Value'][1]:
            resultDict[jobID] = self.__parseJobStatus(result['Value'][1])

    if not resultDict:
        return S_ERROR('No job statuses returned')

    # If CE does not know about a job, set the status to Unknown
    for jobInfo in jobIDList:
        jobID = jobInfo.split(":::")[0]
        if jobID not in resultDict:
            resultDict[jobInfo] = 'Unknown'
    return S_OK(resultDict)
def getCEStatus(self):
    """ Method to return information on running and pending jobs.
    """
    statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
    cmd = ['glite-ce-job-status', '-n', '-a', '-e',
           '%s' % self.ceName, '-s', '%s' % ':'.join(statusList)]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    resultDict = {}
    if not result['OK']:
        return result
    if result['Value'][0]:
        if result['Value'][2]:
            return S_ERROR(result['Value'][2])
        else:
            return S_ERROR('Error while interrogating CE status')
    if result['Value'][1]:
        resultDict = self.__parseJobStatus(result['Value'][1])

    running = 0
    waiting = 0
    for ref, status in resultDict.items():
        if status == 'Scheduled':
            waiting += 1
        if status == 'Running':
            running += 1

    result = S_OK()
    result['RunningJobs'] = running
    result['WaitingJobs'] = waiting
    result['SubmittedJobs'] = 0
    return result
def __getFullOutput(self):
    comm = ['glite-transfer-status', '-s', self.ftsServer, '-l', self.ftsGUID]
    res = executeGridCommand('', comm, self.gridEnv)
    if not res['OK']:
        return res
    returnCode, output, errStr = res['Value']
    # Returns a non zero status if error
    if not returnCode == 0:
        return S_ERROR(errStr)
    statusExp = re.compile("^(\S+)")
    self.requestStatus = re.search(statusExp, output).group(1)
    output = output.replace("%s\n" % self.requestStatus, "", 1)
    toRemove = ["'", "<", ">"]
    for char in toRemove:
        output = output.replace(char, '')
    regExp = re.compile(
        "[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Retries:[ ]+(\d+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)",
        re.S)
    fileInfo = re.findall(regExp, output)
    for source, target, status, retries, reason, duration in fileInfo:
        lfn = ''
        for candidate in sortList(self.fileDict.keys()):
            if re.search(candidate, source):
                lfn = candidate
        if not lfn:
            continue
        self.__setFileParameter(lfn, 'Source', source)
        self.__setFileParameter(lfn, 'Target', target)
        self.__setFileParameter(lfn, 'Status', status)
        if reason == '(null)':
            reason = ''
        self.__setFileParameter(lfn, 'Reason', reason.replace("\n", " "))
        self.__setFileParameter(lfn, 'Duration', int(duration))
    return S_OK()
def getJobOutput(self, jobID, localDir=None):
    """ Get the specified job standard output and error files. If the localDir is provided,
        the output is returned as file in this directory. Otherwise, the output is returned
        as strings.
    """
    if jobID.find(':::') != -1:
        pilotRef, stamp = jobID.split(':::')
    else:
        pilotRef = jobID
        stamp = ''
    if not stamp:
        return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

    outURL = self.ceParameters.get('OutputURL', 'gsiftp://localhost')
    if outURL == 'gsiftp://localhost':
        result = self.__resolveOutputURL(pilotRef)
        if not result['OK']:
            return result
        outURL = result['Value']

    outputURL = os.path.join(outURL, '%s.out' % stamp)
    errorURL = os.path.join(outURL, '%s.err' % stamp)
    workingDirectory = self.ceParameters['WorkingDirectory']
    outFileName = os.path.join(workingDirectory, os.path.basename(outputURL))
    errFileName = os.path.join(workingDirectory, os.path.basename(errorURL))

    cmd = ['globus-url-copy', '%s' % outputURL, 'file://%s' % outFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    output = ''
    if result['OK']:
        if not result['Value'][0]:
            outFile = open(outFileName, 'r')
            output = outFile.read()
            outFile.close()
            os.unlink(outFileName)
        elif result['Value'][0] == 1 and "No such file or directory" in result['Value'][2]:
            output = "Standard Output is not available on the CREAM service"
            if os.path.exists(outFileName):
                os.unlink(outFileName)
            return S_ERROR(output)
        else:
            error = '\n'.join(result['Value'][1:])
            return S_ERROR(error)
    else:
        return S_ERROR('Failed to retrieve output for %s' % jobID)

    cmd = ['globus-url-copy', '%s' % errorURL, '%s' % errFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    error = ''
    if result['OK']:
        if not result['Value'][0]:
            errFile = open(errFileName, 'r')
            error = errFile.read()
            errFile.close()
            os.unlink(errFileName)
        elif result['Value'][0] == 1 and "No such file or directory" in result['Value'][2]:
            error = "Standard Error is not available on the CREAM service"
            if os.path.exists(errFileName):
                os.unlink(errFileName)
            return S_ERROR(error)
    else:
        return S_ERROR('Failed to retrieve error for %s' % jobID)

    return S_OK((output, error))
def getJobStatus(self, jobIDList):
    """ Get the status information for the given list of jobs
    """
    if self.proxyRenewal % 60 == 0:
        self.proxyRenewal += 1
        statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
        cmd = ['glite-ce-job-status', '-L', '2', '--all', '-e',
               '%s' % self.ceName, '-s', '%s' % ':'.join(statusList)]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        if result['OK']:
            delegationIDs = []
            for line in result['Value'][1].split('\n'):
                if line.find('Deleg Proxy ID') != -1:
                    delegationID = line.split()[-1].replace('[', '').replace(']', '')
                    if delegationID not in delegationIDs:
                        delegationIDs.append(delegationID)
            if delegationIDs:
                # Renew proxies in batches to avoid timeouts
                chunkSize = 10
                for i in xrange(0, len(delegationIDs), chunkSize):
                    chunk = delegationIDs[i:i + chunkSize]
                    cmd = ['glite-ce-proxy-renew', '-e', self.ceName]
                    cmd.extend(chunk)
                    self.log.info('Refreshing proxy for:', ' '.join(chunk))
                    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
                    if result['OK']:
                        status, output, error = result['Value']
                        if status:
                            self.log.error("Failed to renew proxy delegation",
                                           'Output:\n' + output + '\nError:\n' + error)

    workingDirectory = self.ceParameters['WorkingDirectory']
    fd, idFileName = tempfile.mkstemp(suffix='.ids', prefix='CREAM_', dir=workingDirectory)
    idFile = os.fdopen(fd, 'w')
    idFile.write('##CREAMJOBS##')
    for id_ in jobIDList:
        if ":::" in id_:
            ref, _stamp = id_.split(':::')
        else:
            ref = id_
        idFile.write('\n' + ref)
    idFile.close()

    cmd = ['glite-ce-job-status', '-n', '-i', '%s' % idFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    os.unlink(idFileName)
    resultDict = {}
    if not result['OK']:
        self.log.error('Failed to get job status', result['Message'])
        return result
    if result['Value'][0]:
        if result['Value'][2]:
            return S_ERROR(result['Value'][2])
        return S_ERROR('Error while interrogating job statuses')
    if result['Value'][1]:
        resultDict = self.__parseJobStatus(result['Value'][1])
    if not resultDict:
        return S_ERROR('No job statuses returned')

    # If CE does not know about a job, set the status to Unknown
    for job in jobIDList:
        if job not in resultDict:
            resultDict[job] = 'Unknown'
    return S_OK(resultDict)
def getCEStatus(self, jobIDList=None):
    """ Method to return information on running and pending jobs.

    :param jobIDList: list of job IDs to be considered
    :type jobIDList: python:list
    """
    statusList = ['REGISTERED', 'PENDING', 'IDLE', 'RUNNING', 'REALLY-RUNNING']
    cmd = ['glite-ce-job-status', '-n', '-a', '-e', '%s' % self.ceName,
           '-s', '%s' % ':'.join(statusList)]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    resultDict = {}
    if not result['OK']:
        return result
    if result['Value'][0]:
        if result['Value'][0] == 11:
            return S_ERROR('Segmentation fault while calling glite-ce-job-status')
        elif result['Value'][2]:
            return S_ERROR(result['Value'][2])
        elif "Authorization error" in result['Value'][1]:
            return S_ERROR("Authorization error")
        elif "FaultString" in result['Value'][1]:
            res = re.search(r'FaultString=\[([\w\s]+)\]', result['Value'][1])
            fault = ''
            if res:
                fault = res.group(1)
            detail = ''
            res = re.search(r'FaultDetail=\[([\w\s]+)\]', result['Value'][1])
            if res:
                detail = res.group(1)
            return S_ERROR("Error: %s:%s" % (fault, detail))
        else:
            return S_ERROR('Error while interrogating CE status')
    if result['Value'][1]:
        resultDict = self.__parseJobStatus(result['Value'][1])

    running = 0
    waiting = 0
    statusDict = {}
    for ref, status in resultDict.iteritems():
        if jobIDList is not None and ref not in jobIDList:
            continue
        if status == 'Scheduled':
            waiting += 1
        if status == 'Running':
            running += 1
        statusDict[ref] = status

    result = S_OK()
    result['RunningJobs'] = running
    result['WaitingJobs'] = waiting
    result['SubmittedJobs'] = 0
    result['JobStatusDict'] = statusDict
    return result
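# Example (illustrative): consuming the getCEStatus() result. Assumes `ce` is
# a configured instance of the computing element class above; the key names
# are the ones set by getCEStatus() itself.
def printCEOccupancy(ce):
    result = ce.getCEStatus()
    if not result['OK']:
        print('Failed to query CE: %s' % result['Message'])
        return
    print('Running: %s, Waiting: %s' % (result['RunningJobs'], result['WaitingJobs']))
    for ref, status in result['JobStatusDict'].items():
        print('%s -> %s' % (ref, status))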
def submitJob(self, executableFile, proxy, numberOfJobs=1, processors=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, os.R_OK | os.X_OK):
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    batchIDList = []
    stampDict = {}
    if numberOfJobs == 1:
        jdlName, diracStamp = self.__writeJDL(executableFile, processors=processors)
        cmd = ['glite-ce-job-submit', '-n', '-a', '-N', '-r',
               '%s/%s' % (self.ceName, self.queue), '%s' % jdlName]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        os.unlink(jdlName)
        if result['OK']:
            if result['Value'][0]:
                # We have got a non-zero status code
                errorString = '\n'.join(result['Value'][1:]).strip()
                return S_ERROR('Pilot submission failed with error: %s ' % errorString)
            pilotJobReference = result['Value'][1].strip()
            if not pilotJobReference:
                return S_ERROR('No pilot reference returned from the glite job submission command')
            if not pilotJobReference.startswith('https'):
                return S_ERROR('Invalid pilot reference %s' % pilotJobReference)
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
    else:
        delegationID = makeGuid()
        cmd = ['glite-ce-delegate-proxy', '-e', '%s' % self.ceName, '%s' % delegationID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        if not result['OK']:
            self.log.error('Failed to delegate proxy', result['Message'])
            return result
        for _i in range(numberOfJobs):
            jdlName, diracStamp = self.__writeJDL(executableFile, processors=processors)
            cmd = ['glite-ce-job-submit', '-n', '-N', '-r',
                   '%s/%s' % (self.ceName, self.queue), '-D', '%s' % delegationID, '%s' % jdlName]
            result = executeGridCommand(self.proxy, cmd, self.gridEnv)
            os.unlink(jdlName)
            if not result['OK']:
                self.log.error("General error in execution of glite-ce-job-submit command")
                break
            if result['Value'][0] != 0:
                self.log.error("Error in glite-ce-job-submit command",
                               result['Value'][1] + result['Value'][2])
                break
            pilotJobReference = result['Value'][1].strip()
            if pilotJobReference and pilotJobReference.startswith('https'):
                batchIDList.append(pilotJobReference)
                stampDict[pilotJobReference] = diracStamp
            else:
                break

    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the glite job submission')
    return result
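# Example (illustrative): a submit-then-track flow built on the two methods
# above. Assumes `ce` is a configured computing element instance and `proxy` a
# valid proxy; the 'reference:::stamp' identifiers match what getJobStatus()
# expects. The helper name is hypothetical.
def submitAndTrack(ce, executableFile, proxy, numberOfJobs=1):
    result = ce.submitJob(executableFile, proxy, numberOfJobs)
    if not result['OK']:
        return result
    stampDict = result.get('PilotStampDict', {})
    # Build 'reference:::stamp' identifiers as expected by getJobStatus()
    jobIDs = ['%s:::%s' % (ref, stampDict.get(ref, '')) for ref in result['Value']]
    return ce.getJobStatus(jobIDs)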
def getJobStatus(self, jobIDList): """Get the status information for the given list of jobs""" if self.proxyRenewal % 60 == 0: self.proxyRenewal += 1 statusList = ["REGISTERED", "PENDING", "IDLE", "RUNNING", "REALLY-RUNNING"] cmd = [ "glite-ce-job-status", "-L", "2", "--all", "-e", "%s" % self.ceName, "-s", "%s" % ":".join(statusList), ] result = executeGridCommand(self.proxy, cmd, self.gridEnv) if result["OK"]: delegationIDs = [] for line in result["Value"][1].split("\n"): if line.find("Deleg Proxy ID") != -1: delegationID = line.split()[-1].replace("[", "").replace("]", "") if delegationID not in delegationIDs: delegationIDs.append(delegationID) if delegationIDs: # Renew proxies in batches to avoid timeouts chunkSize = 10 for i in range(0, len(delegationIDs), chunkSize): chunk = delegationIDs[i : i + chunkSize] cmd = ["glite-ce-proxy-renew", "-e", self.ceName] cmd.extend(chunk) self.log.info("Refreshing proxy for:", " ".join(chunk)) result = executeGridCommand(self.proxy, cmd, self.gridEnv) if result["OK"]: status, output, error = result["Value"] if status: self.log.error( "Failed to renew proxy delegation", "Output:\n" + output + "\nError:\n" + error ) workingDirectory = self.ceParameters["WorkingDirectory"] fd, idFileName = tempfile.mkstemp(suffix=".ids", prefix="CREAM_", dir=workingDirectory) idFile = os.fdopen(fd, "w") idFile.write("##CREAMJOBS##") for id_ in jobIDList: if ":::" in id_: ref, _stamp = id_.split(":::") else: ref = id_ idFile.write("\n" + ref) idFile.close() cmd = ["glite-ce-job-status", "-n", "-i", "%s" % idFileName] result = executeGridCommand(self.proxy, cmd, self.gridEnv) os.unlink(idFileName) resultDict = {} if not result["OK"]: self.log.error("Failed to get job status", result["Message"]) return result if result["Value"][0]: if result["Value"][2]: return S_ERROR(result["Value"][2]) return S_ERROR("Error while interrogating job statuses") if result["Value"][1]: resultDict = self.__parseJobStatus(result["Value"][1]) if not resultDict: return S_ERROR("No job statuses returned") # If CE does not know about a job, set the status to Unknown for job in jobIDList: if job not in resultDict: resultDict[job] = PilotStatus.UNKNOWN return S_OK(resultDict)
def getWMSPilotOutput( proxy, grid, pilotRef ):
  """ Get Output of a GRID job
  """
  tmp_dir = mkdtemp()
  if grid == 'LCG':
    cmd = [ 'edg-job-get-output' ]
  elif grid == 'gLite':
    cmd = [ 'glite-wms-job-output' ]
  else:
    return S_ERROR( 'Unknown GRID %s' % grid )
  cmd.extend( ['--noint', '--dir', tmp_dir, pilotRef] )

  gridEnv = getGridEnv()

  ret = executeGridCommand( proxy, cmd, gridEnv )
  if not ret['OK']:
    shutil.rmtree( tmp_dir )
    return ret
  status, output, error = ret['Value']

  for errorString in [ 'already retrieved', 'Output not yet Ready', 'not yet ready',
                       'the status is ABORTED' ]:
    if error.find( errorString ) != -1:
      shutil.rmtree( tmp_dir )
      return S_ERROR( error )

  if status:
    shutil.rmtree( tmp_dir )
    return S_ERROR( error )

  # Get the list of files
  # LCG always creates an unique sub-directory
  # gLite does it too now
  result = executeGridCommand( proxy, ['glite-version'], gridEnv )
  if not result['OK']:
    shutil.rmtree( tmp_dir )
    return result
  status, output, error = result['Value']
  if output.find( '3.2' ) != -1:
    tmp_dir = os.path.join( tmp_dir, os.listdir( tmp_dir )[0] )

  result = S_OK()
  result['FileList'] = outputSandboxFiles  # module-level list of expected sandbox file names
  for filename in outputSandboxFiles:
    tmpname = os.path.join( tmp_dir, filename )
    if os.path.exists( tmpname ):
      myfile = open( tmpname, 'r' )
      f = myfile.read()
      myfile.close()
    else:
      f = ''
    # HACK: to be removed after the current scheme has been in production for at least 1 week
    if filename == 'std.out' and f:
      filename = 'StdOut'
    if filename == 'std.err' and f:
      filename = 'StdErr'
    result[filename] = f
  shutil.rmtree( tmp_dir )
  return result
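# Example (illustrative): reading the pilot output returned above. Assumes a
# valid proxy and pilot reference; 'StdOut' is one of the sandbox file names
# the function stores in its result dictionary. The helper name is hypothetical.
def printPilotStdOut( proxy, pilotRef ):
  result = getWMSPilotOutput( proxy, 'gLite', pilotRef )
  if not result['OK']:
    print( 'Failed to get pilot output: %s' % result['Message'] )
    return
  print( result.get( 'StdOut', '' ) )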
def monitorFTS2(self, full=False):
    """ monitor fts job """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    monitorCommand = ["glite-transfer-status", "--verbose", "-s", self.FTSServer, self.FTSGUID]
    if full:
        monitorCommand.append("-l")

    monitor = executeGridCommand("", monitorCommand)
    if not monitor["OK"]:
        return monitor
    returnCode, outputStr, errStr = monitor["Value"]

    # Returns a non zero status if error
    if returnCode != 0:
        return S_ERROR(errStr)

    outputStr = outputStr.replace("'", "").replace("<", "").replace(">", "")

    # set FTS job status
    regExp = re.compile(r"Status:\s+(\S+)")
    statusMatch = re.search(regExp, outputStr)
    if not statusMatch:
        return S_ERROR("Status not found in the monitoring output")
    self.Status = statusMatch.group(1)

    statusSummary = {}
    for state in FTSFile.ALL_STATES:
        regExp = re.compile(r"\s+%s:\s+(\d+)" % state)
        if regExp.search(outputStr):
            statusSummary[state] = int(re.search(regExp, outputStr).group(1))

    total = sum(statusSummary.values())
    completed = sum([statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES])
    # Guard against an empty summary to avoid a ZeroDivisionError
    self.Completeness = 100 * completed / total if total else 0

    if not full:
        return S_OK(statusSummary)

    regExp = re.compile(
        r"[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n"
        r"[ ]+Retries:[ ]+(\d+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)", re.S)
    fileInfo = re.findall(regExp, outputStr)
    for sourceURL, targetURL, fileStatus, _retries, reason, duration in fileInfo:
        candidateFile = None
        for ftsFile in self:
            if ftsFile.SourceSURL == sourceURL:
                candidateFile = ftsFile
                break
        if not candidateFile:
            continue
        candidateFile.Status = fileStatus
        candidateFile.Error = reason
        if candidateFile.Status == "Failed":
            for missingSource in self.missingSourceErrors:
                if missingSource.match(reason):
                    candidateFile.Error = "MissingSource"

    # register successful files
    if self.Status in FTSJob.FINALSTATES:
        return self.finalize()
    return S_OK()
def getJobOutput(self, jobID):
    """Get the specified job standard output and error files.
    The output is returned as strings.
    """
    if jobID.find(":::") != -1:
        pilotRef, stamp = jobID.split(":::")
    else:
        pilotRef = jobID
        stamp = ""
    if not stamp:
        return S_ERROR("Pilot stamp not defined for %s" % pilotRef)

    outURL = self.ceParameters.get("OutputURL", "gsiftp://localhost")
    if outURL == "gsiftp://localhost":
        # No explicit output URL configured, resolve it from the job itself
        result = self.__resolveOutputURL(pilotRef)
        if not result["OK"]:
            return result
        outURL = result["Value"]

    outputURL = os.path.join(outURL, "%s.out" % stamp)
    errorURL = os.path.join(outURL, "%s.err" % stamp)
    workingDirectory = self.ceParameters["WorkingDirectory"]
    outFileName = os.path.join(workingDirectory, os.path.basename(outputURL))
    errFileName = os.path.join(workingDirectory, os.path.basename(errorURL))

    cmd = ["globus-url-copy", "%s" % outputURL, "file://%s" % outFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    output = ""
    if result["OK"]:
        if not result["Value"][0]:
            outFile = open(outFileName, "r")
            output = outFile.read()
            outFile.close()
            os.unlink(outFileName)
        elif result["Value"][0] == 1 and "No such file or directory" in result["Value"][2]:
            output = "Standard Output is not available on the CREAM service"
            if os.path.exists(outFileName):
                os.unlink(outFileName)
            return S_ERROR(output)
        else:
            error = "\n".join(result["Value"][1:])
            return S_ERROR(error)
    else:
        return S_ERROR("Failed to retrieve output for %s" % jobID)

    # Use an explicit file:// destination, as for the standard output above
    cmd = ["globus-url-copy", "%s" % errorURL, "file://%s" % errFileName]
    result = executeGridCommand(self.proxy, cmd, self.gridEnv)
    error = ""
    if result["OK"]:
        if not result["Value"][0]:
            errFile = open(errFileName, "r")
            error = errFile.read()
            errFile.close()
            os.unlink(errFileName)
        elif result["Value"][0] == 1 and "No such file or directory" in result["Value"][2]:
            error = "Standard Error is not available on the CREAM service"
            if os.path.exists(errFileName):
                os.unlink(errFileName)
            return S_ERROR(error)
    else:
        return S_ERROR("Failed to retrieve error for %s" % jobID)

    return S_OK((output, error))
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, os.R_OK | os.X_OK):
        os.chmod(executableFile, 0o755)

    batchIDList = []
    stampDict = {}
    if numberOfJobs == 1:
        jdlName, diracStamp = self.__writeJDL(executableFile)
        cmd = ['glite-ce-job-submit', '-n', '-a', '-N', '-r',
               '%s/%s' % (self.ceName, self.queue), '%s' % jdlName]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        # Remove the temporary JDL in all cases, not only on success
        os.unlink(jdlName)
        if result['OK']:
            if result['Value'][0]:
                # We have got a non-zero status code
                return S_ERROR('Pilot submission failed with error: %s ' % result['Value'][2].strip())
            pilotJobReference = result['Value'][1].strip()
            if not pilotJobReference:
                return S_ERROR('No pilot reference returned from the glite job submission command')
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
    else:
        delegationID = makeGuid()
        cmd = ['glite-ce-delegate-proxy', '-e', '%s' % self.ceName, '%s' % delegationID]
        result = executeGridCommand(self.proxy, cmd, self.gridEnv)
        if not result['OK']:
            self.log.error('Failed to delegate proxy: %s' % result['Message'])
            return result
        for _i in range(numberOfJobs):
            jdlName, diracStamp = self.__writeJDL(executableFile)
            cmd = ['glite-ce-job-submit', '-n', '-N', '-r',
                   '%s/%s' % (self.ceName, self.queue), '-D', '%s' % delegationID, '%s' % jdlName]
            result = executeGridCommand(self.proxy, cmd, self.gridEnv)
            os.unlink(jdlName)
            if not result['OK']:
                break
            if result['Value'][0] != 0:
                break
            pilotJobReference = result['Value'][1].strip()
            if pilotJobReference:
                batchIDList.append(pilotJobReference)
                stampDict[pilotJobReference] = diracStamp
            else:
                break
    os.unlink(executableFile)
    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the glite job submission')
    return result
def monitorFTS2(self, command="glite-transfer-status", full=False):
    """ monitor fts job """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    monitorCommand = command.split() + ["--verbose", "-s", self.FTSServer, self.FTSGUID]
    if full:
        monitorCommand.append("-l")

    monitor = executeGridCommand("", monitorCommand)
    if not monitor["OK"]:
        return monitor
    returnCode, outputStr, errStr = monitor["Value"]

    # Returns a non zero status if error
    if returnCode != 0:
        if 'was not found' in outputStr and not errStr:
            errStr = 'Job was not found'
        return S_ERROR(errStr)

    outputStr = outputStr.replace("'", "").replace("<", "").replace(">", "")

    # set FTS job status; with FTS3 this can be uppercase
    regExp = re.compile(r"Status:\s+(\S+)")
    statusMatch = re.search(regExp, outputStr)
    if not statusMatch:
        return S_ERROR("Status not found in the monitoring output")
    self.Status = statusMatch.group(1)

    statusSummary = {}
    # This is capitalized, even in FTS3!
    for state in FTSFile.ALL_STATES:
        regExp = re.compile(r"\s+%s:\s+(\d+)" % state)
        if regExp.search(outputStr):
            statusSummary[state] = int(re.search(regExp, outputStr).group(1))

    total = sum(statusSummary.values())
    completed = sum([statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES])
    self.Completeness = 100 * completed / total if total else 0

    if not full:
        return S_OK(statusSummary)

    # The order of information is not the same for glite- and fts- !!!
    # In order: new fts-, old fts-, glite-
    iExptr = None
    for iExptr, exptr in enumerate(
            (r'[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)\n[ ]+Staging:[ ]+(\d+)\n[ ]+Retries:[ ]+(\d+)',
             r'[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)\n[ ]+Retries:[ ]+(\d+)',
             r'[ ]+Source:[ ]+(\S+)\n[ ]+Destination:[ ]+(\S+)\n[ ]+State:[ ]+(\S+)\n[ ]+Retries:[ ]+(\d+)\n[ ]+Reason:[ ]+([\S ]+).+?[ ]+Duration:[ ]+(\d+)')):
        regExp = re.compile(exptr, re.S)
        fileInfo = re.findall(regExp, outputStr)
        if fileInfo:
            break
    if not fileInfo:
        return S_ERROR("Error monitoring job (no regexp match)")
    for info in fileInfo:
        if iExptr == 0:
            # version >= 3.2.30: the groups come in Duration, Staging, Retries order
            sourceURL, targetURL, fileStatus, reason, duration, _staging, _retries = info
        elif iExptr == 1:
            # version FTS3 < 3.2.30
            sourceURL, targetURL, fileStatus, reason, duration, _retries = info
        elif iExptr == 2:
            # version FTS2
            sourceURL, targetURL, fileStatus, _retries, reason, duration = info
        else:
            return S_ERROR('Error monitoring job (implement match %d)' % iExptr)
        candidateFile = None
        for ftsFile in self:
            if ftsFile.SourceSURL == sourceURL:
                candidateFile = ftsFile
                break
        if not candidateFile:
            continue
        # Can be uppercase for FTS3
        if not candidateFile.TargetSURL:
            candidateFile.TargetSURL = targetURL
        candidateFile.Status = fileStatus
        candidateFile.Error = reason
        candidateFile._duration = duration
        if candidateFile.Status == "Failed":
            for missingSource in self.missingSourceErrors:
                if missingSource.match(reason):
                    candidateFile.Error = "MissingSource"
        # If the staging info was present, record it (6th group of the first pattern)
        if len(info) > 6:
            candidateFile._staging = info[5]

    # register successful files
    if self.Status in FTSJob.FINALSTATES:
        return self.finalize()
    return S_OK()
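# Example (illustrative): a simple polling loop around monitorFTS2(). Assumes
# `ftsJob` is an FTSJob-like object as above with FTSGUID and FTSServer set;
# the helper name and polling interval are arbitrary choices for the sketch.
import time

def waitForFTSJob(ftsJob, pollSeconds=30, maxPolls=120):
    """ Poll the FTS job until it reaches a final state or the poll budget runs out. """
    for _ in range(maxPolls):
        result = ftsJob.monitorFTS2()
        if not result['OK']:
            return result
        if ftsJob.Status in FTSJob.FINALSTATES:
            return S_OK(ftsJob.Status)
        time.sleep(pollSeconds)
    return S_ERROR('FTS job %s did not finish within the poll budget' % ftsJob.FTSGUID)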