def __init__(
    self,
    jobManagerClient=None,
    sbRPCClient=None,
    sbTransferClient=None,
    useCertificates=False,
    timeout=600,
    delegatedDN=None,
    delegatedGroup=None,
):
    """WMS Client constructor

    Here we also initialize the needed clients and connections
    """
    self.useCertificates = useCertificates
    self.delegatedDN = delegatedDN
    self.delegatedGroup = delegatedGroup
    self.timeout = timeout
    self._jobManager = jobManagerClient
    self.operationsHelper = Operations()
    # Build the sandbox client eagerly only when both transports were
    # supplied; otherwise leave it unset for lazy creation elsewhere.
    self.sandboxClient = (
        SandboxStoreClient(
            rpcClient=sbRPCClient,
            transferClient=sbTransferClient,
            useCertificates=useCertificates,
        )
        if sbRPCClient and sbTransferClient
        else None
    )
def getSandbox( self ):
    """Serve a job sandbox back to the browser as a tar attachment."""
    params = request.params
    if 'jobID' not in params:
        c.error = "Maybe you forgot the jobID ?"
        return render( "/error.mako" )
    jobID = int( params['jobID'] )
    # Default to the Output sandbox unless the caller asked otherwise
    sbType = str( params['sandbox'] ) if 'sandbox' in params else 'Output'
    sbClient = SandboxStoreClient( useCertificates=True,
                                   delegatedDN=str( credentials.getUserDN() ),
                                   delegatedGroup=str( credentials.getSelectedGroup() ),
                                   setup = credentials.getSelectedSetup() )
    result = sbClient.downloadSandboxForJob( jobID, sbType, inMemory=True )
    if not result['OK']:
        c.error = "Error: %s" % result['Message']
        return render( "/error.mako" )
    data = result['Value']
    fname = "%s_%sSandbox.tar" % ( str( jobID ), sbType )
    # Tar attachment headers, set in the original order
    for header, value in ( ( 'Content-type', 'application/x-tar' ),
                           ( 'Content-Disposition', 'attachment; filename="%s"' % fname ),
                           ( 'Content-Length', len( data ) ) ):
        response.headers[header] = value
    return data
def initializeOptimizer( self ):
    """Initialize specific parameters for JobSanityAgent."""
    # Test control flags; N.B. the JDL check is mandatory
    self.inputDataCheck = self.am_getOption( 'InputDataCheck', 1 )
    self.outputDataCheck = self.am_getOption( 'OutputDataCheck', 0 )
    self.inputSandboxCheck = self.am_getOption( 'InputSandboxCheck', 1 )
    self.platformCheck = self.am_getOption( 'PlatformCheck', 0 )
    # Other parameters
    self.successStatus = self.am_getOption( 'SuccessfulJobStatus', 'OutputReady' )
    self.maxDataPerJob = self.am_getOption( 'MaxInputDataPerJob', 100 )
    # Sandbox
    self.sandboxClient = SandboxStoreClient( useCertificates = True )
    self.log.debug( 'JDL Check ==> Enabled' )
    # Report each optional check's state; message text matches the original
    for label, enabled in ( ( 'Input Data Check', self.inputDataCheck ),
                            ( 'Output Data Check', self.outputDataCheck ),
                            ( 'Input Sbox Check', self.inputSandboxCheck ),
                            ( 'Platform Check', self.platformCheck ) ):
        self.log.debug( '%s ==> %s' % ( label, 'Enabled' if enabled else 'Disabled' ) )
    return S_OK()
def uploadSandbox( fileList ):
    """Copy the uploaded request files into a temporary directory and
    upload them as a new sandbox on behalf of the authenticated user.

    :param fileList: names of the uploaded files (keys into reqFiles)
    :return: S_OK/S_ERROR structure returned by uploadFilesAsSandbox
    """
    tmpDir = tempfile.mkdtemp( prefix = "upload.", dir = gWorkDir )
    # BUG FIX: the original rebound fileList to [] here, so the loop below
    # never iterated and nothing was ever copied or uploaded.
    absFileList = []
    for fileName in fileList:
        fDir = os.path.dirname( fileName )
        if fDir:
            fDir = os.path.join( tmpDir, fDir )
            if not os.path.isdir( fDir ):
                try:
                    os.makedirs( fDir )
                except OSError:  # narrowed from a bare except
                    gLogger.exception( "Could not create temporal dir %s" % fDir )
                    bottle.abort( 500 )
        absFile = os.path.join( tmpDir, fileName )
        absFileList.append( absFile )
        ofd = reqFiles[ fileName ]
        dfd = open( absFile, "w" )
        dBuf = ofd.read( 524288 )
        while dBuf:  # BUG FIX: was "while dBug" -> NameError on first chunk
            dfd.write( dBuf )
            dBuf = ofd.read( 524288 )
        dfd.close()
        ofd.close()  # BUG FIX: was dBuf.close() (a str/bytes has no close)
    sbClient = SandboxStoreClient( useCertificates = True,
                                   delegatedDN = gOAData.userDN,
                                   delegatedGroup = gOAData.userGroup )
    result = sbClient.uploadFilesAsSandbox( absFileList )
    shutil.rmtree( tmpDir )
    return result
def getSandbox(self):
    """Download a job sandbox and return it as a tar attachment."""
    if 'jobID' not in request.params:
        c.error = "Maybe you forgot the jobID ?"
        return render("/error.mako")
    jobID = int(request.params['jobID'])
    # Output sandbox by default, overridable via the 'sandbox' parameter
    sbType = str(request.params['sandbox']) if 'sandbox' in request.params else 'Output'
    client = SandboxStoreClient(
        useCertificates=True,
        delegatedDN=str(credentials.getUserDN()),
        delegatedGroup=str(credentials.getSelectedGroup()),
        setup=credentials.getSelectedSetup())
    result = client.downloadSandboxForJob(jobID, sbType, inMemory=True)
    if not result['OK']:
        c.error = "Error: %s" % result['Message']
        return render("/error.mako")
    data = result['Value']
    fname = "%s_%sSandbox.tar" % (str(jobID), sbType)
    # Attachment headers, applied in the same order as before
    for header, value in (('Content-type', 'application/x-tar'),
                          ('Content-Disposition', 'attachment; filename="%s"' % fname),
                          ('Content-Length', len(data))):
        response.headers[header] = value
    return data
def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
    """Validate and upload the job Input Sandbox.

    Entries that are not LFNs, SB: references or parametric placeholders
    are treated as local files, checked on disk, uploaded to the sandbox
    store, and the resulting SB: URL is appended back into the JDL.
    """
    inputSandbox = self.__getInputSandboxEntries(classAdJob)
    # Everything not tagged as LFN/SB/parametric is a plain local file
    skipPrefixes = ("lfn:", "LFN:", "SB:", "%s", "%(")
    realFiles = [entry for entry in inputSandbox if not entry.startswith(skipPrefixes)]

    stringIOFiles = []
    stringIOFilesSize = 0
    if jobDescriptionObject is not None:
        if not isinstance(jobDescriptionObject, StringIO):
            return S_ERROR(EWMSJDL, "jobDescriptionObject is not a StringIO object")
        stringIOFiles = [jobDescriptionObject]
        stringIOFilesSize = len(jobDescriptionObject.getvalue())
        gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))

    badFiles = []
    diskFiles = []
    for entry in realFiles:
        # Real files are expected to be present on local disk at submission time
        if os.path.exists(entry):
            diskFiles.append(entry)
        else:
            badFiles.append(entry)
            gLogger.warn("inputSandbox file/directory " + entry + " not found. Keep looking for the others")

    diskFilesSize = File.getGlobbedTotalSize(diskFiles)
    gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
    totalSize = diskFilesSize + stringIOFilesSize
    gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

    okFiles = stringIOFiles + diskFiles
    if badFiles:
        result = S_ERROR(EWMSJDL, "Input Sandbox is not valid")
        result["BadFile"] = badFiles
        result["TotalSize"] = totalSize
        return result

    if okFiles:
        if not self.sandboxClient:
            self.sandboxClient = SandboxStoreClient(
                useCertificates=self.useCertificates,
                delegatedDN=self.delegatedDN,
                delegatedGroup=self.delegatedGroup,
            )
        result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
        if not result["OK"]:
            return result
        inputSandbox.append(result["Value"])
        classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)
    return S_OK()
def test_uploadFilesAsSandbox( self ):
    # Smoke test: upload a single in-memory file with the TransferClient
    # patched out so no real service is contacted (Python 2 test).
    ourSSC = importlib.import_module( 'DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient' )
    ourSSC.TransferClient = MagicMock()
    ssc = SandboxStoreClient()
    # One in-memory file is enough to exercise the upload path
    fileList = [StringIO.StringIO( 'try' )]
    res = ssc.uploadFilesAsSandbox( fileList )
    print res
def test_uploadFilesAsSandbox(self):
    """Smoke-test uploadFilesAsSandbox with a mocked TransferClient."""
    sscModule = importlib.import_module('DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient')
    sscModule.TransferClient = MagicMock()
    storeClient = SandboxStoreClient()
    res = storeClient.uploadFilesAsSandbox([StringIO.StringIO('try')])
    print(res)
def test_uploadFilesAsSandbox(mocker, setUp):
    """Smoke-test uploadFilesAsSandbox with the TransferClient patched out."""
    mocker.patch(
        "DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient.TransferClient",
        return_value=MagicMock(),
    )
    storeClient = SandboxStoreClient()
    res = storeClient.uploadFilesAsSandbox([BytesIO(b"try")])
    print(res)
def __uploadInputSandbox(self, classAdJob): """Checks the validity of the job Input Sandbox. The function returns the list of Input Sandbox files. The total volume of the input sandbox is evaluated """ sandboxClient = SandboxStoreClient( useCertificates=self.useCertificates, rpcClient=self.sbRPCClient, transferClient=self.sbTransferClient ) inputSandbox = self.__getInputSandboxEntries(classAdJob) realFiles = [] badFiles = [] okFiles = [] realFiles = [] for file in inputSandbox: valid = True for tag in ( "lfn:", "LFN:", "SB:", "%s", ): # in case of parametric input sandbox, there is %s passed, so have to ignore it also if file.find(tag) == 0: valid = False break if valid: realFiles.append(file) # If there are no files, skip! if not realFiles: return S_OK() # Check real files for file in realFiles: if not os.path.exists(file): badFiles.append(file) print "inputSandbox file/directory " + file + " not found" continue okFiles.append(file) # print "Total size of the inputSandbox: "+str(totalSize) totalSize = File.getGlobbedTotalSize(okFiles) if badFiles: result = S_ERROR("Input Sandbox is not valid") result["BadFile"] = badFiles result["TotalSize"] = totalSize return result if okFiles: result = sandboxClient.uploadFilesAsSandbox(okFiles) if not result["OK"]: return result inputSandbox.append(result["Value"]) classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox) return S_OK()
def __uploadInputSandbox(self, classAdJob): """Checks the validity of the job Input Sandbox. The function returns the list of Input Sandbox files. The total volume of the input sandbox is evaluated """ sandboxClient = SandboxStoreClient( useCertificates=self.useCertificates, rpcClient=self.sbRPCClient, transferClient=self.sbTransferClient) inputSandbox = self.__getInputSandboxEntries(classAdJob) realFiles = [] badFiles = [] okFiles = [] realFiles = [] for file in inputSandbox: valid = True for tag in ( 'lfn:', 'LFN:', 'SB:', '%s' ): #in case of parametric input sandbox, there is %s passed, so have to ignore it also if file.find(tag) == 0: valid = False break if valid: realFiles.append(file) #If there are no files, skip! if not realFiles: return S_OK() #Check real files for file in realFiles: if not os.path.exists(file): badFiles.append(file) print "inputSandbox file/directory " + file + " not found" continue okFiles.append(file) #print "Total size of the inputSandbox: "+str(totalSize) totalSize = File.getGlobbedTotalSize(okFiles) if badFiles: result = S_ERROR('Input Sandbox is not valid') result['BadFile'] = badFiles result['TotalSize'] = totalSize return result if okFiles: result = sandboxClient.uploadFilesAsSandbox(okFiles) if not result['OK']: return result inputSandbox.append(result['Value']) classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox) return S_OK()
def __assignSandboxesToJob(self, jobID, classAdJob):
    """Register every SB: input sandbox reference from the JDL with jobID."""
    sandboxClient = SandboxStoreClient()
    sbToAssign = [isb for isb in self.__getInputSandboxEntries(classAdJob)
                  if isb.startswith("SB:")]
    if sbToAssign:
        assignList = [(isb, 'Input') for isb in sbToAssign]
        result = sandboxClient.assignSandboxesToJob(jobID, assignList)
        if not result['OK']:
            return result
    return S_OK()
def __assignSandboxesToJob( self, jobID, classAdJob ):
    """Assign all SB: references from the input sandbox to the given job."""
    inputSandboxes = self.__getInputSandboxEntries( classAdJob )
    sbToAssign = [ isb for isb in inputSandboxes if isb.startswith( "SB:" ) ]
    if not sbToAssign:
        return S_OK()
    assignList = [ ( isb, 'Input' ) for isb in sbToAssign ]
    result = SandboxStoreClient().assignSandboxesToJob( jobID, assignList )
    if not result[ 'OK' ]:
        return result
    return S_OK()
def getSandbox():
    """Download the sandbox named by the 'sburl' query parameter.

    NOTE(review): on success this only prints the result and returns None,
    and the temporary download directory is never removed -- looks
    unfinished; confirm intended behaviour against the caller.
    """
    request = bottle.request
    if 'sburl' not in request.query:
        bottle.abort( 400, "Missing sburl parameter" )
    sburl = urllib.unquote( request.query[ 'sburl' ] )
    # Acts on behalf of the authenticated user via delegation
    sbClient = SandboxStoreClient( useCertificates = True,
                                   delegatedDN = gOAData.userDN,
                                   delegatedGroup = gOAData.userGroup )
    tmpDir = tempfile.mkdtemp( prefix = "down.", dir = gWorkDir )
    result = sbClient.downloadSandbox( sburl, tmpDir, unpack = False )
    if not result[ 'OK' ]:
        print result
        # tmpDir is presumably still empty after a failed download,
        # otherwise os.rmdir would raise -- TODO confirm
        os.rmdir( tmpDir )
        bottle.abort( 401, "Can't download %s" % sburl )
    print result
def __init__(self, jobManagerClient=False, sbRPCClient=False, sbTransferClient=False,
             useCertificates=False, timeout=120):
    """WMS Client constructor"""
    # Connection options first
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManagerClient = jobManagerClient
    # Sandbox store client, reusing any supplied transports
    self.sandboxClient = SandboxStoreClient(useCertificates=useCertificates,
                                            rpcClient=sbRPCClient,
                                            transferClient=sbTransferClient)
def initialize(self):
    """Standard constructor"""
    import threading
    # Agent scheduling / thread-pool options
    for option, value in (("PollingTime", 5),
                          ("ThreadStartDelay", 1),
                          ("SubmitPools", []),
                          ("DefaultSubmitPools", []),
                          ("minThreadsInPool", 0),
                          ("maxThreadsInPool", 2),
                          ("totalThreadsInPool", 40)):
        self.am_setOption(option, value)
    self.callBackLock = threading.Lock()
    self.pendingJobs = {}
    self.monitoringEndPoints = {}
    # SandBox settings
    self.__tmpSandBoxDir = "/tmp/"
    self.sandboxClient = SandboxStoreClient()
    self.failedFlag = True
    self.sandboxSizeLimit = 1024 * 1024 * 10  # 10 MiB
    self.fileList = 0
    self.outputSandboxSize = 0
    self.cleanDataAfterFinish = True
    return DIRAC.S_OK()
def uploadSandbox(self, fileData):
    """Write each uploaded part into a temp dir and upload them as a sandbox.

    :param fileData: mapping of form-field name -> list of uploaded entries,
        each exposing .filename and .body
    :return: WOK with the sandbox URL, or WErr(500) on upload failure
    """
    with TmpDir() as tmpDir:
        fileList = []
        for fName in fileData:
            for entry in fileData[fName]:
                tmpFile = os.path.join(tmpDir, entry.filename)
                if tmpFile in fileList:
                    continue  # same filename uploaded twice: keep first copy
                fileList.append(tmpFile)
                # FIX: with-statement closes the handle even if write() raises
                with open(tmpFile, "w") as dfd:
                    dfd.write(entry.body)
        sbClient = SandboxStoreClient()
        result = sbClient.uploadFilesAsSandbox(fileList)
        if not result['OK']:
            return WErr(500, result['Message'])
        return WOK(result['Value'])
def initializeOptimizer( self ):
    """Initialize specific parameters for JobSanityAgent."""
    # Test control flags; N.B. the JDL check is mandatory
    self.inputDataCheck = self.am_getOption( 'InputDataCheck', 1 )
    self.outputDataCheck = self.am_getOption( 'OutputDataCheck', 0 )
    self.inputSandboxCheck = self.am_getOption( 'InputSandboxCheck', 1 )
    self.platformCheck = self.am_getOption( 'PlatformCheck', 0 )
    # Other parameters
    self.voName = getVO( 'lhcb' )
    self.successStatus = self.am_getOption( 'SuccessfulJobStatus', 'OutputReady' )
    self.maxDataPerJob = self.am_getOption( 'MaxInputDataPerJob', 100 )
    # Sandbox
    self.sandboxClient = SandboxStoreClient( useCertificates = True )
    self.log.debug( 'JDL Check ==> Enabled' )
    # Emit the same Enabled/Disabled lines as before, table-driven
    checkFlags = ( ( 'Input Data Check', self.inputDataCheck ),
                   ( 'Output Data Check', self.outputDataCheck ),
                   ( 'Input Sbox Check', self.inputSandboxCheck ),
                   ( 'Platform Check', self.platformCheck ) )
    for label, enabled in checkFlags:
        self.log.debug( '%s ==> %s' % ( label, 'Enabled' if enabled else 'Disabled' ) )
    return S_OK()
def __uploadInputSandbox( self, classAdJob, jobDescriptionObject = None ):
    """Checks the validity of the job Input Sandbox.

    The function returns the list of Input Sandbox files.
    The total volume of the input sandbox is evaluated.
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )
    realFiles = []
    badFiles = []
    diskFiles = []
    for isFile in inputSandbox:
        # in case of parametric input sandbox, there is %s passed, so have to ignore it also
        if not isFile.startswith( ( 'lfn:', 'LFN:', 'SB:', '%s' ) ):
            realFiles.append( isFile )

    stringIOFiles = []
    stringIOFilesSize = 0
    if jobDescriptionObject is not None:
        if isinstance( jobDescriptionObject, StringIO.StringIO ):
            stringIOFiles = [jobDescriptionObject]
            # NOTE(review): relies on the Py2 StringIO internal .buf attribute;
            # getvalue() would be the public API -- confirm before changing
            stringIOFilesSize = len( jobDescriptionObject.buf )
            gLogger.debug( "Size of the stringIOFiles: " + str( stringIOFilesSize ) )
        else:
            return S_ERROR( "jobDescriptionObject is not a StringIO object" )

    # Check real files
    for isFile in realFiles:
        if not os.path.exists( isFile ):
            # we are passing in real files, we expect them to be on disk
            badFiles.append( isFile )
            # FIX: this message was split by a raw newline in the source,
            # which is a syntax error inside a plain string literal
            gLogger.warn( "inputSandbox file/directory " + isFile + " not found. Keep looking for the others" )
            continue
        diskFiles.append( isFile )

    diskFilesSize = File.getGlobbedTotalSize( diskFiles )
    gLogger.debug( "Size of the diskFiles: " + str( diskFilesSize ) )
    totalSize = diskFilesSize + stringIOFilesSize
    gLogger.verbose( "Total size of the inputSandbox: " + str( totalSize ) )

    okFiles = stringIOFiles + diskFiles
    if badFiles:
        result = S_ERROR( 'Input Sandbox is not valid' )
        result['BadFile'] = badFiles
        result['TotalSize'] = totalSize
        return result

    if okFiles:
        if not self.sandboxClient:
            self.sandboxClient = SandboxStoreClient( useCertificates = self.useCertificates )
        result = self.sandboxClient.uploadFilesAsSandbox( okFiles )
        if not result[ 'OK' ]:
            return result
        inputSandbox.append( result[ 'Value' ] )
        classAdJob.insertAttributeVectorString( "InputSandbox", inputSandbox )
    return S_OK()
def web_getSandbox( self ):
    """Stream a job sandbox back to the browser as a tar attachment."""
    args = self.request.arguments
    if 'jobID' not in args:
        self.finish( {"success":"false", "error":"Maybe you forgot the jobID ?"} )
        return
    jobID = int( args['jobID'][0] )
    sbType = str( args['sandbox'][0] ) if 'sandbox' in args else 'Output'
    userData = self.getSessionData()
    sbClient = SandboxStoreClient( useCertificates = True,
                                   delegatedDN = str( userData["user"]["DN"] ),
                                   delegatedGroup = str( userData["user"]["group"] ),
                                   setup = userData["setup"] )
    # Download in a worker thread so the IO loop stays responsive
    result = yield self.threadTask( sbClient.downloadSandboxForJob, jobID, sbType, inMemory = True )
    if not result['OK']:
        self.finish( {"success":"false", "error":"Error: %s" % result['Message']} )
        return
    if "check" in args:
        self.finish( {"success":"true"} )
        return
    data = result['Value']
    fname = "%s_%sSandbox.tar" % ( str( jobID ), sbType )
    # Tar payload with caching disabled so browsers always re-fetch
    for header, value in ( ( 'Content-type', 'application/x-tar' ),
                           ( 'Content-Disposition', 'attachment; filename="%s"' % fname ),
                           ( 'Content-Length', len( data ) ),
                           ( 'Cache-Control', "no-cache, no-store, must-revalidate, max-age=0" ),
                           ( 'Pragma', "no-cache" ) ):
        self.set_header( header, value )
    self.finish( data )
def uploadSandbox( self, fileData ):
    """Persist the uploaded parts into a temp dir and upload them as a sandbox.

    :param fileData: mapping of form-field name -> list of uploaded entries,
        each exposing .filename and .body
    :return: WOK( sandbox URL ) or WErr( 500, message )
    """
    with TmpDir() as tmpDir:
        fileList = []
        for fName in fileData:
            for entry in fileData[ fName ]:
                tmpFile = os.path.join( tmpDir, entry.filename )
                if tmpFile not in fileList:
                    fileList.append( tmpFile )
                    # FIX: with-statement closes the handle even if write() raises
                    with open( tmpFile, "w" ) as dfd:
                        dfd.write( entry.body )
        sbClient = SandboxStoreClient()
        result = sbClient.uploadFilesAsSandbox( fileList )
        if not result[ 'OK' ]:
            return WErr( 500, result[ 'Message' ] )
        return WOK( result[ 'Value' ] )
def listSandboxes( type, id ):
    """List the sandboxes registered for a job or a pilot.

    NOTE: the parameter names shadow builtins, but they are bottle route
    keywords and cannot be renamed without breaking URL dispatch.
    """
    type = type.lower()
    if type not in ( 'job', 'pilot' ):
        bottle.abort( 404 )
    try:
        id = int( id )
    except ValueError:
        bottle.abort( 400, "id has to be an integer" )
    sbClient = SandboxStoreClient( useCertificates = True,
                                   delegatedDN = gOAData.userDN,
                                   delegatedGroup = gOAData.userGroup )
    # Dispatch on the entity type validated above
    if type == "job":
        result = sbClient.getSandboxesForJob( id )
    else:
        result = sbClient.getSandboxesForPilot( id )
    if not result[ 'OK' ]:
        bottle.abort( 500, result[ 'Message' ] )
    return result[ 'Value' ]
def __init__( self, jobManagerClient = False, sbRPCClient = False, sbTransferClient = False,
              useCertificates = False, timeout = 120 ):
    """ WMS Client constructor """
    # Connection options first
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManagerClient = jobManagerClient
    # Sandbox store client, reusing any supplied transports
    self.sandboxClient = SandboxStoreClient( useCertificates = useCertificates,
                                             rpcClient = sbRPCClient,
                                             transferClient = sbTransferClient )
def test_SSCChain( self ):
    """ full test of functionalities """
    ssc = SandboxStoreClient()
    smDB = SandboxMetadataDB()
    # Locate the helper script shipped with the WMS tests
    exeScriptLocation = find_all( 'exe-script.py', '.', 'WorkloadManagementSystem' )[0]
    fileList = [exeScriptLocation]
    # Plain upload, then an upload attached to job 1
    res = ssc.uploadFilesAsSandbox( fileList )
    self.assert_( res['OK'] )
    # SEPFN = res['Value'].split( '|' )[1]
    res = ssc.uploadFilesAsSandboxForJob( fileList, 1, 'Input' )
    self.assert_( res['OK'] )
    # res = ssc.downloadSandboxForJob( 1, 'Input' ) #to run this would need the RSS on
    # self.assert_( res['OK'] )
    # only ones needing the DB
    res = smDB.getUnusedSandboxes()
    self.assert_( res['OK'] )
def test_SSCChain():
    """full test of functionalities"""
    storeClient = SandboxStoreClient()
    metadataDB = SandboxMetadataDB()
    sandboxFiles = [find_all("exe-script.py", "..", "/DIRAC/tests/Integration")[0]]
    res = storeClient.uploadFilesAsSandbox(sandboxFiles)
    assert res["OK"] is True, res["Message"]
    # SEPFN = res['Value'].split( '|' )[1]
    res = storeClient.uploadFilesAsSandboxForJob(sandboxFiles, 1, "Input")
    assert res["OK"] is True, res["Message"]
    res = storeClient.downloadSandboxForJob(1, "Input")  # to run this we need the RSS on
    print(res)  # for debug...
    assert res["OK"] is True, res["Message"]
    # only ones needing the DB
    res = metadataDB.getUnusedSandboxes()
    print(res)
    assert res["OK"] is True, res["Message"]
def __init__(self, jobManagerClient=None, sbRPCClient=None, sbTransferClient=None,
             useCertificates=False, timeout=600):
    """ WMS Client constructor

    Here we also initialize the needed clients and connections
    """
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManager = jobManagerClient
    # Build the sandbox client only when both transports were supplied;
    # otherwise leave it unset for lazy creation by the first caller.
    if sbRPCClient and sbTransferClient:
        self.sandboxClient = SandboxStoreClient(rpcClient=sbRPCClient,
                                                transferClient=sbTransferClient,
                                                useCertificates=useCertificates)
    else:
        self.sandboxClient = None
def test_SSCChain(self):
    """ full test of functionalities """
    ssc = SandboxStoreClient()
    smDB = SandboxMetadataDB()
    # Locate the helper script shipped with the integration tests
    exeScriptLocation = find_all('exe-script.py', '..', '/DIRAC/tests/Integration')[0]
    fileList = [exeScriptLocation]
    res = ssc.uploadFilesAsSandbox(fileList)
    assert res['OK'] is True
    # SEPFN = res['Value'].split( '|' )[1]
    res = ssc.uploadFilesAsSandboxForJob(fileList, 1, 'Input')
    assert res['OK'] is True
    res = ssc.downloadSandboxForJob(1, 'Input')  # to run this we need the RSS on
    print res  # for debug...
    assert res['OK'] is True
    # only ones needing the DB
    res = smDB.getUnusedSandboxes()
    print res
    assert res['OK'] is True
def _getJobSB( self, jid, objName ):
    """Download a job's input or output sandbox.

    :param jid: job ID (stringy, converted to int)
    :param objName: "outputsandbox" selects Output, anything else Input
    :return: WOK with the in-memory sandbox data, WErr( 404/500 ) on failure
    """
    with TmpDir() as tmpDir:
        sbType = "Output" if objName == "outputsandbox" else "Input"
        result = SandboxStoreClient().downloadSandboxForJob( int( jid ), sbType, tmpDir, inMemory = True )
        if not result[ 'OK' ]:
            msg = result[ 'Message' ]
            if msg.find( "No %s sandbox" % sbType ) == 0:
                # BUG FIX: the format arguments were swapped, producing
                # e.g. "No 123 sandbox defined for job output"
                return WErr( 404, "No %s sandbox defined for job %s" % ( sbType.lower(), jid ) )
            return WErr( 500, result[ 'Message' ] )
        return WOK( result[ 'Value' ] )
def test_SSCChain(self):
    """ full test of functionalities """
    storeClient = SandboxStoreClient()
    metadataDB = SandboxMetadataDB()
    sandboxFiles = [find_all('exe-script.py', '..', '/DIRAC/tests/Integration')[0]]
    res = storeClient.uploadFilesAsSandbox(sandboxFiles)
    assert res['OK'] is True
    # SEPFN = res['Value'].split( '|' )[1]
    res = storeClient.uploadFilesAsSandboxForJob(sandboxFiles, 1, 'Input')
    assert res['OK'] is True
    res = storeClient.downloadSandboxForJob(1, 'Input')  # to run this we need the RSS on
    print(res)  # for debug...
    assert res['OK'] is True
    # only ones needing the DB
    res = metadataDB.getUnusedSandboxes()
    print(res)
    assert res['OK'] is True
def __init__( self, user , publicIP, looptime , parentthread, output, getinfo ):
    """Monitoring thread constructor: keeps an SSH connection plus sandbox state."""
    threading.Thread.__init__( self )
    self.sshConnect = ConnectionUtils( user , publicIP )
    self.looptime = looptime
    self.parentthread = parentthread
    self.output = output
    self.getinfo = getinfo
    # SandBox settings
    self.sandboxClient = SandboxStoreClient()
    self.failedFlag = True
    self.sandboxSizeLimit = 1024 * 1024 * 10  # 10 MiB
def __init__( self, jobManagerClient = None, sbRPCClient = None, sbTransferClient = None,
              useCertificates = False, timeout = 600 ):
    """ WMS Client constructor

        Here we also initialize the needed clients and connections
    """
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManager = jobManagerClient
    # Create the sandbox client eagerly only when both transports are provided
    if sbRPCClient and sbTransferClient:
        self.sandboxClient = SandboxStoreClient( rpcClient = sbRPCClient,
                                                 transferClient = sbTransferClient,
                                                 useCertificates = useCertificates )
    else:
        self.sandboxClient = None
def __uploadInputSandbox( self, classAdJob ):
    """Checks the validity of the job Input Sandbox.

    The function returns the list of Input Sandbox files.
    The total volume of the input sandbox is evaluated.
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )
    # in case of parametric input sandbox, there is %s passed, so have to ignore it also
    realFiles = [ isFile for isFile in inputSandbox
                  if not isFile.startswith( ( 'lfn:', 'LFN:', 'SB:', '%s' ) ) ]
    # If there are no files, skip!
    if not realFiles:
        return S_OK()
    badFiles = []
    okFiles = []
    # Check real files
    for isFile in realFiles:
        if os.path.exists( isFile ):
            okFiles.append( isFile )
        else:
            badFiles.append( isFile )
            gLogger.warn( "inputSandbox file/directory " + isFile + " not found. Keep looking for the others" )
    totalSize = File.getGlobbedTotalSize( okFiles )
    gLogger.verbose( "Total size of the inputSandbox: " + str( totalSize ) )
    if badFiles:
        result = S_ERROR( 'Input Sandbox is not valid' )
        result['BadFile'] = badFiles
        result['TotalSize'] = totalSize
        return result
    if okFiles:
        if not self.sandboxClient:
            self.sandboxClient = SandboxStoreClient( useCertificates = self.useCertificates )
        result = self.sandboxClient.uploadFilesAsSandbox( okFiles )
        if not result[ 'OK' ]:
            return result
        inputSandbox.append( result[ 'Value' ] )
        classAdJob.insertAttributeVectorString( "InputSandbox", inputSandbox )
    return S_OK()
class WMSClient( object ):
  """Client to the Workload Management System: sandbox upload, job
  submission and job lifecycle operations (kill/delete/reschedule/reset).
  """

  def __init__( self, jobManagerClient = None, sbRPCClient = None, sbTransferClient = None,
                useCertificates = False, timeout = 600 ):
    """ WMS Client constructor

        Here we also initialize the needed clients and connections
    """
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManager = jobManagerClient
    self.sandboxClient = None
    if sbRPCClient and sbTransferClient:
      self.sandboxClient = SandboxStoreClient( rpcClient = sbRPCClient,
                                               transferClient = sbTransferClient,
                                               useCertificates = useCertificates )

###############################################################################

  def __getInputSandboxEntries( self, classAdJob ):
    """Return the InputSandbox JDL attribute as a plain list of entries."""
    if classAdJob.lookupAttribute( "InputSandbox" ):
      inputSandbox = classAdJob.get_expression( "InputSandbox" )
      # Strip the ClassAd list syntax: {"a","b"} -> ['a', 'b']
      inputSandbox = inputSandbox.replace( '","', "\n" )
      inputSandbox = inputSandbox.replace( '{', "" )
      inputSandbox = inputSandbox.replace( '}', "" )
      inputSandbox = inputSandbox.replace( '"', "" )
      inputSandbox = inputSandbox.replace( ',', "" )
      inputSandbox = inputSandbox.split()
    else:
      inputSandbox = []
    return inputSandbox

  def __uploadInputSandbox( self, classAdJob ):
    """Checks the validity of the job Input Sandbox.
       The function returns the list of Input Sandbox files.
       The total volume of the input sandbox is evaluated
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )
    badFiles = []
    okFiles = []
    realFiles = []
    for isFile in inputSandbox:
      # in case of parametric input sandbox, there is %s passed, so have to ignore it also
      if not isFile.startswith( ( 'lfn:', 'LFN:', 'SB:', '%s' ) ):
        realFiles.append( isFile )
    # If there are no files, skip!
    if not realFiles:
      return S_OK()
    # Check real files
    for isFile in realFiles:
      if not os.path.exists( isFile ):
        badFiles.append( isFile )
        # FIX: this warning string was split by a raw newline (syntax error)
        gLogger.warn( "inputSandbox file/directory " + isFile + " not found. Keep looking for the others" )
        continue
      okFiles.append( isFile )
    totalSize = File.getGlobbedTotalSize( okFiles )
    gLogger.verbose( "Total size of the inputSandbox: " + str( totalSize ) )
    if badFiles:
      result = S_ERROR( 'Input Sandbox is not valid' )
      result['BadFile'] = badFiles
      result['TotalSize'] = totalSize
      return result
    if okFiles:
      if not self.sandboxClient:
        self.sandboxClient = SandboxStoreClient( useCertificates = self.useCertificates )
      result = self.sandboxClient.uploadFilesAsSandbox( okFiles )
      if not result[ 'OK' ]:
        return result
      inputSandbox.append( result[ 'Value' ] )
      classAdJob.insertAttributeVectorString( "InputSandbox", inputSandbox )
    return S_OK()

  def submitJob( self, jdl ):
    """ Submit one job specified by its JDL to WMS
    """
    if os.path.exists( jdl ):
      # FIX: use a with-statement so the file is closed even if read() raises
      with open( jdl, "r" ) as fic:
        jdlString = fic.read()
    else:
      # If file JDL does not exist, assume that the JDL is passed as a string
      jdlString = jdl
    # Check the validity of the input JDL
    jdlString = jdlString.strip()
    if jdlString.find( "[" ) != 0:
      jdlString = "[%s]" % jdlString
    classAdJob = ClassAd( jdlString )
    if not classAdJob.isOK():
      return S_ERROR( 'Invalid job JDL' )
    # Check the size and the contents of the input sandbox
    result = self.__uploadInputSandbox( classAdJob )
    if not result['OK']:
      return result
    # Submit the job now and get the new job ID
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    result = self.jobManager.submitJob( classAdJob.asJDL() )
    if 'requireProxyUpload' in result and result['requireProxyUpload']:
      gLogger.warn( "Need to upload the proxy" )
    return result

  def killJob( self, jobID ):
    """ Kill running job.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    return self.jobManager.killJob( jobID )

  def deleteJob( self, jobID ):
    """ Delete job(s) from the WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    return self.jobManager.deleteJob( jobID )

  def rescheduleJob( self, jobID ):
    """ Reschedule job(s) in WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    return self.jobManager.rescheduleJob( jobID )

  def resetJob( self, jobID ):
    """ Reset job(s) in WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    return self.jobManager.resetJob( jobID )
class JobSanityAgent(OptimizerModule):
    """Optimizer performing elementary sanity checks on newly submitted jobs.

    The specific Optimizer must provide the following methods:
      - checkJob() - the main method called for each job
    and it can provide:
      - initializeOptimizer() before each execution cycle
    """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for JobSanityAgent.

        Reads the enable/disable flags for the individual checks from the
        agent options and creates the SandboxStoreClient used later by
        checkInputSandbox().
        """
        # Test control flags. N.B. the JDL check is mandatory and always on.
        self.inputDataCheck = self.am_getOption('InputDataCheck', 1)
        self.outputDataCheck = self.am_getOption('OutputDataCheck', 0)
        self.inputSandboxCheck = self.am_getOption('InputSandboxCheck', 1)
        self.platformCheck = self.am_getOption('PlatformCheck', 0)

        # Other parameters
        self.successStatus = self.am_getOption('SuccessfulJobStatus', 'OutputReady')
        self.maxDataPerJob = self.am_getOption('MaxInputDataPerJob', 100)

        # Sandbox client: the agent authenticates with its host certificate
        self.sandboxClient = SandboxStoreClient(useCertificates=True)

        self.log.debug('JDL Check ==> Enabled')
        # Log the state of each optional check (messages identical to the
        # historical per-flag if/else blocks)
        for enabled, label in ((self.inputDataCheck, 'Input Data Check'),
                               (self.outputDataCheck, 'Output Data Check'),
                               (self.inputSandboxCheck, 'Input Sbox Check'),
                               (self.platformCheck, 'Platform Check')):
            self.log.debug('%s ==> %s' % (label, 'Enabled' if enabled else 'Disabled'))

        return S_OK()

    #############################################################################
    def checkJob(self, job, classAdJob):
        """Control the order and presence of each sanity check for submitted jobs.

        This should be easily extended in the future to accommodate any
        other potential checks.

        :param job: job ID
        :param classAdJob: ClassAd object with the job JDL
        """
        # Job JDL check
        message = "Job: %s JDL: OK," % job
        self.log.debug("Checking Loop Starts for job %s" % job)

        jobType = self.jobDB.getJobAttribute(job, 'JobType')
        if not jobType['OK']:
            return S_ERROR('Could not determine job type')
        jobType = jobType['Value']

        # Input data check
        if self.inputDataCheck:
            voName = classAdJob.getAttributeString("VirtualOrganization")
            inputData = self.checkInputData(job, jobType, voName)
            if inputData['OK']:
                number = inputData['Value']
                message += 'InputData: ' + number + ', '
            else:
                # NOTE: on failure checkInputData() carries the minor status in 'Value'
                minorStatus = inputData['Value']
                self.log.info(message)
                self.log.info('Job: ' + str(job) + ' Failed input data check.')
                return S_ERROR(minorStatus)

        # Platform check
        # disabled
        if self.platformCheck:
            platform = self.checkPlatformSupported(job, classAdJob)
            if platform['OK']:
                arch = platform['Value']
                message += 'Platform: ' + arch + ' OK, '
            else:
                res = 'No supported platform for job ' + str(job) + '.'
                minorStatus = platform['Value']
                self.log.info(message)
                self.log.info(res)
                return S_ERROR(message)

        # Output data exists check
        if self.outputDataCheck:  # disabled
            if jobType != 'user':
                outputData = self.checkOutputDataExists(job, classAdJob)
                if outputData['OK']:
                    # FIX: dict.has_key() is Python-2 only; use the 'in' operator
                    if 'SUCCESS' in outputData:
                        success = self.successStatus
                        minorStatus = outputData['SUCCESS']
                        report = outputData['Value']
                        message += report
                        self.log.info(message)
                        self.setJobParam(job, 'JobSanityCheck', message)
                        self.updateJobStatus(job, success, minorStatus)
                        # FIXME: this can not be a S_OK(), Job has to be aborted if OutPut data is present
                        return S_OK('Found successful job')
                    else:
                        flag = outputData['Value']
                        message += 'Output Data: ' + flag + ', '
                else:
                    res = 'Job: ' + str(job) + ' Failed since output data exists.'
                    minorStatus = outputData['Value']
                    self.log.info(message)
                    self.log.info(res)
                    return S_ERROR(message)

        # Input Sandbox uploaded check
        if self.inputSandboxCheck:  # disabled
            inputSandbox = self.checkInputSandbox(job, classAdJob)
            if inputSandbox['OK']:
                sbChecked = inputSandbox['Value']
                message += ' Input Sandboxes: %s, OK.' % sbChecked
            else:
                res = 'Job: %s failed due some missing sandboxes' % job
                minorStatus = inputSandbox['Message']
                self.log.info(message)
                self.log.info(res)
                return S_ERROR(minorStatus)

        self.log.info(message)
        self.setJobParam(job, 'JobSanityCheck', message)
        return self.setNextOptimizer(job)

    #############################################################################
    def checkInputData(self, job, jobType, voName):
        """Check both the amount of input datasets for the job and whether the
        LFN conventions are correct.

        Returns S_OK with an informative string, or an S_ERROR whose 'Value'
        entry carries the minor status to set (legacy convention relied upon
        by checkJob()).
        """
        maxData = int(self.maxDataPerJob)

        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobDB for %s' % (job))
            self.log.warn(result['Message'])
            result = S_ERROR()
            result['Value'] = 'Input Data Specification'
            return result

        if not result['Value']:
            return S_OK('No input LFNs')

        data = result['Value']
        # seems to be [''] when null, which isn't an empty list ;)
        if not any(data):
            self.log.debug('Job %s has no input data requirement' % (job))
            return S_OK('No input LFNs')

        self.log.debug('Job %s has an input data requirement and will be checked' % (job))
        self.log.debug('Data is: %s' % ('\n' + '\n'.join(data) + '\n'))

        totalData = len(data)
        slashFlag = 0
        incorrectDataFlag = 0
        for lfn in data:
            lfn = lfn.replace('LFN:', '')
            # FIX: the VO name used to be interpolated unescaped into a regex;
            # a plain prefix test is what is actually meant and is metacharacter-safe
            if not lfn.startswith('/' + voName + '/'):
                incorrectDataFlag += 1
            if '//' in lfn:
                slashFlag += 1

        if incorrectDataFlag:
            result = S_ERROR()
            result['Value'] = "Input data not correctly specified"
            return result

        if slashFlag:
            result = S_ERROR()
            result['Value'] = "Input data contains //"
            return result

        # only check limit for user jobs
        if jobType.lower() == 'user' and totalData > maxData:
            message = '%s datasets selected. Max limit is %s.' % (totalData, maxData)
            self.setJobParam(job, 'DatasetCheck', message)
            result = S_ERROR()
            result['Value'] = "Exceeded Maximum Dataset Limit (%s)" % (maxData)
            return result

        result = S_OK()
        result['Value'] = str(totalData) + ' LFNs OK'
        return result

    #############################################################################
    def checkOutputDataExists(self, job, classAdJob):
        """If the job output data is already in the LFC, this method will fail
        the job for the attention of the data manager. To be tidied for DIRAC3...
        """
        # FIXME: To implement checkOutputDataExists
        return S_OK()

    #############################################################################
    def checkPlatformSupported(self, job, classAdJob):
        """This method queries the CS for available platforms supported by DIRAC
        and will check these against what the job requests.
        """
        # FIXME: To implement checkPlatformSupported
        return S_OK()

    #############################################################################
    def checkInputSandbox(self, job, classAdJob):
        """The number of input sandbox files, as specified in the job JDL, are
        checked in the JobDB and assigned to the job in the sandbox store.
        """
        ownerName = classAdJob.getAttributeString("Owner")
        if not ownerName:
            ownerDN = classAdJob.getAttributeString("OwnerDN")
            ownerName = CS.getUsernameForDN(ownerDN)
        ownerGroup = classAdJob.getAttributeString("OwnerGroup")
        jobSetup = classAdJob.getAttributeString("DIRACSetup")
        isbList = classAdJob.getListFromExpression('InputSandbox')

        sbsToAssign = []
        for isb in isbList:
            # Only pre-uploaded sandboxes (SB:<...>) need assignment
            if isb.startswith("SB:"):
                self.log.info("Found a sandbox", isb)
                sbsToAssign.append((isb, "Input"))
        numSBsToAssign = len(sbsToAssign)
        if not numSBsToAssign:
            return S_OK(0)

        self.log.info("Assigning %s sandboxes on behalf of %s@%s" % (numSBsToAssign, ownerName, ownerGroup))
        result = self.sandboxClient.assignSandboxesToJob(job, sbsToAssign, ownerName, ownerGroup, jobSetup)
        if not result['OK']:
            self.log.error("Could not assign sandboxes in the SandboxStore", "assigned to job %s" % job)
            return S_ERROR("Cannot assign sandbox to job")
        assigned = result['Value']
        if assigned != numSBsToAssign:
            # Partial assignment is logged but (historically) not treated as fatal
            self.log.error("Could not assign all sandboxes (%s). Only assigned %s" % (numSBsToAssign, assigned))

        return S_OK(numSBsToAssign)
class WMSClient(object):
    """Client for the WorkloadManagement JobManager service.

    Wraps job submission (with input sandbox upload) and the basic job
    management operations: kill, delete, reschedule, reset.
    """

    def __init__(self, jobManagerClient=None, sbRPCClient=None, sbTransferClient=None,
                 useCertificates=False, timeout=600):
        """WMS Client constructor

        Here we also initialize the needed clients and connections

        :param jobManagerClient: optional pre-built JobManager RPC client
        :param sbRPCClient: optional SandboxStore RPC client
        :param sbTransferClient: optional SandboxStore transfer client
        :param useCertificates: authenticate with host certificates instead of a proxy
        :param timeout: RPC call timeout in seconds
        """
        self.useCertificates = useCertificates
        self.timeout = timeout

        self.jobManager = jobManagerClient

        self.sandboxClient = None
        if sbRPCClient and sbTransferClient:
            self.sandboxClient = SandboxStoreClient(rpcClient=sbRPCClient,
                                                    transferClient=sbTransferClient,
                                                    useCertificates=useCertificates)

    ###############################################################################

    def __ensureJobManager(self):
        """Create the JobManager RPC client on first use and return it.

        Extracted helper: the same lazy construction used to be duplicated in
        submitJob/killJob/deleteJob/rescheduleJob/resetJob.
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager

    def __getInputSandboxEntries(self, classAdJob):
        """Return the InputSandbox JDL attribute as a plain list of strings."""
        if not classAdJob.lookupAttribute("InputSandbox"):
            return []
        inputSandbox = classAdJob.get_expression("InputSandbox")
        # Strip the JDL list decoration: {"a","b","c"} -> ['a', 'b', 'c']
        inputSandbox = inputSandbox.replace('","', "\n")
        inputSandbox = inputSandbox.replace('{', "")
        inputSandbox = inputSandbox.replace('}', "")
        inputSandbox = inputSandbox.replace('"', "")
        inputSandbox = inputSandbox.replace(',', "")
        return inputSandbox.split()

    def __uploadInputSandbox(self, classAdJob):
        """Checks the validity of the job Input Sandbox.

        The function returns the list of Input Sandbox files.
        The total volume of the input sandbox is evaluated
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)

        badFiles = []
        okFiles = []
        realFiles = []
        for isFile in inputSandbox:
            # LFNs and already-uploaded sandboxes (SB:) need no upload;
            # in case of parametric input sandbox, there is %s passed, so have to ignore it also
            if not isFile.startswith(('lfn:', 'LFN:', 'SB:', '%s')):
                realFiles.append(isFile)

        # If there are no files, skip!
        if not realFiles:
            return S_OK()

        # Check that the real files exist on local disk
        for isFile in realFiles:
            if not os.path.exists(isFile):
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile + " not found. Keep looking for the others")
                continue
            okFiles.append(isFile)

        totalSize = File.getGlobbedTotalSize(okFiles)
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

        if badFiles:
            result = S_ERROR('Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result

        if okFiles:
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(useCertificates=self.useCertificates)
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

        return S_OK()

    def submitJob(self, jdl):
        """Submit one job specified by its JDL to WMS.

        :param jdl: path to a JDL file, or the JDL content itself as a string
        :return: S_OK with the new job ID or S_ERROR
        """
        if os.path.exists(jdl):
            # FIX: use a context manager so the handle is closed even if read() fails
            with open(jdl, "r") as fic:
                jdlString = fic.read()
        else:
            # If file JDL does not exist, assume that the JDL is passed as a string
            jdlString = jdl

        # Check the validity of the input JDL
        jdlString = jdlString.strip()
        if jdlString.find("[") != 0:
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR('Invalid job JDL')

        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob)
        if not result['OK']:
            return result

        # Submit the job now and get the new job ID
        result = self.__ensureJobManager().submitJob(classAdJob.asJDL())

        if 'requireProxyUpload' in result and result['requireProxyUpload']:
            gLogger.warn("Need to upload the proxy")

        return result

    def killJob(self, jobID):
        """Kill running job.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.__ensureJobManager().killJob(jobID)

    def deleteJob(self, jobID):
        """Delete job(s) from the WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.__ensureJobManager().deleteJob(jobID)

    def rescheduleJob(self, jobID):
        """Reschedule job(s) in WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.__ensureJobManager().rescheduleJob(jobID)

    def resetJob(self, jobID):
        """Reset job(s) in WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.__ensureJobManager().resetJob(jobID)
class WMSClient(object):
    """Class exposing the following jobs methods:

    submit
    kill
    delete
    remove
    reschedule
    reset
    """

    def __init__(
        self,
        jobManagerClient=None,
        sbRPCClient=None,
        sbTransferClient=None,
        useCertificates=False,
        timeout=600,
        delegatedDN=None,
        delegatedGroup=None,
    ):
        """WMS Client constructor

        Here we also initialize the needed clients and connections

        :param jobManagerClient: optional pre-built JobManager client; when None,
            one is lazily created by the ``jobManager`` property
        :param sbRPCClient: optional SandboxStore RPC client
        :param sbTransferClient: optional SandboxStore transfer client
        :param useCertificates: authenticate with host certificates instead of a proxy
        :param timeout: RPC call timeout in seconds
        :param delegatedDN: DN to act on behalf of (used with useCertificates)
        :param delegatedGroup: group to act on behalf of (used with useCertificates)
        """

        self.useCertificates = useCertificates
        self.delegatedDN = delegatedDN
        self.delegatedGroup = delegatedGroup
        self.timeout = timeout

        self._jobManager = jobManagerClient
        self.operationsHelper = Operations()
        self.sandboxClient = None
        if sbRPCClient and sbTransferClient:
            self.sandboxClient = SandboxStoreClient(
                rpcClient=sbRPCClient, transferClient=sbTransferClient, useCertificates=useCertificates
            )

    @property
    def jobManager(self):
        """Lazily instantiated JobManagerClient carrying the delegation settings."""
        if not self._jobManager:
            self._jobManager = JobManagerClient(
                useCertificates=self.useCertificates,
                delegatedDN=self.delegatedDN,
                delegatedGroup=self.delegatedGroup,
                timeout=self.timeout,
            )
        return self._jobManager

    def __getInputSandboxEntries(self, classAdJob):
        """Return the InputSandbox JDL attribute as a plain list of strings."""
        if classAdJob.lookupAttribute("InputSandbox"):
            inputSandbox = classAdJob.get_expression("InputSandbox")
            # Strip the JDL list decoration: {"a","b","c"} -> ['a', 'b', 'c']
            inputSandbox = inputSandbox.replace('","', "\n")
            inputSandbox = inputSandbox.replace("{", "")
            inputSandbox = inputSandbox.replace("}", "")
            inputSandbox = inputSandbox.replace('"', "")
            inputSandbox = inputSandbox.replace(",", "")
            inputSandbox = inputSandbox.split()
        else:
            inputSandbox = []

        return inputSandbox

    def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
        """Checks the validity of the job Input Sandbox.

        The function returns the list of Input Sandbox files.
        The total volume of the input sandbox is evaluated.
        On success the uploaded sandbox reference is appended to the
        InputSandbox attribute of ``classAdJob`` (mutated in place).
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)

        realFiles = []
        badFiles = []
        diskFiles = []

        for isFile in inputSandbox:
            # LFNs and already-uploaded sandboxes (SB:) need no upload; '%s' and
            # '%(' appear in parametric job descriptions and are skipped as well
            if not isFile.startswith(("lfn:", "LFN:", "SB:", "%s", "%(")):
                realFiles.append(isFile)

        stringIOFiles = []
        stringIOFilesSize = 0
        if jobDescriptionObject is not None:
            if isinstance(jobDescriptionObject, StringIO):
                stringIOFiles = [jobDescriptionObject]
                stringIOFilesSize = len(jobDescriptionObject.getvalue())
                gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))
            else:
                return S_ERROR(EWMSJDL, "jobDescriptionObject is not a StringIO object")

        # Check real files
        for isFile in realFiles:
            if not os.path.exists(isFile):  # we are passing in real files, we expect them to be on disk
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile + " not found. Keep looking for the others")
                continue
            diskFiles.append(isFile)

        diskFilesSize = File.getGlobbedTotalSize(diskFiles)
        gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
        totalSize = diskFilesSize + stringIOFilesSize
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

        okFiles = stringIOFiles + diskFiles
        if badFiles:
            # Report all missing files at once, together with the evaluated size
            result = S_ERROR(EWMSJDL, "Input Sandbox is not valid")
            result["BadFile"] = badFiles
            result["TotalSize"] = totalSize
            return result

        if okFiles:
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(
                    useCertificates=self.useCertificates,
                    delegatedDN=self.delegatedDN,
                    delegatedGroup=self.delegatedGroup,
                )
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result["OK"]:
                return result
            inputSandbox.append(result["Value"])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

        return S_OK()

    def submitJob(self, jdl, jobDescriptionObject=None):
        """Submit one job specified by its JDL to WMS.

        The JDL may actually be the description of a parametric job,
        resulting in multiple DIRAC jobs submitted to the DIRAC WMS
        """

        if os.path.exists(jdl):
            with open(jdl, "r") as fic:
                jdlString = fic.read()
        else:
            # If file JDL does not exist, assume that the JDL is passed as a string
            jdlString = jdl

        jdlString = jdlString.strip()

        gLogger.debug("Submitting JDL", jdlString)
        # Strip of comments in the jdl string
        newJdlList = []
        for line in jdlString.split("\n"):
            if not line.strip().startswith("#"):
                newJdlList.append(line)
        jdlString = "\n".join(newJdlList)

        # Check the validity of the input JDL
        if jdlString.find("[") != 0:
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR(EWMSJDL, "Invalid job JDL")

        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob, jobDescriptionObject)
        if not result["OK"]:
            return result

        # Submit the job now and get the new job ID
        result = getParameterVectorLength(classAdJob)
        if not result["OK"]:
            return result
        nJobs = result["Value"]
        result = self.jobManager.submitJob(classAdJob.asJDL())

        if nJobs:
            gLogger.debug("Applying transactional job submission")
            # The server applies transactional bulk submission, we should confirm the jobs
            if result["OK"]:
                jobIDList = result["Value"]
                if len(jobIDList) == nJobs:
                    # Confirm the submitted jobs, retrying up to 3 times
                    confirmed = False
                    for _attempt in range(3):
                        result = self.jobManager.confirmBulkSubmission(jobIDList)
                        if result["OK"]:
                            confirmed = True
                            break
                        time.sleep(1)
                    if not confirmed:
                        # The bulk submission failed, try to remove the created jobs
                        resultDelete = self.jobManager.removeJob(jobIDList)
                        error = "Job submission failed to confirm bulk transaction"
                        if not resultDelete["OK"]:
                            error += "; removal of created jobs failed"
                        return S_ERROR(EWMSSUBM, error)
                else:
                    return S_ERROR(EWMSSUBM, "The number of submitted jobs does not match job description")

        if result.get("requireProxyUpload"):
            gLogger.warn("Need to upload the proxy")

        return result

    def killJob(self, jobID):
        """Kill running job.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.jobManager.killJob(jobID)

    def deleteJob(self, jobID):
        """Delete job(s) (set their status to DELETED) from the WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.jobManager.deleteJob(jobID)

    def removeJob(self, jobID):
        """Fully remove job(s) from the WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.jobManager.removeJob(jobID)

    def rescheduleJob(self, jobID):
        """Reschedule job(s) in WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.jobManager.rescheduleJob(jobID)

    def resetJob(self, jobID):
        """Reset job(s) in WMS Job database.

        jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        return self.jobManager.resetJob(jobID)
def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
    """Checks the validity of the job Input Sandbox.

    The function returns the list of Input Sandbox files.
    The total volume of the input sandbox is evaluated.

    :param classAdJob: ClassAd with the job JDL; mutated in place - the
        uploaded sandbox reference is appended to its InputSandbox attribute
    :param jobDescriptionObject: optional StringIO with the job description,
        uploaded alongside the disk files
    """
    inputSandbox = self.__getInputSandboxEntries(classAdJob)

    realFiles = []
    badFiles = []
    diskFiles = []

    for isFile in inputSandbox:
        valid = True
        # LFNs and already-uploaded sandboxes (SB:) need no upload
        for tag in ('lfn:', 'LFN:', 'SB:', '%s'):  # in case of parametric input sandbox, there is %s passed, so have to ignore it also
            if isFile.find(tag) == 0:
                valid = False
                break
        if valid:
            realFiles.append(isFile)

    stringIOFiles = []
    stringIOFilesSize = 0
    if jobDescriptionObject is not None:
        # NOTE(review): relies on the Python 2 StringIO internal '.buf'
        # attribute for the size - not portable to io.StringIO
        if isinstance(jobDescriptionObject, StringIO.StringIO):
            stringIOFiles = [jobDescriptionObject]
            stringIOFilesSize = len(jobDescriptionObject.buf)
            gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))
        else:
            return S_ERROR("jobDescriptionObject is not a StringIO object")

    # Check real files
    for isFile in realFiles:
        if not os.path.exists( isFile ):  # we are passing in real files, we expect them to be on disk
            badFiles.append(isFile)
            gLogger.warn("inputSandbox file/directory " + isFile + " not found. Keep looking for the others")
            continue
        diskFiles.append(isFile)

    diskFilesSize = File.getGlobbedTotalSize(diskFiles)
    gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
    totalSize = diskFilesSize + stringIOFilesSize
    gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))

    okFiles = stringIOFiles + diskFiles
    if badFiles:
        # Report all missing files at once, together with the evaluated size
        result = S_ERROR('Input Sandbox is not valid')
        result['BadFile'] = badFiles
        result['TotalSize'] = totalSize
        return result

    if okFiles:
        if not self.sandboxClient:
            self.sandboxClient = SandboxStoreClient(useCertificates=self.useCertificates)
        result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
        if not result['OK']:
            return result
        inputSandbox.append(result['Value'])
        classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)

    return S_OK()
def removeDeletedJobs(self):
    """Fully remove jobs that are already in status "DELETED", unless there are still requests.

    :returns: S_OK/S_ERROR
    """
    result = self._getJobsList({"Status": JobStatus.DELETED})
    if not result["OK"]:
        return result
    jobList = result["Value"]
    if not jobList:
        self.log.info("No jobs to remove")
        return S_OK()

    self.log.info("Unassigning sandboxes from soon to be deleted jobs", "(%d)" % len(jobList))
    unassignRes = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not unassignRes["OK"]:
        self.log.error("Cannot unassign jobs to sandboxes", unassignRes["Message"])
        return unassignRes

    self.log.info("Attempting to remove deleted jobs", "(%d)" % len(jobList))

    # remove from jobList those that have still Operations to do in RMS
    reqClient = ReqClient()
    reqRes = reqClient.getRequestIDsForJobs(jobList)
    if not reqRes["OK"]:
        return reqRes
    jobsWithRequests = reqRes["Value"]["Successful"]
    if jobsWithRequests:
        notFinal = set()
        # Keep a job when its request is not yet in a final state, or when
        # deleting the (final) request fails
        for jobID, requestID in jobsWithRequests.items():
            if reqClient.getRequestStatus(requestID).get("Value") not in Request.FINAL_STATES:
                notFinal.add(jobID)
                continue
            if not reqClient.deleteRequest(requestID)["OK"]:
                notFinal.add(jobID)
        if notFinal:
            self.log.info(
                "Some jobs won't be removed, as still having Requests not in final status",
                "(n=%d)" % len(notFinal))
            jobList = list(set(jobList) - notFinal)

    if not jobList:
        return S_OK()

    fail = False
    for owner, ownerJobs in self._getOwnerJobsDict(jobList).items():
        # Owner key format is "<DN>;<group>"
        ownerDN = owner.split(";")[0]
        ownerGroup = owner.split(";")[1]
        self.log.verbose(
            "Attempting to remove jobs",
            "(n=%d) for %s : %s" % (len(ownerJobs), ownerDN, ownerGroup))
        removalRes = WMSClient(
            useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup).removeJob(ownerJobs)
        if not removalRes["OK"]:
            self.log.error(
                "Could not remove jobs",
                "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(ownerJobs), removalRes["Message"]),
            )
            fail = True

    return S_ERROR() if fail else S_OK()
def removeJobsByStatus(self, condDict, delay=False):
    """Remove jobs selected by ``condDict`` (and optionally an age cut) from the WMS databases.

    :param condDict: selection dictionary passed to JobDB.selectJobs
    :param delay: if set, only jobs older than this are selected
    :returns: S_OK/S_ERROR
    """
    if delay:
        gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
        result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
    else:
        gLogger.verbose("Removing jobs with %s " % condDict)
        result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)
    if not result['OK']:
        return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
        jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
        return S_OK()

    self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

    count = 0
    error_count = 0

    # Unassigning sandboxes is best-effort: a failure is logged but does not stop removal
    result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not result['OK']:
        gLogger.warn("Cannot unassign jobs to sandboxes", result['Message'])

    result = self.deleteJobOversizedSandbox(jobList)
    if not result['OK']:
        gLogger.warn("Cannot schedule removal of oversized sandboxes", result['Message'])
        return result

    # Jobs whose oversized sandbox could not be handled are kept for a later pass
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
        jobList.pop(jobList.index(job))

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
        # Remove one job at a time, optionally throttled, accounting errors per job
        for jobID in jobList:
            resultJobDB = self.jobDB.removeJobFromDB(jobID)
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            resultLogDB = self.jobLoggingDB.deleteJob(jobID)
            errorFlag = False
            if not resultJobDB['OK']:
                # FIX: log the message of the failing call (previously the stale
                # 'result' of deleteJobOversizedSandbox was reported)
                gLogger.warn('Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'])
                errorFlag = True
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                errorFlag = True
            if not resultLogDB['OK']:
                gLogger.warn('Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'])
                errorFlag = True
            if errorFlag:
                error_count += 1
            else:
                count += 1
            if self.throttlingPeriod:
                time.sleep(self.throttlingPeriod)
    else:
        # Bulk removal path: one call per DB where supported
        result = self.jobDB.removeJobFromDB(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

        for jobID in jobList:
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                error_count += 1
            else:
                count += 1

        result = self.jobLoggingDB.deleteJob(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList))

    if count > 0 or error_count > 0:
        gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count))
    return S_OK()
def removeJobsByStatus(self, condDict, delay=False):
    """Remove jobs selected by ``condDict`` (and optionally an age cut) from the WMS databases.

    :param condDict: selection dictionary passed to JobDB.selectJobs
    :param delay: if set, only jobs older than this are selected
    :returns: S_OK/S_ERROR
    """
    if delay:
        gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
        result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
    else:
        gLogger.verbose("Removing jobs with %s " % condDict)
        result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)
    if not result['OK']:
        return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
        jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
        return S_OK()

    self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

    count = 0
    error_count = 0

    # Unassigning sandboxes is best-effort: a failure is logged but does not stop removal
    result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not result['OK']:
        gLogger.warn("Cannot unassign jobs to sandboxes", result['Message'])

    if self.jobByJob:
        # Remove one job at a time, optionally throttled, accounting errors per job
        for jobID in jobList:
            resultJobDB = self.jobDB.removeJobFromDB(jobID)
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            resultLogDB = self.jobLoggingDB.deleteJob(jobID)
            errorFlag = False
            if not resultJobDB['OK']:
                # FIX: log the message of the failing call (previously the stale
                # 'result' of unassignJobs was reported)
                gLogger.warn('Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'])
                errorFlag = True
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                errorFlag = True
            if not resultLogDB['OK']:
                gLogger.warn('Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'])
                errorFlag = True
            if errorFlag:
                error_count += 1
            else:
                count += 1
            if self.throttlingPeriod:
                time.sleep(self.throttlingPeriod)
    else:
        # Bulk removal path: one call per DB where supported
        result = self.jobDB.removeJobFromDB(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

        for jobID in jobList:
            resultTQ = self.taskQueueDB.deleteJob(jobID)
            if not resultTQ['OK']:
                gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'])
                error_count += 1
            else:
                count += 1

        result = self.jobLoggingDB.deleteJob(jobList)
        if not result['OK']:
            gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList))
        else:
            gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList))

    if count > 0 or error_count > 0:
        gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count))
    return S_OK()
class BigDataJobMonitoring(AgentModule):
    """Agent polling BigData (Hadoop) endpoints for the status of submitted jobs.

    For every job known to BigDataDB in status Submitted/Running/Unknown it
    queries the matching endpoint, updates the job status in the DB, retrieves
    the output sandbox of finished jobs and commits an accounting record.
    """

    def initialize(self):
        """Standard constructor"""
        import threading

        self.am_setOption("PollingTime", 5)
        self.am_setOption("ThreadStartDelay", 1)
        self.am_setOption("SubmitPools", [])
        self.am_setOption("DefaultSubmitPools", [])
        self.am_setOption("minThreadsInPool", 0)
        self.am_setOption("maxThreadsInPool", 2)
        self.am_setOption("totalThreadsInPool", 40)
        self.callBackLock = threading.Lock()
        self.pendingJobs = {}  # status -> result dict of BigDataDB.getBigDataJobsByStatus
        self.monitoringEndPoints = {}  # endpoint name -> configuration dict
        # SandBox settings
        self.__tmpSandBoxDir = "/tmp/"
        self.sandboxClient = SandboxStoreClient()
        self.failedFlag = True
        self.sandboxSizeLimit = 1024 * 1024 * 10  # 10 MB upload limit
        self.fileList = 0
        self.outputSandboxSize = 0
        self.cleanDataAfterFinish = True
        return DIRAC.S_OK()

    def execute(self):
        """Main Agent code:
        1.- Query BigDataDB for existing Running, Queue, or Submitted jobs
        2.- Ask about the status
        3.- Change the status into DB in the case of had changed
        """
        self.pendingJobs["Submitted"] = BigDataDB.getBigDataJobsByStatus("Submitted")
        self.pendingJobs["Running"] = BigDataDB.getBigDataJobsByStatus("Running")
        self.pendingJobs["Unknown"] = BigDataDB.getBigDataJobsByStatus("Unknown")

        self.__getMonitoringPools()
        self.log.verbose("monitoring pools", self.monitoringEndPoints)

        for status in self.pendingJobs:
            self.log.verbose("Analizing %s jobs" % status)
            if not self.pendingJobs[status]["OK"]:
                continue
            for jobId in self.pendingJobs[status]["Value"]:
                self.log.verbose("Analizing job %s" % jobId)
                getSoftIdAndSiteName = BigDataDB.getSoftwareJobIDByJobID(jobId[0])
                self.log.verbose("Site and SoftID:", getSoftIdAndSiteName)
                softID = getSoftIdAndSiteName[0][0]
                nameNode = getSoftIdAndSiteName[0][1]
                for runningEndPoint in self.monitoringEndPoints:
                    endPoint = self.monitoringEndPoints[runningEndPoint]
                    # Only endpoints hosting this job's NameNode, and only jobs
                    # that already received a software-side ID, are queried
                    if endPoint["NameNode"] != nameNode or softID == "":
                        continue
                    # Depending on the BigData Software the Query should be different
                    if endPoint["BigDataSoftware"] == "hadoop":
                        if endPoint["BigDataSoftwareVersion"] == "hdv1":
                            if endPoint["HighLevelLanguage"]["HLLName"] == "none":
                                self.__monitorHadoopV1(jobId[0], softID, endPoint)
                        if endPoint["BigDataSoftwareVersion"] == "hdv2":
                            if endPoint["HighLevelLanguage"]["HLLName"] == "none":
                                self.__monitorHadoopV2(jobId[0], softID, endPoint)
        return DIRAC.S_OK()

    def __monitorHadoopV1(self, diracJobID, softID, endPoint):
        """Query a Hadoop v1 endpoint for softID and propagate the status to BigDataDB."""
        self.log.info(
            "Hadoop V.1 Monitoring submmission command with Hadoop jobID: ",
            softID,
        )
        from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV1Client import (
            HadoopV1Client,
        )

        HadoopV1cli = HadoopV1Client(endPoint["User"], endPoint["PublicIP"], endPoint["Port"])
        JobStatus = HadoopV1cli.jobStatus(softID, endPoint["User"], endPoint["PublicIP"])

        # Interactive endpoints: once the current step succeeds, chain the next
        # job and keep the DIRAC job Running (JobStatus["OK"] is cleared so the
        # final-status handling below is skipped this cycle)
        if JobStatus["OK"] and endPoint["IsInteractive"] == "1":
            if JobStatus["Value"][1].strip() == "Succeded":
                result = HadoopV1cli.newJob(self.__tmpSandBoxDir, diracJobID, softID)
                if result["OK"]:
                    result = BigDataDB.updateHadoopIDAndJobStatus(diracJobID, result["Value"])
                    BigDataDB.setJobStatus(diracJobID, "Running")
                    JobStatus["OK"] = False
                else:
                    self.log.info("New result from new Job", result)

        if JobStatus["OK"]:
            status = JobStatus["Value"][1].strip()
            if status == "Succeded":
                BigDataDB.setJobStatus(diracJobID, "Done")
                if endPoint["IsInteractive"] == "1":
                    self.__updateInteractiveSandBox(
                        diracJobID,
                        endPoint["BigDataSoftware"],
                        endPoint["BigDataSoftwareVersion"],
                        endPoint["HighLevelLanguage"]["HLLName"],
                        endPoint["HighLevelLanguage"]["HLLVersion"],
                        HadoopV1cli,
                    )
                else:
                    self.__updateSandBox(
                        diracJobID,
                        endPoint["BigDataSoftware"],
                        endPoint["BigDataSoftwareVersion"],
                        endPoint["HighLevelLanguage"]["HLLName"],
                        endPoint["HighLevelLanguage"]["HLLVersion"],
                        HadoopV1cli,
                    )
                getStatus = HadoopV1cli.jobCompleteStatus(softID)
                if getStatus["OK"]:
                    result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                    if result["OK"]:
                        self.sendJobAccounting(result["Value"], diracJobID)
                if self.cleanDataAfterFinish:
                    self.__deleteData(diracJobID, HadoopV1cli)
            if status == "Unknown":
                BigDataDB.setJobStatus(diracJobID, "Submitted")
            if status == "Running":
                BigDataDB.setJobStatus(diracJobID, "Running")

    def __monitorHadoopV2(self, diracJobID, softID, endPoint):
        """Query a Hadoop v2 endpoint for softID and propagate the status to BigDataDB.

        NOTE: unlike v1, the v2 client returns the status directly in
        JobStatus["Value"] (not ["Value"][1]) and the final comparison is done
        without strip(), mirroring the original code.
        """
        self.log.info(
            "Hadoop V.2 Monitoring submmission command with Hadoop jobID: ",
            softID,
        )
        from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV2Client import (
            HadoopV2Client,
        )

        HadoopV2cli = HadoopV2Client(endPoint["User"], endPoint["PublicIP"])
        JobStatus = HadoopV2cli.jobStatus(softID, endPoint["User"], endPoint["PublicIP"])

        if JobStatus["OK"] and endPoint["IsInteractive"] == "1":
            if JobStatus["Value"].strip() == "Succeded":
                result = HadoopV2cli.newJob(self.__tmpSandBoxDir, diracJobID, softID)
                if result["OK"]:
                    result = BigDataDB.updateHadoopIDAndJobStatus(diracJobID, result["Value"])
                    BigDataDB.setJobStatus(diracJobID, "Running")
                    JobStatus["OK"] = False
                else:
                    self.log.info("New result from new Job", result)

        if JobStatus["OK"]:
            if JobStatus["Value"] == "Succeded":
                BigDataDB.setJobStatus(diracJobID, "Done")
                if endPoint["IsInteractive"] == "1":
                    self.__updateInteractiveSandBox(
                        diracJobID,
                        endPoint["BigDataSoftware"],
                        endPoint["BigDataSoftwareVersion"],
                        endPoint["HighLevelLanguage"]["HLLName"],
                        endPoint["HighLevelLanguage"]["HLLVersion"],
                        HadoopV2cli,
                    )
                else:
                    self.__updateSandBox(
                        diracJobID,
                        endPoint["BigDataSoftware"],
                        endPoint["BigDataSoftwareVersion"],
                        endPoint["HighLevelLanguage"]["HLLName"],
                        endPoint["HighLevelLanguage"]["HLLVersion"],
                        HadoopV2cli,
                    )
                getStatus = HadoopV2cli.jobCompleteStatus(softID)
                if getStatus["OK"]:
                    result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                    if result["OK"]:
                        self.sendJobAccounting(result["Value"], diracJobID)
                # if self.cleanDataAfterFinish:
                #     self.__deleteData( jobId[0], HadoopV2cli )
            if JobStatus["Value"] == "Unknown":
                BigDataDB.setJobStatus(diracJobID, "Submitted")
            if JobStatus["Value"] == "Running":
                BigDataDB.setJobStatus(diracJobID, "Running")

    def sendJobAccounting(self, dataFromBDSoft, jobId):
        """Build and commit an accounting record for a finished job.

        :param dict dataFromBDSoft: counters parsed by getJobFinalStatusInfo
        :param jobId: DIRAC job ID
        :return: result of AccountingJob.commit()
        """
        accountingReport = AccountingJob()
        accountingReport.setStartTime()

        result = jobDB.getJobAttributes(jobId)
        getting = result["Value"]

        if dataFromBDSoft["CPUTime"] == 0:
            # Hadoop did not report CPU time; fall back to wall-clock
            # (end - submission), both converted to epoch seconds
            cpuTime = 0
            if getting["EndExecTime"] != "None":
                epoch = datetime(1970, 1, 1)
                td = datetime.strptime(getting["EndExecTime"], "%Y-%m-%d %H:%M:%S") - epoch
                EndExecTime = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 1e6
                td = datetime.strptime(getting["SubmissionTime"], "%Y-%m-%d %H:%M:%S") - epoch
                SubmissionTime = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 1e6
                cpuTime = EndExecTime - SubmissionTime
        else:
            # Hadoop reports milliseconds
            cpuTime = dataFromBDSoft["CPUTime"] / 1000

        acData = {
            "User": getting["Owner"],
            "UserGroup": getting["OwnerGroup"],
            "JobGroup": "cesga",
            "JobType": "User",
            "JobClass": "unknown",
            "ProcessingType": "unknown",
            "FinalMajorStatus": getting["Status"],
            "FinalMinorStatus": getting["MinorStatus"],
            "CPUTime": cpuTime,
            "Site": getting["Site"],
            # Based on the factor to convert raw CPU to Normalized units (based on the CPU Model)
            "NormCPUTime": 0,
            "ExecTime": cpuTime,
            "InputDataSize": dataFromBDSoft["InputDataSize"],
            "OutputDataSize": dataFromBDSoft["OutputDataSize"],
            "InputDataFiles": dataFromBDSoft["InputDataFiles"],
            "OutputDataFiles": len(self.fileList),
            "DiskSpace": 0,
            "InputSandBoxSize": 0,
            "OutputSandBoxSize": self.outputSandboxSize,
            "ProcessedEvents": 0,
        }
        accountingReport.setEndTime()
        accountingReport.setValuesFromDict(acData)
        self.log.debug("Info for accounting: ", acData)
        result = accountingReport.commit()
        self.log.debug("Accounting insertion: ", result)
        return result

    def getJobFinalStatusInfo(self, jobData):
        """Parse the Hadoop job-completion report text into accounting counters.

        Missing counters default to 0.
        :return: S_OK(dict) with InputDataSize/OutputDataSize/InputDataFiles/CPUTime/ExecTime
        """
        JobOutputInfo = {}

        resulting = re.search(r"Read=(\d+)", jobData)
        if resulting is not None:
            JobOutputInfo["InputDataSize"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["InputDataSize"] = 0

        resulting = re.search(r"Written=(\d+)", jobData)
        if resulting is not None:
            JobOutputInfo["OutputDataSize"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["OutputDataSize"] = 0

        resulting = re.search(r"Map input records=(\d+)", jobData)
        if resulting is not None:
            JobOutputInfo["InputDataFiles"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["InputDataFiles"] = 0

        resulting = re.search(r"CPU.*?=(\d+)", jobData)
        if resulting is not None:
            JobOutputInfo["CPUTime"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["CPUTime"] = 0

        JobOutputInfo["ExecTime"] = 0
        return S_OK(JobOutputInfo)

    def __deleteData(self, jobid, cli):
        """Delete the local sandbox directory and the matching data on the cluster."""
        source = self.__tmpSandBoxDir + str(jobid)
        shutil.rmtree(source)
        result = cli.delData(source)
        if not result["OK"]:
            self.log.error("Error the data on BigData cluster could not be deleted", result)
            return S_ERROR("Data can not be deleted")
        return "Data deleted"

    def __updateInteractiveSandBox(self, jobid, software, version, hll, hllversion, cli):
        """Fetch the output of an interactive job and upload it as Output sandbox."""
        # Delete content of InputSandbox
        jobInfo = BigDataDB.getJobIDInfo(jobid)
        source = self.__tmpSandBoxDir + str(jobid) + "/*_out"
        dest = self.__tmpSandBoxDir + str(jobid)
        result = 0

        result = cli.delHadoopData(self.__tmpSandBoxDir + str(jobid) + "/InputSandbox" + str(jobid))
        self.log.debug("ATENTION::Deleting InputSandBox Contain:", result)

        result = cli.getdata(dest, source)
        self.log.debug("Step 0:getting data from hadoop:", result)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

        self.log.debug("Step:1:GetFilePaths:")
        outputSandbox = self.get_filepaths(self.__tmpSandBoxDir + str(jobid))
        self.log.debug("Step:2:OutputSandBox:", self.__tmpSandBoxDir + str(jobid))
        self.log.debug("Step:2:OutputSandBox:", outputSandbox)
        resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)
        self.log.debug("Step:3:ResolveSandbox:", resolvedSandbox)
        if not resolvedSandbox["OK"]:
            self.log.warn("Output sandbox file resolution failed:")
            self.log.warn(resolvedSandbox["Message"])
            # NOTE(review): self.__report is not defined in this class — verify
            self.__report("Failed", "Resolving Output Sandbox")
        self.fileList = resolvedSandbox["Value"]["Files"]
        missingFiles = resolvedSandbox["Value"]["Missing"]
        if missingFiles:
            self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

        if self.fileList and jobid:
            self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
            self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)
            result = self.sandboxClient.uploadFilesAsSandboxForJob(
                self.fileList, jobid, "Output", self.sandboxSizeLimit
            )  # 1024*1024*10
            if not result["OK"]:
                self.log.error("Output sandbox upload failed with message", result["Message"])
                # BUGFIX: has_key() removed in Python 3; use the `in` operator
                if "SandboxFileName" in result:
                    outputSandboxData = result["SandboxFileName"]
                    self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                    # BUGFIX: outputData was never defined here (NameError)
                    outputData = []
                    outputData.append(outputSandboxData)
                    self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                    self.jobReport.setJobParameter(
                        "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                    )
                else:
                    self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                    return S_ERROR(
                        "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                    )
            else:
                # Do not overwrite in case of Error
                if not self.failedFlag:
                    self.__report("Completed", "Output Sandbox Uploaded")
                self.log.info("Sandbox uploaded successfully")
        return "OK"

    def __updateSandBox(self, jobid, software, version, hll, hllversion, cli):
        """Fetch the output of a batch job from the cluster DFS and upload it as Output sandbox."""
        jobInfo = BigDataDB.getJobIDInfo(jobid)

        source = (
            self.__tmpSandBoxDir
            + str(jobid)
            + "/InputSandbox"
            + str(jobid)
            + "/"
            + self.__getJobName(jobInfo[0][0]).replace(" ", "")
            + "_"
            + str(jobid)
        )
        dest = (
            self.__tmpSandBoxDir
            + str(jobid)
            + "/"
            + self.__getJobName(jobInfo[0][0]).replace(" ", "")
            + "_"
            + str(jobid)
        )
        # NOTE(review): if no software/version/hll combination matches below,
        # result stays 0 and result["OK"] raises TypeError — confirm intent
        result = 0
        if (software == "hadoop") and (version == "hdv1") and (hll == "none"):
            result = cli.getData(source, dest)
        if (software == "hadoop") and (version == "hdv2") and (hll == "none"):
            result = cli.getData(source, dest)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Software DFS:", result)

        result = cli.getdata(dest, dest)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

        outputSandbox = self.get_filepaths(dest)
        resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)
        if not resolvedSandbox["OK"]:
            self.log.warn("Output sandbox file resolution failed:")
            self.log.warn(resolvedSandbox["Message"])
            # NOTE(review): self.__report is not defined in this class — verify
            self.__report("Failed", "Resolving Output Sandbox")
        self.fileList = resolvedSandbox["Value"]["Files"]
        missingFiles = resolvedSandbox["Value"]["Missing"]
        if missingFiles:
            self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

        if self.fileList and jobid:
            self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
            self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)
            result = self.sandboxClient.uploadFilesAsSandboxForJob(
                self.fileList, jobid, "Output", self.sandboxSizeLimit
            )  # 1024*1024*10
            if not result["OK"]:
                self.log.error("Output sandbox upload failed with message", result["Message"])
                # BUGFIX: has_key() removed in Python 3; use the `in` operator
                if "SandboxFileName" in result:
                    outputSandboxData = result["SandboxFileName"]
                    self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                    # BUGFIX: outputData was never defined here (NameError)
                    outputData = []
                    outputData.append(outputSandboxData)
                    self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                    self.jobReport.setJobParameter(
                        "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                    )
                else:
                    self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                    return S_ERROR(
                        "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                    )
            else:
                # Do not overwrite in case of Error
                if not self.failedFlag:
                    self.__report("Completed", "Output Sandbox Uploaded")
                self.log.info("Sandbox uploaded successfully")
        return "OK"

    def __getLFNfromOutputFile(self, outputFile, outputPath=""):
        """Provides a generic convention for VO output data files if no path
        is specified.

        NOTE(review): relies on self.owner, self.userGroup and self.jobID,
        which are not set anywhere in this class — confirm against callers.
        :return: (lfn, localfile) tuple
        """
        if not re.search("^LFN:", outputFile):
            localfile = outputFile
            initial = self.owner[:1]
            vo = getVOForGroup(self.userGroup)
            if not vo:
                vo = "dirac"
            basePath = "/" + vo + "/user/" + initial + "/" + self.owner
            if outputPath:
                # If output path is given, append it to the user path and put output files in this directory
                if outputPath.startswith("/"):
                    outputPath = outputPath[1:]
            else:
                # By default the output path is constructed from the job id
                subdir = str(self.jobID / 1000)
                outputPath = subdir + "/" + str(self.jobID)
            lfn = os.path.join(basePath, outputPath, os.path.basename(localfile))
        else:
            # if LFN is given, take it as it is
            localfile = os.path.basename(outputFile.replace("LFN:", ""))
            lfn = outputFile.replace("LFN:", "")
        return (lfn, localfile)

    def get_filepaths(self, directory):
        """Return the paths of all files found (recursively) under directory."""
        file_paths = []
        for root, directories, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                file_paths.append(filepath)
        return file_paths

    def __resolveOutputSandboxFiles(self, outputSandbox):
        """Checks the output sandbox file list and resolves any specified
        wildcards. Also tars any specified directories.

        :return: S_OK({"Missing": [...], "Files": [...]})
        """
        missing = []
        okFiles = []
        for i in outputSandbox:
            self.log.verbose("Looking at OutputSandbox file/directory/wildcard: %s" % i)
            globList = glob.glob(i)
            for check in globList:
                if os.path.isfile(check):
                    self.log.verbose("Found locally existing OutputSandbox file: %s" % check)
                    okFiles.append(check)
                if os.path.isdir(check):
                    # Directories are shipped as a tarball of the directory
                    self.log.verbose("Found locally existing OutputSandbox directory: %s" % check)
                    cmd = ["tar", "cf", "%s.tar" % check, check]
                    result = systemCall(60, cmd)
                    if not result["OK"]:
                        self.log.error("Failed to create OutputSandbox tar", result["Message"])
                    elif result["Value"][0]:
                        self.log.error("Failed to create OutputSandbox tar", result["Value"][2])
                    if os.path.isfile("%s.tar" % (check)):
                        self.log.verbose("Appending %s.tar to OutputSandbox" % check)
                        okFiles.append("%s.tar" % (check))
                    else:
                        self.log.warn("Could not tar OutputSandbox directory: %s" % check)
                        missing.append(check)

        for i in outputSandbox:
            # Anything not resolved (directly or as a tarball) and not a
            # wildcard pattern is reported as missing
            if not i in okFiles:
                if not "%s.tar" % i in okFiles:
                    if not re.search(r"\*", i):
                        if not i in missing:
                            missing.append(i)

        result = {"Missing": missing, "Files": okFiles}
        return S_OK(result)

    def __getJobName(self, jobName):
        """Return the part of jobName before the first underscore."""
        result = re.split("_", jobName)
        return result[0]

    def __getMonitoringPools(self):
        """Load the endpoint configuration of every configured submit pool."""
        for monitoringPool in self.am_getOption("SubmitPools"):
            self.log.verbose("Monitoring Pools", monitoringPool)
            pathPools = self.am_getModuleParam("section") + "/" + monitoringPool + "/EndPointMonitoring"
            monitorings = gConfig.getValue(pathPools)
            splitted = re.split(",", monitorings)
            for endpoint in splitted:
                self.configureFromSection("/Resources/BigDataEndpoints/", endpoint)
        return "OK"

    def configureFromSection(self, mySection, endPoint):
        """Fill self.monitoringEndPoints[endPoint] from the CS endpoint definition."""
        self.log.debug("Configuring from %s" % mySection)

        monitoringBDEndPointDict = BigDataDB.getRunningEnPointDict(endPoint)
        if not monitoringBDEndPointDict["OK"]:
            self.log.error("Error in RunninggBDEndPointDict: %s" % monitoringBDEndPointDict["Message"])
            return monitoringBDEndPointDict
        self.log.verbose("Trying to configure RunningBDEndPointDict:", monitoringBDEndPointDict)
        monitoringBDEndPointDict = monitoringBDEndPointDict["Value"]
        # NOTE(review): a missing option only logs and continues the option
        # loop; the endpoint is still configured below — confirm intent
        for option in [
            "NameNode",
            "Port",
            "SiteName",
            "BigDataSoftware",
            "BigDataSoftwareVersion",
            "HighLevelLanguage",
            "LimitQueueJobsEndPoint",
            "URL",
            "PublicIP",
        ]:
            if option not in monitoringBDEndPointDict.keys():
                self.log.error('Missing option in "%s" EndPoint definition:' % endPoint, option)
                continue

        self.monitoringEndPoints[endPoint] = {}
        self.monitoringEndPoints[endPoint]["NameNode"] = monitoringBDEndPointDict["NameNode"]
        self.monitoringEndPoints[endPoint]["Port"] = int(monitoringBDEndPointDict["Port"])
        self.monitoringEndPoints[endPoint]["SiteName"] = monitoringBDEndPointDict["SiteName"]
        self.monitoringEndPoints[endPoint]["BigDataSoftware"] = monitoringBDEndPointDict["BigDataSoftware"]
        self.monitoringEndPoints[endPoint]["BigDataSoftwareVersion"] = monitoringBDEndPointDict[
            "BigDataSoftwareVersion"
        ]
        self.monitoringEndPoints[endPoint]["LimitQueueJobsEndPoint"] = int(
            monitoringBDEndPointDict["LimitQueueJobsEndPoint"]
        )
        self.monitoringEndPoints[endPoint]["URL"] = monitoringBDEndPointDict["URL"]
        self.monitoringEndPoints[endPoint]["User"] = monitoringBDEndPointDict["User"]
        self.monitoringEndPoints[endPoint]["PublicIP"] = monitoringBDEndPointDict["PublicIP"]
        self.monitoringEndPoints[endPoint]["IsInteractive"] = monitoringBDEndPointDict["IsInteractive"]
        self.monitoringEndPoints[endPoint]["HighLevelLanguage"] = monitoringBDEndPointDict["HighLevelLanguage"]
class WMSClient( object ):
  """Client to the Workload Management System.

  Wraps the JobManager RPC calls (submit/kill/delete/reschedule/reset) and
  takes care of uploading the job input sandbox before submission.
  """

  def __init__( self, jobManagerClient = None, sbRPCClient = None, sbTransferClient = None,
                useCertificates = False, timeout = 600 ):
    """ WMS Client constructor

        Here we also initialize the needed clients and connections

        :param jobManagerClient: optional pre-built JobManager client
        :param sbRPCClient: optional RPC client for the SandboxStore
        :param sbTransferClient: optional transfer client for the SandboxStore
        :param bool useCertificates: use host certificates for connections
        :param int timeout: RPC timeout in seconds
    """
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.jobManager = jobManagerClient
    self.sandboxClient = None
    # Only build the sandbox client eagerly when both parts are supplied;
    # otherwise it is created lazily in __uploadInputSandbox
    if sbRPCClient and sbTransferClient:
      self.sandboxClient = SandboxStoreClient( rpcClient = sbRPCClient,
                                               transferClient = sbTransferClient,
                                               useCertificates = useCertificates )

###############################################################################

  def __jobManagerRPC( self ):
    """Lazily instantiate and return the JobManager RPC client (shared by all calls)."""
    if not self.jobManager:
      self.jobManager = RPCClient( 'WorkloadManagement/JobManager',
                                   useCertificates = self.useCertificates,
                                   timeout = self.timeout )
    return self.jobManager

  def __getInputSandboxEntries( self, classAdJob ):
    """Extract the InputSandbox JDL attribute as a plain list of entries."""
    if classAdJob.lookupAttribute( "InputSandbox" ):
      inputSandbox = classAdJob.get_expression( "InputSandbox" )
      inputSandbox = inputSandbox.replace( '","', "\n" )
      inputSandbox = inputSandbox.replace( '{', "" )
      inputSandbox = inputSandbox.replace( '}', "" )
      inputSandbox = inputSandbox.replace( '"', "" )
      inputSandbox = inputSandbox.replace( ',', "" )
      inputSandbox = inputSandbox.split()
    else:
      inputSandbox = []
    return inputSandbox

  def __uploadInputSandbox( self, classAdJob, jobDescriptionObject = None ):
    """Checks the validity of the job Input Sandbox.

       The function returns the list of Input Sandbox files.
       The total volume of the input sandbox is evaluated
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )

    realFiles = []
    badFiles = []
    diskFiles = []

    for isFile in inputSandbox:
      # LFNs, already-stored sandboxes (SB:) and parametric placeholders
      # do not need uploading
      if not isFile.startswith( ( 'lfn:', 'LFN:', 'SB:', '%s', '%(' ) ):
        realFiles.append( isFile )

    stringIOFiles = []
    stringIOFilesSize = 0
    if jobDescriptionObject is not None:
      if isinstance( jobDescriptionObject, StringIO.StringIO ):
        stringIOFiles = [jobDescriptionObject]
        stringIOFilesSize = len( jobDescriptionObject.buf )
        gLogger.debug( "Size of the stringIOFiles: " + str( stringIOFilesSize ) )
      else:
        return S_ERROR( "jobDescriptionObject is not a StringIO object" )

    # Check real files
    for isFile in realFiles:
      if not os.path.exists( isFile ):  # we are passing in real files, we expect them to be on disk
        badFiles.append( isFile )
        gLogger.warn( "inputSandbox file/directory " + isFile + " not found. Keep looking for the others" )
        continue
      diskFiles.append( isFile )

    diskFilesSize = File.getGlobbedTotalSize( diskFiles )
    gLogger.debug( "Size of the diskFiles: " + str( diskFilesSize ) )
    totalSize = diskFilesSize + stringIOFilesSize
    gLogger.verbose( "Total size of the inputSandbox: " + str( totalSize ) )

    okFiles = stringIOFiles + diskFiles
    if badFiles:
      result = S_ERROR( 'Input Sandbox is not valid' )
      result['BadFile'] = badFiles
      result['TotalSize'] = totalSize
      return result

    if okFiles:
      if not self.sandboxClient:
        self.sandboxClient = SandboxStoreClient( useCertificates = self.useCertificates )
      result = self.sandboxClient.uploadFilesAsSandbox( okFiles )
      if not result[ 'OK' ]:
        return result
      inputSandbox.append( result[ 'Value' ] )
      classAdJob.insertAttributeVectorString( "InputSandbox", inputSandbox )

    return S_OK()

  def submitJob( self, jdl, jobDescriptionObject = None ):
    """ Submit one job specified by its JDL to WMS """
    if os.path.exists( jdl ):
      # BUGFIX: use a context manager so the file is closed even if read() fails
      with open( jdl, "r" ) as fic:
        jdlString = fic.read()
    else:
      # If file JDL does not exist, assume that the JDL is passed as a string
      jdlString = jdl

    jdlString = jdlString.strip()

    # Strip of comments in the jdl string
    newJdlList = []
    for line in jdlString.split( '\n' ):
      if not line.strip().startswith( '#' ):
        newJdlList.append( line )
    jdlString = '\n'.join( newJdlList )

    # Check the validity of the input JDL
    if jdlString.find( "[" ) != 0:
      jdlString = "[%s]" % jdlString
    classAdJob = ClassAd( jdlString )
    if not classAdJob.isOK():
      return S_ERROR( 'Invalid job JDL' )

    # Check the size and the contents of the input sandbox
    result = self.__uploadInputSandbox( classAdJob, jobDescriptionObject )
    if not result['OK']:
      return result

    # Submit the job now and get the new job ID
    result = self.__jobManagerRPC().submitJob( classAdJob.asJDL() )
    if 'requireProxyUpload' in result and result['requireProxyUpload']:
      gLogger.warn( "Need to upload the proxy" )
    return result

  def killJob( self, jobID ):
    """ Kill running job.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    return self.__jobManagerRPC().killJob( jobID )

  def deleteJob( self, jobID ):
    """ Delete job(s) from the WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    return self.__jobManagerRPC().deleteJob( jobID )

  def rescheduleJob( self, jobID ):
    """ Reschedule job(s) in WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    return self.__jobManagerRPC().rescheduleJob( jobID )

  def resetJob( self, jobID ):
    """ Reset job(s) in WMS Job database.
        jobID can be an integer representing a single DIRAC job ID or a list of IDs
    """
    return self.__jobManagerRPC().resetJob( jobID )
class WMSClient:
  """Legacy Workload Management System client.

  Builds the SandboxStore client eagerly in the constructor and offers job
  submission with input-sandbox upload and sandbox-to-job assignment.
  """

  def __init__( self, jobManagerClient = False, sbRPCClient = False, sbTransferClient = False,
                useCertificates = False, timeout = 120 ):
    """ WMS Client constructor """
    self.jobManagerClient = jobManagerClient
    self.useCertificates = useCertificates
    self.timeout = timeout
    self.sandboxClient = SandboxStoreClient( useCertificates = useCertificates,
                                             rpcClient = sbRPCClient,
                                             transferClient = sbTransferClient )

###############################################################################

  def __getInputSandboxEntries( self, classAdJob ):
    """Extract the InputSandbox JDL attribute as a plain list of entries."""
    if classAdJob.lookupAttribute( "InputSandbox" ):
      inputSandbox = classAdJob.get_expression( "InputSandbox" )
      inputSandbox = inputSandbox.replace( '","', "\n" )
      inputSandbox = inputSandbox.replace( '{', "" )
      inputSandbox = inputSandbox.replace( '}', "" )
      inputSandbox = inputSandbox.replace( '"', "" )
      inputSandbox = inputSandbox.replace( ',', "" )
      inputSandbox = inputSandbox.split()
    else:
      inputSandbox = []
    return inputSandbox

  # This are the NEW methods

  def __uploadInputSandbox( self, classAdJob ):
    """Checks the validity of the job Input Sandbox.

       The function returns the list of Input Sandbox files.
       The total volume of the input sandbox is evaluated
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )

    realFiles = []
    badFiles = []
    okFiles = []

    for isFile in inputSandbox:
      # In case of parametric input sandbox, there is %s passed, so have to
      # ignore it also; LFNs and stored sandboxes (SB:) need no upload
      valid = True
      for tag in ( 'lfn:', 'LFN:', 'SB:', '%s' ):
        if isFile.find( tag ) == 0:
          valid = False
          break
      if valid:
        realFiles.append( isFile )

    # If there are no files, skip!
    if not realFiles:
      return S_OK()

    # Check real files
    for isFile in realFiles:
      if not os.path.exists( isFile ):
        badFiles.append( isFile )
        # BUGFIX: print statement converted to a function call (Python 3)
        print( "inputSandbox file/directory " + isFile + " not found" )
        continue
      okFiles.append( isFile )

    totalSize = File.getGlobbedTotalSize( okFiles )
    if badFiles:
      result = S_ERROR( 'Input Sandbox is not valid' )
      result['BadFile'] = badFiles
      result['TotalSize'] = totalSize
      return result

    if okFiles:
      result = self.sandboxClient.uploadFilesAsSandbox( okFiles )
      if not result[ 'OK' ]:
        return result
      inputSandbox.append( result[ 'Value' ] )
      classAdJob.insertAttributeVectorString( "InputSandbox", inputSandbox )

    return S_OK()

  def __assignSandboxesToJob( self, jobID, classAdJob ):
    """Assign the already-stored (SB:) input sandboxes found in the JDL to the job."""
    sandboxClient = SandboxStoreClient()
    inputSandboxes = self.__getInputSandboxEntries( classAdJob )
    sbToAssign = []
    for isb in inputSandboxes:
      if isb.find( "SB:" ) == 0:
        sbToAssign.append( isb )
    if sbToAssign:
      assignList = [ ( isb, 'Input' ) for isb in sbToAssign ]
      result = sandboxClient.assignSandboxesToJob( jobID, assignList )
      if not result[ 'OK' ]:
        return result
    return S_OK()

  def submitJob( self, jdl ):
    """ Submit one job specified by its JDL to WMS """
    if not self.jobManagerClient:
      jobManager = RPCClient( 'WorkloadManagement/JobManager',
                              useCertificates = self.useCertificates,
                              timeout = self.timeout )
    else:
      jobManager = self.jobManagerClient

    if os.path.exists( jdl ):
      # BUGFIX: use a context manager so the file is closed even if read() fails
      with open( jdl, "r" ) as fic:
        jdlString = fic.read()
    else:
      # If file JDL does not exist, assume that the JDL is
      # passed as a string
      jdlString = jdl

    # Check the validity of the input JDL
    jdlString = jdlString.strip()
    if jdlString.find( "[" ) != 0:
      jdlString = "[%s]" % jdlString
    classAdJob = ClassAd( jdlString )
    if not classAdJob.isOK():
      return S_ERROR( 'Invalid job JDL' )

    # Check the size and the contents of the input sandbox
    result = self.__uploadInputSandbox( classAdJob )
    if not result['OK']:
      return result

    # Submit the job now and get the new job ID
    result = jobManager.submitJob( classAdJob.asJDL() )
    if not result['OK']:
      return result
    jobID = result['Value']
    if 'requireProxyUpload' in result and result[ 'requireProxyUpload' ]:
      # TODO: We should notify the user to upload a proxy with proxy-upload
      pass

    return S_OK( jobID )

  # This is the OLD method

  def __checkInputSandbox( self, classAdJob ):
    """Checks the validity of the job Input Sandbox.

       The function returns the list of Input Sandbox files.
       The total volume of the input sandbox is evaluated
    """
    inputSandbox = self.__getInputSandboxEntries( classAdJob )
    if inputSandbox:
      ok = 1
      # Check the Input Sandbox files
      totalSize = 0
      for isFile in inputSandbox:
        if isFile.find( 'lfn:' ) != 0 and isFile.find( 'LFN:' ) != 0 and isFile.find( "SB:" ) != 0:
          if not os.path.exists( isFile ):
            badfile = isFile
            print( "inputSandbox file/directory " + isFile + " not found" )
            ok = 0
          else:
            if os.path.isdir( isFile ):
              # Size of a directory is obtained with `du`
              comm = 'du -b -s ' + isFile
              status, out = commands.getstatusoutput( comm )
              try:
                dirSize = int( out.split()[0] )
              except Exception as x:
                # BUGFIX: py2 "except Exception, x" syntax modernized
                print( "Input Sandbox directory name", isFile, "is not valid !" )
                print( str( x ) )
                badfile = isFile
                ok = 0
              totalSize = totalSize + dirSize
            else:
              totalSize = int( os.stat( isFile )[6] ) + totalSize

      if not ok:
        result = S_ERROR( 'Input Sandbox is not valid' )
        result['BadFile'] = badfile
        result['TotalSize'] = totalSize
        return result

      result = S_OK()
      result['InputSandbox'] = inputSandbox
      result['TotalSize'] = totalSize
      return result
class InteractiveJobMonitorThread ( threading.Thread ):
  """Background thread that polls a remote (BigData/Hadoop-style) host over
     SSH for the state of an interactive job, updates its status in
     BigDataDB, retrieves the output sandbox when the job finishes, and
     commits an accounting record.
  """

  def __init__( self, user , publicIP, looptime , parentthread, output, getinfo ):
    """:param user: SSH user name on the remote host
       :param publicIP: remote host address
       :param looptime: polling period in seconds
       :param parentthread: thread whose liveness gates the polling loop
       :param output: remote path of the job output file (its basename encodes the jobID)
       :param getinfo: remote path of the status-query script (invoked with -c stepN)
    """
    threading.Thread.__init__( self )
    self.sshConnect = ConnectionUtils( user , publicIP )
    self.looptime = looptime
    self.parentthread = parentthread
    self.output = output
    self.getinfo = getinfo
    # NOTE(review): the string below is a no-op expression statement, not a
    # comment; kept byte-identical for compatibility.
    """
    #SandBox Settings
    """
    self.sandboxClient = SandboxStoreClient()
    self.failedFlag = True
    self.sandboxSizeLimit = 1024 * 1024 * 10   # 10 MiB upload limit

  def run( self ):
    """Thread entry point: create the sub-logger and start the polling loop."""
    self.log = gLogger.getSubLogger( "InteractiveJobMonitorThread" )
    self.monitoring( self.looptime, self.parentthread, self.output )

  def monitoring( self, loop, parentthread, output ):
    """Poll the remote host every `loop` seconds while `parentthread` is alive.

       step1 discovers the number of jobs (-> status "Running"); step2/step3
       track started/finished jobs; once all jobs finished the status is set
       to "Done", the sandbox is fetched and an accounting record committed.
    """
    self.initialTiming = os.times()
    accountingReport = AccountingJob()
    accountingReport.setStartTime()
    numberJobsFlag = True
    numberJobs = 0
    numberStartedJobsDict = {}
    numberEndingJobsDict = {}
    job_pattern = re.compile( 'Job =.*?,' )
    job_pattern_2 = re.compile( 'Job =.*?\n' )
    # The DIRAC jobID is the leading "_"-separated token of the output file's basename.
    jobid = int( re.split( "_", re.split( "/", output )[int( len( re.split( "/", output ) ) - 1 )] )[0] )
    cmd = '/bin/chmod 555 ' + self.getinfo
    returned = self.commandLaunch( cmd )
    while parentthread.isAlive():
      time.sleep( loop )
      if numberJobsFlag:
        # step1: ask how many jobs are running; parse "<key> = <N>" from stdout.
        cmd = self.getinfo + ' -c step1'
        returned = self.commandLaunch( cmd )
        self.log.info( 'InteractiveJobMonitorThread:step1:numJobs:', returned )
        if returned != None:
          if ( returned['Value'][1] != "" ):
            if re.split( "=", returned['Value'][1] )[1].strip().isdigit():
              numberJobs = int( re.split( "=", returned['Value'][1] )[1] )
            if ( numberJobs != 0 ):
              numberJobsFlag = False
              BigDataDB.setJobStatus( jobid, "Running" )
      else:
        # step2: list started jobs (logged only).
        cmd = self.getinfo + ' -c step2'
        returned = self.commandLaunch( cmd )
        self.log.info( 'InteractiveJobMonitorThread:step2:startedJobs:', returned )
        # NOTE(review): compared against "" here but against None in step1 —
        # commandLaunch presumably returns a dict or None; verify.
        if returned != "":
          if ( returned['Value'][1] != "" ):
            startedJobs = job_pattern.findall( returned['Value'][1] )
            self.log.info( 'step2:startedJobs:', startedJobs )
        # step3: list finished jobs; when all are done, finalize the job.
        cmd = self.getinfo + ' -c step3'
        returned = self.commandLaunch( cmd )
        self.log.info( 'InteractiveJobMonitorThread:step3:endedJobs:', returned )
        if returned != "":
          if ( returned['Value'][1] != "" ):
            finishedJobs = job_pattern_2.findall( returned['Value'][1] )
            self.log.info( 'step3:finishedJobs:', finishedJobs )
            if ( len( finishedJobs ) == numberJobs ):
              BigDataDB.setJobStatus( jobid, "Done" )
              BigDataDB.setHadoopID( jobid, finishedJobs )
              self.__updateSandBox( jobid, output )
              #Update Accounting
              # CPU usage = element-wise difference of os.times() snapshots.
              EXECUTION_RESULT = {}
              EXECUTION_RESULT['CPU'] = []
              finalStat = os.times()
              for i in range( len( finalStat ) ):
                EXECUTION_RESULT['CPU'].append( finalStat[i] - self.initialTiming[i] )
              utime, stime, cutime, cstime, elapsed = EXECUTION_RESULT['CPU']
              cpuTime = utime + stime + cutime + cstime
              execTime = elapsed
              result = jobDB.getJobAttributes( jobid )
              getting = result['Value']
              acData = {
                  'User' : getting['Owner'],
                  'UserGroup' : getting['OwnerGroup'],
                  'JobGroup' : 'cesga',
                  'JobType' : 'User',
                  'JobClass' : 'unknown',
                  'ProcessingType' : 'unknown',
                  'FinalMajorStatus' : getting['Status'],
                  'FinalMinorStatus' : getting['MinorStatus'],
                  'CPUTime' : cpuTime,
                  'Site' : getting['Site'],
                  # Based on the factor to convert raw CPU to Normalized units (based on the CPU Model)
                  'NormCPUTime' : 0,
                  'ExecTime' : cpuTime,
                  'InputDataSize' : 0,
                  'OutputDataSize' : 0,
                  'InputDataFiles' : 0,
                  'OutputDataFiles' : 0,
                  'DiskSpace' : 0,
                  'InputSandBoxSize' : 0,
                  'OutputSandBoxSize' : 0,
                  'ProcessedEvents' : 0
                  }
              accountingReport.setEndTime()
              accountingReport.setValuesFromDict( acData )
              result = accountingReport.commit()

  def commandLaunch( self, cmd ):
    """Run `cmd` on the remote host via SSH with a 100 s timeout."""
    return self.sshConnect.sshCall( 100, cmd )

  def __updateSandBox( self, jobid, output ):
    """Fetch the job output file from the remote host via scp, resolve it as
       an output sandbox and upload it to the SandboxStore. Returns "OK".
    """
    jobInfo = BigDataDB.getJobIDInfo( jobid )
    result = self.sshConnect.scpCall( 100, output, output, False )
    if not result['OK']:
      self.log.error( 'Error to get the data from BigData Software DFS:', result )
    file_paths = []
    file_paths.append( output )
    outputSandbox = file_paths
    resolvedSandbox = self.__resolveOutputSandboxFiles( outputSandbox )
    if not resolvedSandbox['OK']:
      self.log.warn( 'Output sandbox file resolution failed:' )
      self.log.warn( resolvedSandbox['Message'] )
      self.__report( 'Failed', 'Resolving Output Sandbox' )
    # NOTE(review): resolvedSandbox['Value'] is read even when the resolution
    # failed above — an S_ERROR has no 'Value' key; this would raise KeyError.
    fileList = resolvedSandbox['Value']['Files']
    missingFiles = resolvedSandbox['Value']['Missing']
    if missingFiles:
      # NOTE(review): self.jobReport is never assigned in this class — TODO confirm
      # it is injected elsewhere before this runs.
      self.jobReport.setJobParameter( 'OutputSandboxMissingFiles',
                                      ', '.join( missingFiles ), sendFlag = False )
    if fileList and jobid:
      self.outputSandboxSize = getGlobbedTotalSize( fileList )
      self.log.info( 'Attempting to upload Sandbox with limit:', self.sandboxSizeLimit )
      result = self.sandboxClient.uploadFilesAsSandboxForJob( fileList, jobid,
                                                              'Output', self.sandboxSizeLimit ) # 1024*1024*10
      if not result['OK']:
        self.log.error( 'Output sandbox upload failed with message', result['Message'] )
        if result.has_key( 'SandboxFileName' ):
          outputSandboxData = result['SandboxFileName']
          self.log.info( 'Attempting to upload %s as output data' % ( outputSandboxData ) )
          # NOTE(review): outputData is not defined anywhere in this method or
          # class — this branch raises NameError if reached.
          outputData.append( outputSandboxData )
          self.jobReport.setJobParameter( 'OutputSandbox',
                                          'Sandbox uploaded to grid storage', sendFlag = False )
          self.jobReport.setJobParameter( 'OutputSandboxLFN',
                                          self.__getLFNfromOutputFile( outputSandboxData )[0], sendFlag = False )
        else:
          self.log.info( 'Could not get SandboxFileName to attempt upload to Grid storage' )
          return S_ERROR( 'Output sandbox upload failed and no file name supplied for failover to Grid storage' )
      else:
        # Do not overwrite in case of Error
        if not self.failedFlag:
          self.__report( 'Completed', 'Output Sandbox Uploaded' )
        self.log.info( 'Sandbox uploaded successfully' )
    return "OK"

  def __resolveOutputSandboxFiles( self, outputSandbox ):
    """Checks the output sandbox file list and resolves any specified wildcards.
       Also tars any specified directories.

       :return: S_OK({'Files': resolved paths, 'Missing': unresolved entries})
    """
    missing = []
    okFiles = []
    for i in outputSandbox:
      self.log.verbose( 'Looking at OutputSandbox file/directory/wildcard: %s' % i )
      globList = glob.glob( i )
      for check in globList:
        if os.path.isfile( check ):
          self.log.verbose( 'Found locally existing OutputSandbox file: %s' % check )
          okFiles.append( check )
        if os.path.isdir( check ):
          # Directories are tarred in place and the tarball shipped instead.
          self.log.verbose( 'Found locally existing OutputSandbox directory: %s' % check )
          cmd = ['tar', 'cf', '%s.tar' % check, check]
          result = systemCall( 60, cmd )
          if not result['OK']:
            self.log.error( 'Failed to create OutputSandbox tar', result['Message'] )
          elif result['Value'][0]:
            self.log.error( 'Failed to create OutputSandbox tar', result['Value'][2] )
          if os.path.isfile( '%s.tar' % ( check ) ):
            self.log.verbose( 'Appending %s.tar to OutputSandbox' % check )
            okFiles.append( '%s.tar' % ( check ) )
          else:
            self.log.warn( 'Could not tar OutputSandbox directory: %s' % check )
            missing.append( check )
    # Anything requested but not resolved (and not a wildcard) is reported missing.
    for i in outputSandbox:
      if not i in okFiles:
        if not '%s.tar' % i in okFiles:
          if not re.search( '\*', i ):
            if not i in missing:
              missing.append( i )
    result = {'Missing':missing, 'Files':okFiles}
    return S_OK( result )
def initializeOptimizer(cls): """Initialize specific parameters for JobSanityAgent. """ cls.sandboxClient = SandboxStoreClient(useCertificates=True, smdb=True) return S_OK()
class JobSanityAgent( OptimizerModule ):
  """
      The specific Optimizer must provide the following methods:
      - checkJob() - the main method called for each job
      and it can provide:
      - initializeOptimizer() before each execution cycle
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for JobSanityAgent.

       Reads the enable/disable flags of the individual sanity checks from
       the agent options and creates the SandboxStore client.
    """
    #Test control flags N.B. JDL check is mandatory
    self.inputDataCheck = self.am_getOption( 'InputDataCheck', 1 )
    self.outputDataCheck = self.am_getOption( 'OutputDataCheck', 0 )
    self.inputSandboxCheck = self.am_getOption( 'InputSandboxCheck', 1 )
    self.platformCheck = self.am_getOption( 'PlatformCheck', 0 )
    #Other parameters
    # NOTE(review): 'lhcb' is passed as the default VO — confirm this is the
    # intended fallback for non-LHCb installations.
    self.voName = getVO( 'lhcb' )
    self.successStatus = self.am_getOption( 'SuccessfulJobStatus', 'OutputReady' )
    self.maxDataPerJob = self.am_getOption( 'MaxInputDataPerJob', 100 )
    #Sandbox
    self.sandboxClient = SandboxStoreClient( useCertificates = True )
    self.log.debug( 'JDL Check ==> Enabled' )
    if self.inputDataCheck:
      self.log.debug( 'Input Data Check ==> Enabled' )
    else:
      self.log.debug( 'Input Data Check ==> Disabled' )
    if self.outputDataCheck:
      self.log.debug( 'Output Data Check ==> Enabled' )
    else:
      self.log.debug( 'Output Data Check ==> Disabled' )
    if self.inputSandboxCheck:
      self.log.debug( 'Input Sbox Check ==> Enabled' )
    else:
      self.log.debug( 'Input Sbox Check ==> Disabled' )
    if self.platformCheck:
      self.log.debug( 'Platform Check ==> Enabled' )
    else:
      self.log.debug( 'Platform Check ==> Disabled' )
    return S_OK()

  #############################################################################
  def checkJob( self, job, classAdJob ):
    """ This method controls the order and presence of
        each sanity check for submitted jobs. This should
        be easily extended in the future to accommodate
        any other potential checks.
    """
    #Job JDL check
    message = "Job: %s JDL: OK," % job
    self.log.debug( "Checking Loop Starts for job %s" % job )
    jobType = self.jobDB.getJobAttribute( job, 'JobType' )
    if not jobType['OK']:
      return S_ERROR( 'Could not determine job type' )
    jobType = jobType['Value']
    #Input data check
    if self.inputDataCheck:
      inputData = self.checkInputData( job, jobType )
      if inputData['OK']:
        number = inputData['Value']
        message += 'InputData: ' + number + ', '
      else:
        # Checks below return S_ERROR with the minor status in 'Value'.
        minorStatus = inputData['Value']
        self.log.info( message )
        self.log.info( 'Job: ' + str( job ) + ' Failed input data check.' )
        return S_ERROR( minorStatus )
    #Platform check # disabled
    if self.platformCheck:
      platform = self.checkPlatformSupported( job, classAdJob )
      if platform['OK']:
        arch = platform['Value']
        message += 'Platform: ' + arch + ' OK, '
      else:
        res = 'No supported platform for job ' + str( job ) + '.'
        minorStatus = platform['Value']
        self.log.info( message )
        self.log.info( res )
        # NOTE(review): returns the accumulated message, not minorStatus —
        # inconsistent with the other failure branches; confirm intent.
        return S_ERROR( message )
    #Output data exists check
    if self.outputDataCheck: # disabled
      if jobType != 'user':
        outputData = self.checkOutputDataExists( job, classAdJob )
        if outputData['OK']:
          if outputData.has_key( 'SUCCESS' ):
            success = self.successStatus
            minorStatus = outputData['SUCCESS']
            report = outputData['Value']
            message += report
            self.log.info( message )
            self.setJobParam( job, 'JobSanityCheck', message )
            self.updateJobStatus( job, success, minorStatus )
            # FIXME: this can not be a S_OK(), Job has to be aborted if OutPut data is present
            return S_OK( 'Found successful job' )
          else:
            flag = outputData['Value']
            message += 'Output Data: ' + flag + ', '
        else:
          res = 'Job: ' + str( job ) + ' Failed since output data exists.'
          minorStatus = outputData['Value']
          self.log.info( message )
          self.log.info( res )
          return S_ERROR( message )
    #Input Sandbox uploaded check
    if self.inputSandboxCheck: # disabled
      inputSandbox = self.checkInputSandbox( job, classAdJob )
      if inputSandbox['OK']:
        sbChecked = inputSandbox['Value']
        message += ' Input Sandboxes: %s, OK.' % sbChecked
      else:
        res = 'Job: %s failed due some missing sandboxes' % job
        minorStatus = inputSandbox['Message']
        self.log.info( message )
        self.log.info( res )
        return S_ERROR( minorStatus )
    self.log.info( message )
    self.setJobParam( job, 'JobSanityCheck', message )
    return self.setNextOptimizer( job )

  #############################################################################
  def checkInputData( self, job, jobType ):
    """This method checks both the amount of input
       datasets for the job and whether the LFN conventions
       are correct.
    """
    voName = self.voName
    maxData = int( self.maxDataPerJob )
    totalData = 0
    slashFlag = 0
    incorrectDataFlag = 0
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.warn( result['Message'] )
      # Minor status is carried back in the 'Value' key of the S_ERROR.
      result = S_ERROR()
      result['Value'] = 'Input Data Specification'
      return result
    if not result['Value']:
      return S_OK( 'No input LFNs' )
    data = result['Value']
    # seems to be [''] when null, which isn't an empty list ;)
    ok = False
    for i in data:
      if i:
        ok = True
    if not ok:
      self.log.debug( 'Job %s has no input data requirement' % ( job ) )
      return S_OK( 'No input LFNs' )
    self.log.debug( 'Job %s has an input data requirement and will be checked' % ( job ) )
    data = result['Value']
    repData = '\n'
    for i in data:
      repData += i + '\n'
    self.log.debug( 'Data is: %s' % ( repData ) )
    totalData = len( data )
    if totalData:
      for i in data:
        # LFNs must start with /<vo>/ and must not contain double slashes.
        j = i.replace( 'LFN:', '' )
        if not re.search( '^/' + voName + '/', j ):
          incorrectDataFlag += 1
        if re.search( '//', j ):
          slashFlag += 1
    if incorrectDataFlag:
      result = S_ERROR()
      result['Value'] = "Input data not correctly specified"
      return result
    if slashFlag:
      result = S_ERROR()
      result['Value'] = "Input data contains //"
      return result
    #only check limit for user jobs
    if jobType.lower() == 'user' and totalData > maxData:
      message = '%s datasets selected. Max limit is %s.' % ( totalData, maxData )
      self.setJobParam( job, 'DatasetCheck', message )
      result = S_ERROR()
      result['Value'] = "Exceeded Maximum Dataset Limit (%s)" % ( maxData )
      return result
    number = str( totalData )
    result = S_OK()
    result['Value'] = number + ' LFNs OK'
    return result

  #############################################################################
  def checkOutputDataExists( self, job, classAdJob ):
    """If the job output data is already in the LFC, this
       method will fail the job for the attention of the
       data manager. To be tidied for DIRAC3...
    """
    # FIXME: To implement checkOutputDataExists
    return S_OK()

  #############################################################################
  def checkPlatformSupported( self, job, classAdJob ):
    """This method queries the CS for available platforms
       supported by DIRAC and will check these against what
       the job requests.
    """
    # FIXME: To implement checkPlatformSupported
    return S_OK()

  #############################################################################
  def checkInputSandbox( self, job, classAdJob ):
    """The number of input sandbox files, as specified in the job
       JDL are checked in the JobDB.

       :return: S_OK(number of assigned sandboxes) or S_ERROR
    """
    ownerName = classAdJob.getAttributeString( "Owner" )
    if not ownerName:
      # Fall back to resolving the user name from the certificate DN.
      ownerDN = classAdJob.getAttributeString( "OwnerDN" )
      ownerName = CS.getUsernameForDN( ownerDN )
    ownerGroup = classAdJob.getAttributeString( "OwnerGroup" )
    jobSetup = classAdJob.getAttributeString( "DIRACSetup" )
    isbList = classAdJob.getListFromExpression( 'InputSandbox' )
    sbsToAssign = []
    for isb in isbList:
      if isb.find( "SB:" ) == 0:
        self.log.info( "Found a sandbox", isb )
        sbsToAssign.append( ( isb, "Input" ) )
    numSBsToAssign = len( sbsToAssign )
    if not numSBsToAssign:
      return S_OK( 0 )
    self.log.info( "Assigning %s sandboxes on behalf of %s@%s" % ( numSBsToAssign, ownerName, ownerGroup ) )
    result = self.sandboxClient.assignSandboxesToJob( job, sbsToAssign, ownerName, ownerGroup, jobSetup )
    if not result[ 'OK' ]:
      self.log.error( "Could not assign sandboxes in the SandboxStore", "assigned to job %s" % job )
      return S_ERROR( "Cannot assign sandbox to job" )
    assigned = result[ 'Value' ]
    if assigned != numSBsToAssign:
      # Partial assignment is only logged, not treated as an error.
      self.log.error( "Could not assign all sandboxes (%s). Only assigned %s" % ( numSBsToAssign, assigned ) )
    return S_OK( numSBsToAssign )
class WMSClient:
    """Legacy (Python 2) WMS client: submits jobs described by JDL to the
       WorkloadManagement/JobManager service and handles their input sandboxes.
    """

    def __init__(self, jobManagerClient=False, sbRPCClient=False,
                 sbTransferClient=False, useCertificates=False, timeout=120):
        """ WMS Client constructor

            :param jobManagerClient: optional pre-built JobManager client;
                falsy means an RPCClient is created on submit
            :param sbRPCClient: optional RPC client for the SandboxStore
            :param sbTransferClient: optional transfer client for the SandboxStore
            :param useCertificates: authenticate with host certificates
            :param timeout: RPC timeout in seconds
        """
        self.jobManagerClient = jobManagerClient
        self.useCertificates = useCertificates
        self.timeout = timeout
        self.sandboxClient = SandboxStoreClient(
            useCertificates=useCertificates,
            rpcClient=sbRPCClient,
            transferClient=sbTransferClient)

    ###############################################################################

    def __getInputSandboxEntries(self, classAdJob):
        """Extract the InputSandbox JDL attribute as a flat list of strings.

           The raw ClassAd expression is cleaned of braces, quotes and commas
           before splitting on whitespace.
        """
        if classAdJob.lookupAttribute("InputSandbox"):
            inputSandbox = classAdJob.get_expression("InputSandbox")
            inputSandbox = inputSandbox.replace('","', "\n")
            inputSandbox = inputSandbox.replace('{', "")
            inputSandbox = inputSandbox.replace('}', "")
            inputSandbox = inputSandbox.replace('"', "")
            inputSandbox = inputSandbox.replace(',', "")
            inputSandbox = inputSandbox.split()
        else:
            inputSandbox = []
        return inputSandbox

    # This are the NEW methods

    def __uploadInputSandbox(self, classAdJob):
        """Checks the validity of the job Input Sandbox.
           The function returns the list of Input Sandbox files.
           The total volume of the input sandbox is evaluated
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)
        realFiles = []
        badFiles = []
        okFiles = []
        realFiles = []
        # Keep only entries that are real local files (not LFN/SB references).
        for file in inputSandbox:
            valid = True
            for tag in ('lfn:', 'LFN:', 'SB:', '%s'):
                # in case of parametric input sandbox, there is %s passed, so have to ignore it also
                if file.find(tag) == 0:
                    valid = False
                    break
            if valid:
                realFiles.append(file)
        # If there are no files, skip!
        if not realFiles:
            return S_OK()
        # Check real files
        for file in realFiles:
            if not os.path.exists(file):
                badFiles.append(file)
                print "inputSandbox file/directory " + file + " not found"
                continue
            okFiles.append(file)
        # print "Total size of the inputSandbox: "+str(totalSize)
        totalSize = File.getGlobbedTotalSize(okFiles)
        if badFiles:
            result = S_ERROR('Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result
        if okFiles:
            # Upload and record the returned sandbox reference in the JDL.
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)
        return S_OK()

    def __assignSandboxesToJob(self, jobID, classAdJob):
        """Assign every "SB:"-referenced input sandbox of the JDL to jobID."""
        sandboxClient = SandboxStoreClient()
        inputSandboxes = self.__getInputSandboxEntries(classAdJob)
        sbToAssign = []
        for isb in inputSandboxes:
            if isb.find("SB:") == 0:
                sbToAssign.append(isb)
        if sbToAssign:
            assignList = [(isb, 'Input') for isb in sbToAssign]
            result = sandboxClient.assignSandboxesToJob(jobID, assignList)
            if not result['OK']:
                return result
        return S_OK()

    def submitJob(self, jdl):
        """ Submit one job specified by its JDL to WMS

            :param jdl: path to a JDL file, or the JDL text itself
            :return: S_OK(jobID) or S_ERROR
        """
        if not self.jobManagerClient:
            jobManager = RPCClient('WorkloadManagement/JobManager',
                                   useCertificates=self.useCertificates,
                                   timeout=self.timeout)
        else:
            jobManager = self.jobManagerClient
        if os.path.exists(jdl):
            fic = open(jdl, "r")
            jdlString = fic.read()
            fic.close()
        else:
            # If file JDL does not exist, assume that the JDL is
            # passed as a string
            jdlString = jdl
        # Check the validity of the input JDL
        jdlString = jdlString.strip()
        if jdlString.find("[") != 0:
            # ClassAd expects the expression wrapped in brackets.
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR('Invalid job JDL')
        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob)
        if not result['OK']:
            return result
        # Submit the job now and get the new job ID
        result = jobManager.submitJob(classAdJob.asJDL())
        if not result['OK']:
            return result
        jobID = result['Value']
        if 'requireProxyUpload' in result and result['requireProxyUpload']:
            # TODO: We should notify the user to upload a proxy with proxy-upload
            pass
        # print "Sandbox uploading"
        return S_OK(jobID)

    # This is the OLD method

    def __checkInputSandbox(self, classAdJob):
        """Checks the validity of the job Input Sandbox.
           The function returns the list of Input Sandbox files.
           The total volume of the input sandbox is evaluated
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)
        if inputSandbox:
            ok = 1
            # print inputSandbox
            # Check the Input Sandbox files
            totalSize = 0
            for file in inputSandbox:
                # Only local disk entries are sized; remote references skipped.
                if file.find('lfn:') != 0 and file.find(
                        'LFN:') != 0 and file.find("SB:") != 0:
                    if not os.path.exists(file):
                        badfile = file
                        print "inputSandbox file/directory " + file + " not found"
                        ok = 0
                    else:
                        if os.path.isdir(file):
                            # Directory size via external 'du -b -s'.
                            comm = 'du -b -s ' + file
                            status, out = commands.getstatusoutput(comm)
                            try:
                                dirSize = int(out.split()[0])
                            except Exception, x:
                                print "Input Sandbox directory name", file, "is not valid !"
                                print str(x)
                                badfile = file
                                ok = 0
                            # NOTE(review): if the first 'du' output is
                            # unparsable, dirSize is unbound here (NameError).
                            totalSize = totalSize + dirSize
                        else:
                            totalSize = int(os.stat(file)[6]) + totalSize
            # print "Total size of the inputSandbox: "+str(totalSize)
            if not ok:
                result = S_ERROR('Input Sandbox is not valid')
                result['BadFile'] = file
                result['TotalSize'] = totalSize
                return result
        result = S_OK()
        result['InputSandbox'] = inputSandbox
        # NOTE(review): totalSize is unassigned when inputSandbox is empty —
        # the next line raises NameError in that case.
        result['TotalSize'] = totalSize
        return result
class WMSClient(object):
    """WMS client: submits, kills, deletes, reschedules and resets jobs via
       the WorkloadManagement/JobManager service. Clients (JobManager RPC,
       SandboxStore) are created lazily unless injected in the constructor.
    """

    def __init__(self, jobManagerClient=None, sbRPCClient=None,
                 sbTransferClient=None, useCertificates=False, timeout=600):
        """ WMS Client constructor

            Here we also initialize the needed clients and connections

            :param jobManagerClient: optional pre-built JobManager client
            :param sbRPCClient: optional RPC client for the SandboxStore
            :param sbTransferClient: optional transfer client for the SandboxStore
            :param useCertificates: authenticate with host certificates
            :param timeout: RPC timeout in seconds
        """
        self.useCertificates = useCertificates
        self.timeout = timeout
        self.jobManager = jobManagerClient
        self.sandboxClient = None
        # Only build the sandbox client now if both transports were supplied;
        # otherwise it is created on demand in __uploadInputSandbox.
        if sbRPCClient and sbTransferClient:
            self.sandboxClient = SandboxStoreClient(
                rpcClient=sbRPCClient,
                transferClient=sbTransferClient,
                useCertificates=useCertificates)

    ###############################################################################

    def __getInputSandboxEntries(self, classAdJob):
        """Extract the InputSandbox JDL attribute as a flat list of strings."""
        if classAdJob.lookupAttribute("InputSandbox"):
            inputSandbox = classAdJob.get_expression("InputSandbox")
            inputSandbox = inputSandbox.replace('","', "\n")
            inputSandbox = inputSandbox.replace('{', "")
            inputSandbox = inputSandbox.replace('}', "")
            inputSandbox = inputSandbox.replace('"', "")
            inputSandbox = inputSandbox.replace(',', "")
            inputSandbox = inputSandbox.split()
        else:
            inputSandbox = []
        return inputSandbox

    def __uploadInputSandbox(self, classAdJob, jobDescriptionObject=None):
        """Checks the validity of the job Input Sandbox.
           The function returns the list of Input Sandbox files.
           The total volume of the input sandbox is evaluated

           :param jobDescriptionObject: optional StringIO with the job
               description, uploaded alongside the disk files
        """
        inputSandbox = self.__getInputSandboxEntries(classAdJob)
        realFiles = []
        badFiles = []
        diskFiles = []
        # Keep only local-disk entries: skip LFN/SB references and the
        # parametric-job placeholders ('%s', '%(').
        for isFile in inputSandbox:
            if not isFile.startswith(('lfn:', 'LFN:', 'SB:', '%s', '%(')):
                realFiles.append(isFile)
        stringIOFiles = []
        stringIOFilesSize = 0
        if jobDescriptionObject is not None:
            if isinstance(jobDescriptionObject, StringIO.StringIO):
                stringIOFiles = [jobDescriptionObject]
                # NOTE(review): .buf is a Python 2 StringIO internal —
                # jobDescriptionObject.len / getvalue() would be the public API.
                stringIOFilesSize = len(jobDescriptionObject.buf)
                gLogger.debug("Size of the stringIOFiles: " + str(stringIOFilesSize))
            else:
                return S_ERROR("jobDescriptionObject is not a StringIO object")
        # Check real files
        for isFile in realFiles:
            if not os.path.exists( isFile ):  # we are passing in real files, we expect them to be on disk
                badFiles.append(isFile)
                gLogger.warn("inputSandbox file/directory " + isFile
                             + " not found. Keep looking for the others")
                continue
            diskFiles.append(isFile)
        diskFilesSize = File.getGlobbedTotalSize(diskFiles)
        gLogger.debug("Size of the diskFiles: " + str(diskFilesSize))
        totalSize = diskFilesSize + stringIOFilesSize
        gLogger.verbose("Total size of the inputSandbox: " + str(totalSize))
        okFiles = stringIOFiles + diskFiles
        if badFiles:
            result = S_ERROR('Input Sandbox is not valid')
            result['BadFile'] = badFiles
            result['TotalSize'] = totalSize
            return result
        if okFiles:
            # Lazily create the sandbox client if it was not injected.
            if not self.sandboxClient:
                self.sandboxClient = SandboxStoreClient(
                    useCertificates=self.useCertificates)
            result = self.sandboxClient.uploadFilesAsSandbox(okFiles)
            if not result['OK']:
                return result
            inputSandbox.append(result['Value'])
            classAdJob.insertAttributeVectorString("InputSandbox", inputSandbox)
        return S_OK()

    def submitJob(self, jdl, jobDescriptionObject=None):
        """ Submit one job specified by its JDL to WMS

            :param jdl: path to a JDL file, or the JDL text itself
            :param jobDescriptionObject: optional StringIO shipped with the sandbox
            :return: the JobManager submitJob result (S_OK(jobID) / S_ERROR)
        """
        if os.path.exists(jdl):
            fic = open(jdl, "r")
            jdlString = fic.read()
            fic.close()
        else:
            # If file JDL does not exist, assume that the JDL is passed as a string
            jdlString = jdl
        jdlString = jdlString.strip()
        # Strip of comments in the jdl string
        newJdlList = []
        for line in jdlString.split('\n'):
            if not line.strip().startswith('#'):
                newJdlList.append(line)
        jdlString = '\n'.join(newJdlList)
        # Check the validity of the input JDL
        if jdlString.find("[") != 0:
            # ClassAd expects the expression wrapped in brackets.
            jdlString = "[%s]" % jdlString
        classAdJob = ClassAd(jdlString)
        if not classAdJob.isOK():
            return S_ERROR('Invalid job JDL')
        # Check the size and the contents of the input sandbox
        result = self.__uploadInputSandbox(classAdJob, jobDescriptionObject)
        if not result['OK']:
            return result
        # Submit the job now and get the new job ID
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        result = self.jobManager.submitJob(classAdJob.asJDL())
        if 'requireProxyUpload' in result and result['requireProxyUpload']:
            gLogger.warn("Need to upload the proxy")
        return result

    def killJob(self, jobID):
        """ Kill running job.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.killJob(jobID)

    def deleteJob(self, jobID):
        """ Delete job(s) from the WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.deleteJob(jobID)

    def rescheduleJob(self, jobID):
        """ Reschedule job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.rescheduleJob(jobID)

    def resetJob(self, jobID):
        """ Reset job(s) in WMS Job database.
            jobID can be an integer representing a single DIRAC job ID or a list of IDs
        """
        if not self.jobManager:
            self.jobManager = RPCClient('WorkloadManagement/JobManager',
                                        useCertificates=self.useCertificates,
                                        timeout=self.timeout)
        return self.jobManager.resetJob(jobID)