def submitBigDataJobs( self, endpoint, numBigDataJobsAllowed, runningSiteName, NameNode,
                       BigDataSoftware, BigDataSoftwareVersion, HLLName, HLLVersion,
                       PublicIP, Port, jobIds , runningEndPointName, JobName, User,
                       dataset, UsePilot, IsInteractive ):
    """
    Big Data job submission with all the parameters of SITE and Job.

    Registers the job in BigDataDB, delegates the submission itself to
    self._submitBigDataJobs and then records the identifier it returned in
    BigDataDB (setHadoopID and setIntoJobDBStatus).

    :param endpoint: key into self.runningEndPoints used to validate NameNode
    :param numBigDataJobsAllowed: remaining slots; a value <= 0 aborts the submission
    :param jobIds: job identifier passed to BigDataDB and to the submission helper
    :return: S_OK( "OK" ) on success. An error structure when no slot is left,
             when NameNode is unknown for the endpoint, when the submission helper
             fails (its result is returned unchanged) or when a bookkeeping
             update in BigDataDB fails.
    """
    self.log.info( 'Director:submitBigDataJobs:JobSubmisionProcess' )

    if numBigDataJobsAllowed <= 0:
        # NameNode is formatted into the message instead of being handed to S_ERROR
        # as a stray second positional argument.
        return S_ERROR( "Number of slots reached for %s in the NameNode %s" % ( runningSiteName, NameNode ) )
    if NameNode not in self.runningEndPoints[endpoint]['NameNode']:
        return S_ERROR( 'Unknown NameNode: %s' % NameNode )

    # NOTE(review): the result of insertBigDataJob is not inspected (it never was);
    # confirm whether it returns a result structure that should be checked.
    BigDataDB.insertBigDataJob( jobIds, JobName, Time.toString(), NameNode, runningSiteName,
                                PublicIP, "", "", "", BigDataSoftware, BigDataSoftwareVersion,
                                HLLName, HLLVersion, "Submitted" )

    self.log.info( 'Director:submitBigDataJobs:SubmitJob' )
    dictBDJobSubmitted = self._submitBigDataJobs( NameNode, Port, jobIds, PublicIP, runningEndPointName,
                                                  User, JobName, dataset, UsePilot, IsInteractive )
    if not dictBDJobSubmitted[ 'OK' ]:
        return dictBDJobSubmitted
    bdjobID = dictBDJobSubmitted['Value']

    # Both bookkeeping updates are always attempted (as before); their failures are
    # now collected and returned instead of being built and silently discarded.
    failures = []
    result = BigDataDB.setHadoopID( jobIds, bdjobID )
    if not result[ 'OK' ]:
        failures.append( "BigData ID not updated" )
    result = BigDataDB.setIntoJobDBStatus( jobIds, "Submitted", "", runningSiteName, bdjobID )
    if not result[ 'OK' ]:
        failures.append( "JobDB of BigData Soft not updated" )
    if failures:
        message = "; ".join( failures )
        self.log.error( 'Director:submitBigDataJobs:BookkeepingFailed', message )
        return S_ERROR( message )

    self.log.info( 'Director:submitBigDataJobs:JobSubmitted' )
    return S_OK( "OK" )
def __updateSandBox( self, jobid, output ):
    """
    Copy the job output with scp and upload it as the Output sandbox of the job.

    :param jobid: job the sandbox is uploaded for; a false value skips the upload
    :param output: path handed to scpCall as both source and destination, and
                   used as the only output sandbox entry
    :return: the string "OK" normally. The failed resolution result when the
             sandbox files cannot be resolved. An error structure when the
             upload fails and no 'SandboxFileName' is supplied for failover.
    """
    # The result of this query is not used here; the call is kept only because
    # the original issued it. NOTE(review): confirm it can be dropped.
    BigDataDB.getJobIDInfo( jobid )

    result = self.sshConnect.scpCall( 100, output, output, False )
    if not result['OK']:
        # Best effort: the failure is logged and the upload is still attempted.
        self.log.error( 'Error to get the data from BigData Software DFS:', result )

    outputSandbox = [ output ]
    resolvedSandbox = self.__resolveOutputSandboxFiles( outputSandbox )
    if not resolvedSandbox['OK']:
        self.log.warn( 'Output sandbox file resolution failed:' )
        self.log.warn( resolvedSandbox['Message'] )
        self.__report( 'Failed', 'Resolving Output Sandbox' )
        # A failed result carries no usable 'Value'; stop here instead of
        # failing on the lookups below.
        return resolvedSandbox

    fileList = resolvedSandbox['Value']['Files']
    missingFiles = resolvedSandbox['Value']['Missing']
    if missingFiles:
        self.jobReport.setJobParameter( 'OutputSandboxMissingFiles', ', '.join( missingFiles ), sendFlag = False )

    if fileList and jobid:
        self.outputSandboxSize = getGlobbedTotalSize( fileList )
        self.log.info( 'Attempting to upload Sandbox with limit:', self.sandboxSizeLimit )
        result = self.sandboxClient.uploadFilesAsSandboxForJob( fileList, jobid,
                                                                'Output', self.sandboxSizeLimit ) # 1024*1024*10
        if not result['OK']:
            self.log.error( 'Output sandbox upload failed with message', result['Message'] )
            if 'SandboxFileName' in result:
                outputSandboxData = result['SandboxFileName']
                self.log.info( 'Attempting to upload %s as output data' % ( outputSandboxData ) )
                # NOTE(review): outputData is not defined in this method; unless it
                # is a module-level name this line raises NameError. Left untouched
                # because the intended failover target is not visible from here.
                outputData.append( outputSandboxData )
                self.jobReport.setJobParameter( 'OutputSandbox', 'Sandbox uploaded to grid storage', sendFlag = False )
                self.jobReport.setJobParameter( 'OutputSandboxLFN',
                                                self.__getLFNfromOutputFile( outputSandboxData )[0], sendFlag = False )
            else:
                self.log.info( 'Could not get SandboxFileName to attempt upload to Grid storage' )
                return S_ERROR( 'Output sandbox upload failed and no file name supplied for failover to Grid storage' )
        else:
            # Do not overwrite in case of Error
            if not self.failedFlag:
                self.__report( 'Completed', 'Output Sandbox Uploaded' )
            self.log.info( 'Sandbox uploaded successfully' )

    return "OK"
def configureFromSection( self, mySection ):
    """
    reload from CS

    Refreshes the mail addresses from the given configuration section and
    registers in self.runningEndPoints every requested running endpoint that
    is not configured yet, using the definition returned by
    BigDataDB.getRunningEnPointDict.

    :param mySection: configuration section path used as prefix for the options
    :return: None normally; the failed result of getRunningEnPointDict when the
             endpoint definition cannot be retrieved.
    """
    self.log.debug( 'Configuring from %s' % mySection )
    self.errorMailAddress = gConfig.getValue( mySection + '/ErrorMailAddress' , self.errorMailAddress )
    self.alarmMailAddress = gConfig.getValue( mySection + '/AlarmMailAddress' , self.alarmMailAddress )
    self.mailFromAddress = gConfig.getValue( mySection + '/MailFromAddress' , self.mailFromAddress )

    # following will do something only when call from reload including SubmitPool as mySection
    requestedRunningEndPoints = gConfig.getValue( mySection + '/RunningEndPoints',
                                                  list( self.runningEndPoints ) )

    # Every top-level key read below; the last four used to be read without being checked.
    mandatoryOptions = ['NameNode', 'Port', 'SiteName', 'BigDataSoftware', 'BigDataSoftwareVersion',
                        'HighLevelLanguage', 'LimitQueueJobsEndPoint', 'URL', 'PublicIP',
                        'User', 'UsePilot', 'IsInteractive', 'Requirements']

    for runningEndPointName in requestedRunningEndPoints:
        self.log.verbose( 'Trying to configure RunningEndPoint:', runningEndPointName )
        if runningEndPointName in self.runningEndPoints:
            continue
        RunningBDEndPointDict = BigDataDB.getRunningEnPointDict( runningEndPointName )
        if not RunningBDEndPointDict['OK']:
            self.log.error( 'Error in RunningBDEndPointDict: %s' % RunningBDEndPointDict['Message'] )
            return RunningBDEndPointDict
        self.log.verbose( 'Trying to configure RunningBDEndPointDict:', RunningBDEndPointDict )
        RunningBDEndPointDict = RunningBDEndPointDict[ 'Value' ]

        missingOptions = [ option for option in mandatoryOptions if option not in RunningBDEndPointDict ]
        if missingOptions:
            for option in missingOptions:
                self.log.error( 'Missing option in "%s" EndPoint definition:%s' % ( runningEndPointName, option ) )
            # Skip the whole endpoint. The previous 'continue' sat inside the option
            # loop, so it never prevented the KeyError on the lookups below.
            continue

        # Build the entry completely before publishing it, so a conversion error
        # cannot leave a half-filled endpoint in self.runningEndPoints.
        endPointConfig = {
            'NameNode': RunningBDEndPointDict['NameNode'],
            'Port': int ( RunningBDEndPointDict['Port'] ),
            'SiteName': RunningBDEndPointDict['SiteName'],
            'BigDataSoftware': RunningBDEndPointDict['BigDataSoftware'],
            'BigDataSoftwareVersion': RunningBDEndPointDict['BigDataSoftwareVersion'],
            'LimitQueueJobsEndPoint': int( RunningBDEndPointDict['LimitQueueJobsEndPoint'] ),
            'URL': RunningBDEndPointDict['URL'],
            'User': RunningBDEndPointDict['User'],
            'PublicIP': RunningBDEndPointDict['PublicIP'],
            'UsePilot': RunningBDEndPointDict['UsePilot'],
            'IsInteractive': RunningBDEndPointDict['IsInteractive'],
            'HighLevelLanguage': RunningBDEndPointDict['HighLevelLanguage'],
            'Requirements': RunningBDEndPointDict['Requirements'],
        }
        # As before, the CPUTime requirement is normalised to int in place
        # (the Requirements dict is shared with the retrieved definition).
        endPointConfig['Requirements']['CPUTime'] = int ( endPointConfig['Requirements']['CPUTime'] )
        self.runningEndPoints[runningEndPointName] = endPointConfig
def configureFromSection(self, mySection, endPoint):
    """
    get CS for monitoring endpoints

    Retrieves the definition of ``endPoint`` through
    BigDataDB.getRunningEnPointDict and registers it in
    self.monitoringEndPoints.

    :param mySection: configuration section; only used in the debug message
    :param endPoint: name of the endpoint to configure
    :return: None normally (also when the definition is incomplete, in which
             case the endpoint is not registered); the failed result of
             getRunningEnPointDict when the definition cannot be retrieved.
    """
    self.log.debug("Configuring from %s" % mySection)

    monitoringBDEndPointDict = BigDataDB.getRunningEnPointDict(endPoint)
    if not monitoringBDEndPointDict["OK"]:
        self.log.error("Error in RunninggBDEndPointDict: %s" % monitoringBDEndPointDict["Message"])
        return monitoringBDEndPointDict
    self.log.verbose("Trying to configure RunningBDEndPointDict:", monitoringBDEndPointDict)
    monitoringBDEndPointDict = monitoringBDEndPointDict["Value"]

    # Every key read below; "User" and "IsInteractive" used to be read unchecked.
    mandatoryOptions = [
        "NameNode",
        "Port",
        "SiteName",
        "BigDataSoftware",
        "BigDataSoftwareVersion",
        "HighLevelLanguage",
        "LimitQueueJobsEndPoint",
        "URL",
        "PublicIP",
        "User",
        "IsInteractive",
    ]
    missingOptions = [option for option in mandatoryOptions if option not in monitoringBDEndPointDict]
    if missingOptions:
        for option in missingOptions:
            self.log.error('Missing option in "%s" EndPoint definition:' % endPoint, option)
        # Give up on this endpoint. The previous 'continue' only advanced the
        # option loop, so the lookups below raised KeyError anyway.
        return

    # Assemble the entry first so a failing int() conversion does not leave a
    # partially filled endpoint behind.
    self.monitoringEndPoints[endPoint] = {
        "NameNode": monitoringBDEndPointDict["NameNode"],
        "Port": int(monitoringBDEndPointDict["Port"]),
        "SiteName": monitoringBDEndPointDict["SiteName"],
        "BigDataSoftware": monitoringBDEndPointDict["BigDataSoftware"],
        "BigDataSoftwareVersion": monitoringBDEndPointDict["BigDataSoftwareVersion"],
        "LimitQueueJobsEndPoint": int(monitoringBDEndPointDict["LimitQueueJobsEndPoint"]),
        "URL": monitoringBDEndPointDict["URL"],
        "User": monitoringBDEndPointDict["User"],
        "PublicIP": monitoringBDEndPointDict["PublicIP"],
        "IsInteractive": monitoringBDEndPointDict["IsInteractive"],
        "HighLevelLanguage": monitoringBDEndPointDict["HighLevelLanguage"],
    }
def monitoring( self, loop, parentthread, output ):
    """
    Poll the self.getinfo script while ``parentthread`` is alive and update
    BigDataDB, the output sandbox and the accounting of the job.

    :param loop: seconds slept at the start of every polling iteration
    :param parentthread: thread whose isAlive() controls the polling loop
    :param output: path whose last component starts with "<jobid>_"; it is also
                   forwarded to __updateSandBox

    NOTE(review): the indentation of this method was reconstructed from a
    whitespace-collapsed source. In particular the placement of the accounting
    section (inside the "all jobs finished" branch) is an assumption to confirm.
    """
    # Reference timings used later to compute the consumed CPU time.
    self.initialTiming = os.times()
    accountingReport = AccountingJob()
    accountingReport.setStartTime()
    # True until 'step1' has reported a non-zero number of jobs.
    numberJobsFlag = True
    numberJobs = 0
    # The two dictionaries below are never read in this method.
    numberStartedJobsDict = {}
    numberEndingJobsDict = {}
    job_pattern = re.compile( 'Job =.*?,' )
    job_pattern_2 = re.compile( 'Job =.*?\n' )
    # Job id = integer prefix (up to the first "_") of the last path component of output.
    jobid = int( re.split( "_", re.split( "/", output )[int( len( re.split( "/", output ) ) - 1 )] )[0] )
    cmd = '/bin/chmod 555 ' + self.getinfo
    returned = self.commandLaunch( cmd )
    while parentthread.isAlive():
        time.sleep( loop )
        if numberJobsFlag:
            # Step 1: obtain the number of jobs (text after "=" in the command output).
            cmd = self.getinfo + ' -c step1'
            returned = self.commandLaunch( cmd )
            self.log.info( 'InteractiveJobMonitorThread:step1:numJobs:', returned )
            if returned != None:
                if ( returned['Value'][1] != "" ):
                    if re.split( "=", returned['Value'][1] )[1].strip().isdigit():
                        numberJobs = int( re.split( "=", returned['Value'][1] )[1] )
                    if ( numberJobs != 0 ):
                        numberJobsFlag = False
                        BigDataDB.setJobStatus( jobid, "Running" )
        else:
            # Step 2: list the started jobs (only logged).
            cmd = self.getinfo + ' -c step2'
            returned = self.commandLaunch( cmd )
            self.log.info( 'InteractiveJobMonitorThread:step2:startedJobs:', returned )
            # NOTE(review): step1 compares against None while step2/step3 compare
            # against "" — confirm what commandLaunch returns on failure.
            if returned != "":
                if ( returned['Value'][1] != "" ):
                    startedJobs = job_pattern.findall( returned['Value'][1] )
                    self.log.info( 'step2:startedJobs:', startedJobs )
            # Step 3: list the finished jobs and compare with the expected number.
            cmd = self.getinfo + ' -c step3'
            returned = self.commandLaunch( cmd )
            self.log.info( 'InteractiveJobMonitorThread:step3:endedJobs:', returned )
            if returned != "":
                if ( returned['Value'][1] != "" ):
                    finishedJobs = job_pattern_2.findall( returned['Value'][1] )
                    self.log.info( 'step3:finishedJobs:', finishedJobs )
                    if ( len( finishedJobs ) == numberJobs ):
                        BigDataDB.setJobStatus( jobid, "Done" )
                        BigDataDB.setHadoopID( jobid, finishedJobs )
                        self.__updateSandBox( jobid, output )
                        #Update Accounting
                        # NOTE(review): nothing leaves the loop after this branch, so it
                        # runs again on every iteration while the parent thread is alive.
                        EXECUTION_RESULT = {}
                        EXECUTION_RESULT['CPU'] = []
                        finalStat = os.times()
                        # Element-wise difference between the final and the initial os.times().
                        for i in range( len( finalStat ) ):
                            EXECUTION_RESULT['CPU'].append( finalStat[i] - self.initialTiming[i] )
                        utime, stime, cutime, cstime, elapsed = EXECUTION_RESULT['CPU']
                        cpuTime = utime + stime + cutime + cstime
                        # execTime is computed but not used; 'ExecTime' below is filled with cpuTime.
                        execTime = elapsed
                        # NOTE(review): result['OK'] is not checked before reading 'Value'.
                        result = jobDB.getJobAttributes( jobid )
                        getting = result['Value']
                        acData = {
                                   'User' : getting['Owner'],
                                   'UserGroup' : getting['OwnerGroup'],
                                   'JobGroup' : 'cesga',
                                   'JobType' : 'User',
                                   'JobClass' : 'unknown',
                                   'ProcessingType' : 'unknown',
                                   'FinalMajorStatus' : getting['Status'],
                                   'FinalMinorStatus' : getting['MinorStatus'],
                                   'CPUTime' : cpuTime,
                                   'Site' : getting['Site'],
                                   # Based on the factor to convert raw CPU to Normalized units (based on the CPU Model)
                                   'NormCPUTime' : 0,
                                   'ExecTime' : cpuTime,
                                   'InputDataSize' : 0,
                                   'OutputDataSize' : 0,
                                   'InputDataFiles' : 0,
                                   'OutputDataFiles' : 0,
                                   'DiskSpace' : 0,
                                   'InputSandBoxSize' : 0,
                                   'OutputSandBoxSize' : 0,
                                   'ProcessedEvents' : 0
                                 }
                        accountingReport.setEndTime()
                        accountingReport.setValuesFromDict( acData )
                        # The commit result is stored but not inspected.
                        result = accountingReport.commit()
def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit Jobs

    For every running endpoint of every director the free slots are computed
    from the 'Submitted' and 'Running' jobs in BigDataDB, one job is taken
    from each matching task queue and matched against the endpoint, and the
    matched jobs are queued on the director thread pool.

    :return: DIRAC.S_OK() normally; the failed result of the TaskQueueDB / JobDB
             call when one of them fails.

    NOTE(review): the indentation of this method was reconstructed from a
    whitespace-collapsed source. The nesting level of the final submission
    loop (placed after the director loop here) is an assumption to confirm.
    """
    self.__checkSubmitPools()
    # director name -> endpoint name -> submission parameters
    bigDataJobsToSubmit = {}
    # Never used in this method.
    bigDataJobIdsToSubmit = {}
    for directorName, directorDict in self.directors.items():
        self.log.verbose( 'Checking Director:', directorName )
        self.log.verbose( 'RunningEndPoints:', directorDict['director'].runningEndPoints )
        for runningEndPointName in directorDict['director'].runningEndPoints:
            runningEndPointDict = directorDict['director'].runningEndPoints[runningEndPointName]
            NameNode = runningEndPointDict['NameNode']
            # Jobs already occupying the endpoint: 'Submitted' plus 'Running'.
            jobsByEndPoint = 0
            result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Submitted', NameNode )
            if result['OK']:
                jobsByEndPoint += len( result['Value'] )
            result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Running', NameNode )
            if result['OK']:
                jobsByEndPoint += len( result['Value'] )
            self.log.verbose( 'Checking Jobs By EndPoint %s:' % jobsByEndPoint )
            jobLimitsEndPoint = runningEndPointDict['LimitQueueJobsEndPoint']
            bigDataJobs = 0
            if jobsByEndPoint >= jobLimitsEndPoint:
                self.log.info( '%s >= %s Running jobs reach job limits: %s, skipping' % ( jobsByEndPoint, jobLimitsEndPoint, runningEndPointName ) )
                continue
            else:
                # Remaining slots of the endpoint.
                bigDataJobs = jobLimitsEndPoint - jobsByEndPoint
            requirementsDict = runningEndPointDict['Requirements']
            self.log.info( 'Requirements Dict: ', requirementsDict )
            result = taskQueueDB.getMatchingTaskQueues( requirementsDict )
            if not result['OK']:
                self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
                return result
            taskQueueDict = result['Value']
            self.log.info( 'Task Queues Dict: ', taskQueueDict )
            jobs = 0
            priority = 0
            cpu = 0
            # jobsID is never used in this method.
            jobsID = 0
            self.log.info( 'Pending Jobs from TaskQueue, which not matching before: ', self.pendingTaskQueueJobs )
            for tq in taskQueueDict:
                # Totals accumulate over the task queues seen so far for this endpoint.
                jobs += taskQueueDict[tq]['Jobs']
                priority += taskQueueDict[tq]['Priority']
                cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']
                # Matching of jobs with the BigData software. The sequence is:
                #   - retrieve a job from taskQueueDict
                #   - get the job name and try to match it with the resources
                #   - if it does not match, keep it in pendingTaskQueueJobs for the
                #     next iteration
                #
                # The matching uses the following JobName pattern:
                #   NameSoftware _ SoftwareVersion _ HighLanguageName _ HighLanguageVersion _ DataSetName
                # Extract a job from the TaskQueue.
                if tq not in self.pendingTaskQueueJobs.keys():
                    self.pendingTaskQueueJobs[tq] = {}
                getJobFromTaskQueue = taskQueueDB.matchAndGetJob( taskQueueDict[tq] )
                if not getJobFromTaskQueue['OK']:
                    self.log.error( 'Could not get Job and FromTaskQueue', getJobFromTaskQueue['Message'] )
                    return getJobFromTaskQueue
                jobInfo = getJobFromTaskQueue['Value']
                jobID = jobInfo['jobId']
                jobAttrInfo = jobDB.getJobAttributes( jobID )
                if not jobAttrInfo['OK']:
                    self.log.error( 'Could not get Job Attributes', jobAttrInfo['Message'] )
                    return jobAttrInfo
                jobInfoUniq = jobAttrInfo['Value']
                jobName = jobInfoUniq['JobName']
                # Registered as pending first; removed again below if the match succeeds.
                self.pendingTaskQueueJobs[tq][jobID] = jobName
                # NOTE(review): result['OK'] is not checked before reading 'Value'.
                result = jobDB.getJobJDL( jobID, True )
                classAdJob = ClassAd( result['Value'] )
                # arguments stays 0 when the JDL has no 'Arguments' attribute.
                arguments = 0
                if classAdJob.lookupAttribute( 'Arguments' ):
                    arguments = classAdJob.getAttributeString( 'Arguments' )
                #if not classAdJob.lookupAttribute( 'Arguments' ):
                #  continue
                jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                                 runningEndPointName,
                                                                 runningEndPointDict['BigDataSoftware'],
                                                                 runningEndPointDict['BigDataSoftwareVersion'],
                                                                 runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                 runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                 jobID )
                if ( jobsToSubmit == "OK" ):
                    if directorName not in bigDataJobsToSubmit:
                        bigDataJobsToSubmit[directorName] = {}
                    if runningEndPointName not in bigDataJobsToSubmit[directorName]:
                        bigDataJobsToSubmit[directorName][runningEndPointName] = {}
                    # One entry per (director, endpoint): a later match replaces an earlier one.
                    bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobID,
                                                                               'JobName': jobName,
                                                                               'TQPriority': priority,
                                                                               'CPUTime': cpu,
                                                                               'BigDataEndpoint': runningEndPointName,
                                                                               'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                                               'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                                               'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                                               'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                               'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                               'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                                               'SiteName': runningEndPointDict['SiteName'],
                                                                               'PublicIP': runningEndPointDict['PublicIP'],
                                                                               'User': runningEndPointDict['User'],
                                                                               'Port': runningEndPointDict['Port'],
                                                                               'UsePilot': runningEndPointDict['UsePilot'],
                                                                               'IsInteractive': runningEndPointDict['IsInteractive'],
                                                                               'Arguments': arguments }
                    del self.pendingTaskQueueJobs[tq][jobID]
                else:
                    self.log.error( jobsToSubmit )
            self.log.info( 'Pending Jobs from TaskQueue, which not matching after: ', self.pendingTaskQueueJobs )
            # Second pass: retry the jobs still pending against the current endpoint.
            # NOTE(review): entries are deleted while iterating over .keys(); that is
            # only safe where keys() returns a list copy (Python 2).
            for tq in self.pendingTaskQueueJobs.keys():
                for jobid in self.pendingTaskQueueJobs[tq].keys():
                    result = jobDB.getJobJDL( jobid, True )
                    classAdJob = ClassAd( result['Value'] )
                    arguments = 0
                    if classAdJob.lookupAttribute( 'Arguments' ):
                        arguments = classAdJob.getAttributeString( 'Arguments' )
                    #if not classAdJob.lookupAttribute( 'Arguments' ):
                    #  continue
                    #do the match with the runningEndPoint
                    jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                                     runningEndPointName,
                                                                     runningEndPointDict['BigDataSoftware'],
                                                                     runningEndPointDict['BigDataSoftwareVersion'],
                                                                     runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                     runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                     jobid )
                    if ( jobsToSubmit == "OK" ):
                        if directorName not in bigDataJobsToSubmit:
                            bigDataJobsToSubmit[directorName] = {}
                        if runningEndPointName not in bigDataJobsToSubmit[directorName]:
                            bigDataJobsToSubmit[directorName][runningEndPointName] = {}
                        bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobid,
                                                                                   'JobName': self.pendingTaskQueueJobs[tq][jobid],
                                                                                   'TQPriority': priority,
                                                                                   'CPUTime': cpu,
                                                                                   'BigDataEndpoint': runningEndPointName,
                                                                                   'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                                                   'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                                                   'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                                                   'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                                   'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                                   'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                                                   'SiteName': runningEndPointDict['SiteName'],
                                                                                   'PublicIP': runningEndPointDict['PublicIP'],
                                                                                   'User': runningEndPointDict['User'],
                                                                                   'Port': runningEndPointDict['Port'],
                                                                                   'UsePilot': runningEndPointDict['UsePilot'],
                                                                                   'IsInteractive': runningEndPointDict['IsInteractive'],
                                                                                   'Arguments': arguments }
                        del self.pendingTaskQueueJobs[tq][jobid]
                    else:
                        self.log.error( jobsToSubmit )
            if not jobs and not self.pendingTaskQueueJobs:
                self.log.info( 'No matching jobs for %s found, skipping' % NameNode )
                continue
    self.log.info( '___BigDataJobsTo Submit:', bigDataJobsToSubmit )
    # Queue one submission per (director, endpoint) entry on the director thread pool.
    for directorName, JobsToSubmitDict in bigDataJobsToSubmit.items():
        for runningEndPointName, jobsToSubmitDict in JobsToSubmitDict.items():
            if self.directors[directorName]['isEnabled']:
                self.log.info( 'Requesting submission to %s of %s' % ( runningEndPointName, directorName ) )
                director = self.directors[directorName]['director']
                pool = self.pools[self.directors[directorName]['pool']]
                jobIDs = JobsToSubmitDict[runningEndPointName]['JobId']
                jobName = JobsToSubmitDict[runningEndPointName]['JobName']
                endpoint = JobsToSubmitDict[runningEndPointName]['BigDataEndpoint']
                runningSiteName = JobsToSubmitDict[runningEndPointName]['SiteName']
                NameNode = JobsToSubmitDict[runningEndPointName]['BigDataEndpointNameNode']
                BigDataSoftware = JobsToSubmitDict[runningEndPointName]['BdSoftware']
                BigDataSoftwareVersion = JobsToSubmitDict[runningEndPointName]['BdSoftwareVersion']
                HLLName = JobsToSubmitDict[runningEndPointName]['HLLName']
                HLLVersion = JobsToSubmitDict[runningEndPointName]['HLLVersion']
                PublicIP = JobsToSubmitDict[runningEndPointName]['PublicIP']
                User = JobsToSubmitDict[runningEndPointName]['User']
                Port = JobsToSubmitDict[runningEndPointName]['Port']
                UsePilot = JobsToSubmitDict[runningEndPointName]['UsePilot']
                IsInteractive = JobsToSubmitDict[runningEndPointName]['IsInteractive']
                # Arguments is read here but not forwarded to the submission call below.
                Arguments = JobsToSubmitDict[runningEndPointName]['Arguments']
                numBigDataJobsAllowed = JobsToSubmitDict[runningEndPointName]['NumBigDataJobsAllowedToSubmit']
                ret = pool.generateJobAndQueueIt( director.submitBigDataJobs,
                                                  args = ( endpoint, numBigDataJobsAllowed, runningSiteName, NameNode,
                                                           BigDataSoftware, BigDataSoftwareVersion, HLLName, HLLVersion,
                                                           PublicIP, Port, jobIDs, runningEndPointName, jobName, User,
                                                           self.jobDataset, UsePilot, IsInteractive ),
                                                  oCallback = self.callBack,
                                                  oExceptionCallback = director.exceptionCallBack,
                                                  blocking = False )
                if not ret['OK']:
                    # Disable submission until next iteration
                    self.directors[directorName]['isEnabled'] = False
                else:
                    # Pause for the 'ThreadStartDelay' agent option before queueing the next one.
                    time.sleep( self.am_getOption( 'ThreadStartDelay' ) )
    if 'Default' in self.pools:
        # only for those in "Default' thread Pool
        # for pool in self.pools:
        self.pools['Default'].processAllResults()
    return DIRAC.S_OK()
def execute(self):
    """Main Agent code:
    1.- Query BigDataDB for existing Running, Queued or Submitted jobs
    2.- Ask the BigData software about their status
    3.- Update the status in the DB if it has changed

    :return: DIRAC.S_OK() in all cases visible here.

    NOTE(review): the indentation of this method was reconstructed from a
    whitespace-collapsed source. The nesting inside the "Succeded" branches
    (accounting and data clean-up) is an assumption to confirm.
    """
    self.pendingJobs["Submitted"] = BigDataDB.getBigDataJobsByStatus("Submitted")
    self.pendingJobs["Running"] = BigDataDB.getBigDataJobsByStatus("Running")
    self.pendingJobs["Unknown"] = BigDataDB.getBigDataJobsByStatus("Unknown")
    self.__getMonitoringPools()
    self.log.verbose("monitoring pools", self.monitoringEndPoints)
    for status in self.pendingJobs:
        self.log.verbose("Analizing %s jobs" % status)
        JobStatus = 0
        if self.pendingJobs[status]["OK"]:
            for jobId in self.pendingJobs[status]["Value"]:
                self.log.verbose("Analizing job %s" % jobId)
                # Indexed below as [0][0] (BigData software job id) and [0][1]
                # (compared with the endpoint NameNode).
                getSoftIdAndSiteName = BigDataDB.getSoftwareJobIDByJobID(jobId[0])
                self.log.verbose("Site and SoftID:", getSoftIdAndSiteName)
                for runningEndPoint in self.monitoringEndPoints:
                    # Only endpoints whose NameNode matches the job, and only jobs
                    # that already have a software job id.
                    if (self.monitoringEndPoints[runningEndPoint]["NameNode"] == getSoftIdAndSiteName[0][1]) and (
                        getSoftIdAndSiteName[0][0] != ""
                    ):
                        # Depending on the BigData Software the Query should be different
                        if self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"] == "hadoop":
                            if self.monitoringEndPoints[runningEndPoint]["BigDataSoftwareVersion"] == "hdv1":
                                if (
                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"]["HLLName"]
                                    == "none"
                                ):
                                    self.log.info(
                                        "Hadoop V.1 Monitoring submmission command with Hadoop jobID: ",
                                        getSoftIdAndSiteName[0][0],
                                    )
                                    from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV1Client import (
                                        HadoopV1Client,
                                    )

                                    HadoopV1cli = HadoopV1Client(
                                        self.monitoringEndPoints[runningEndPoint]["User"],
                                        self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                        self.monitoringEndPoints[runningEndPoint]["Port"],
                                    )
                                    JobStatus = HadoopV1cli.jobStatus(
                                        getSoftIdAndSiteName[0][0],
                                        self.monitoringEndPoints[runningEndPoint]["User"],
                                        self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                    )
                                    # Interactive endpoints: when the current job succeeded, ask
                                    # the client for a new job before treating it as done.
                                    if (JobStatus["OK"] == True) and (
                                        self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1"
                                    ):
                                        if JobStatus["Value"][1].strip() == "Succeded":
                                            result = HadoopV1cli.newJob(
                                                self.__tmpSandBoxDir, jobId[0], getSoftIdAndSiteName[0][0]
                                            )
                                            if result["OK"] == True:
                                                result = BigDataDB.updateHadoopIDAndJobStatus(
                                                    jobId[0], result["Value"]
                                                )
                                                BigDataDB.setJobStatus(jobId[0], "Running")
                                                # Clearing OK makes the status handling below skip this job.
                                                JobStatus["OK"] = False
                                            else:
                                                self.log.info("New result from new Job", result)
                                    if JobStatus["OK"] == True:
                                        if JobStatus["Value"][1].strip() == "Succeded":
                                            BigDataDB.setJobStatus(jobId[0], "Done")
                                            if self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1":
                                                self.__updateInteractiveSandBox(
                                                    jobId[0],
                                                    self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                    self.monitoringEndPoints[runningEndPoint][
                                                        "BigDataSoftwareVersion"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLName"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLVersion"
                                                    ],
                                                    HadoopV1cli,
                                                )
                                            else:
                                                self.__updateSandBox(
                                                    jobId[0],
                                                    self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                    self.monitoringEndPoints[runningEndPoint][
                                                        "BigDataSoftwareVersion"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLName"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLVersion"
                                                    ],
                                                    HadoopV1cli,
                                                )
                                            # Send the accounting built from the complete job status.
                                            getStatus = HadoopV1cli.jobCompleteStatus(getSoftIdAndSiteName[0][0])
                                            if getStatus["OK"]:
                                                result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                                                if result["OK"]:
                                                    self.sendJobAccounting(result["Value"], jobId[0])
                                            if self.cleanDataAfterFinish:
                                                self.__deleteData(jobId[0], HadoopV1cli)
                                        # A job the software reports as "Unknown" is put back to "Submitted".
                                        if JobStatus["Value"][1].strip() == "Unknown":
                                            BigDataDB.setJobStatus(jobId[0], "Submitted")
                                        if JobStatus["Value"][1].strip() == "Running":
                                            BigDataDB.setJobStatus(jobId[0], "Running")
                        if self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"] == "hadoop":
                            if self.monitoringEndPoints[runningEndPoint]["BigDataSoftwareVersion"] == "hdv2":
                                if (
                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"]["HLLName"]
                                    == "none"
                                ):
                                    self.log.info(
                                        "Hadoop V.2 Monitoring submmission command with Hadoop jobID: ",
                                        getSoftIdAndSiteName[0][0],
                                    )
                                    from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV2Client import (
                                        HadoopV2Client,
                                    )

                                    HadoopV2cli = HadoopV2Client(
                                        self.monitoringEndPoints[runningEndPoint]["User"],
                                        self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                    )
                                    JobStatus = HadoopV2cli.jobStatus(
                                        getSoftIdAndSiteName[0][0],
                                        self.monitoringEndPoints[runningEndPoint]["User"],
                                        self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                    )
                                    # Unlike the V1 branch, JobStatus["Value"] is used directly as
                                    # the status text here (no [1] index).
                                    if (JobStatus["OK"] == True) and (
                                        self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1"
                                    ):
                                        if JobStatus["Value"].strip() == "Succeded":
                                            result = HadoopV2cli.newJob(
                                                self.__tmpSandBoxDir, jobId[0], getSoftIdAndSiteName[0][0]
                                            )
                                            if result["OK"] == True:
                                                result = BigDataDB.updateHadoopIDAndJobStatus(
                                                    jobId[0], result["Value"]
                                                )
                                                BigDataDB.setJobStatus(jobId[0], "Running")
                                                # Clearing OK makes the status handling below skip this job.
                                                JobStatus["OK"] = False
                                            else:
                                                self.log.info("New result from new Job", result)
                                    if JobStatus["OK"] == True:
                                        # NOTE(review): compared without .strip() here, unlike the
                                        # interactive check above — confirm this is intended.
                                        if JobStatus["Value"] == "Succeded":
                                            BigDataDB.setJobStatus(jobId[0], "Done")
                                            if self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1":
                                                self.__updateInteractiveSandBox(
                                                    jobId[0],
                                                    self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                    self.monitoringEndPoints[runningEndPoint][
                                                        "BigDataSoftwareVersion"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLName"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLVersion"
                                                    ],
                                                    HadoopV2cli,
                                                )
                                            else:
                                                self.__updateSandBox(
                                                    jobId[0],
                                                    self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                    self.monitoringEndPoints[runningEndPoint][
                                                        "BigDataSoftwareVersion"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLName"
                                                    ],
                                                    self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                        "HLLVersion"
                                                    ],
                                                    HadoopV2cli,
                                                )
                                            getStatus = HadoopV2cli.jobCompleteStatus(getSoftIdAndSiteName[0][0])
                                            if getStatus["OK"]:
                                                result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                                                if result["OK"]:
                                                    self.sendJobAccounting(result["Value"], jobId[0])
                                            # if self.cleanDataAfterFinish:
                                            #   self.__deleteData( jobId[0], HadoopV2cli )
                                        if JobStatus["Value"] == "Unknown":
                                            BigDataDB.setJobStatus(jobId[0], "Submitted")
                                        if JobStatus["Value"] == "Running":
                                            BigDataDB.setJobStatus(jobId[0], "Running")
    return DIRAC.S_OK()
def __updateSandBox(self, jobid, software, version, hll, hllversion, cli):
    """
    Retrieve the job output through ``cli`` and upload it as the Output sandbox.

    The output is first fetched with cli.getData (source -> dest) and then
    with cli.getdata (dest -> dest); the files found under ``dest`` are
    resolved and uploaded with the sandbox client.

    :param jobid: job the sandbox belongs to; a false value skips the upload
    :param software: BigData software name; only "hadoop" is supported
    :param version: software version; "hdv1" and "hdv2" are supported
    :param hll: high level language name; only "none" is supported
    :param hllversion: not used by this method
    :param cli: client object providing getData and getdata
    :return: the string "OK" normally. An error structure for an unsupported
             software/version/hll combination or when the upload fails with
             no 'SandboxFileName' for failover. The failed resolution result
             when the sandbox files cannot be resolved.
    """
    jobInfo = BigDataDB.getJobIDInfo(jobid)
    jobName = self.__getJobName(jobInfo[0][0]).replace(" ", "")
    jobDir = self.__tmpSandBoxDir + str(jobid)
    source = jobDir + "/InputSandbox" + str(jobid) + "/" + jobName + "_" + str(jobid)
    dest = jobDir + "/" + jobName + "_" + str(jobid)

    # hdv1 and hdv2 issue the very same call, so they share one branch. Any other
    # combination used to leave ``result = 0`` and fail on result["OK"] with a
    # TypeError; it is now reported explicitly.
    if software == "hadoop" and version in ("hdv1", "hdv2") and hll == "none":
        result = cli.getData(source, dest)
    else:
        message = "Unsupported BigData software combination: %s %s %s" % (software, version, hll)
        self.log.error(message)
        return S_ERROR(message)
    if not result["OK"]:
        # Best effort: the failure is logged and the retrieval goes on.
        self.log.error("Error to get the data from BigData Software DFS:", result)

    result = cli.getdata(dest, dest)
    if not result["OK"]:
        self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

    outputSandbox = self.get_filepaths(dest)
    resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)
    if not resolvedSandbox["OK"]:
        self.log.warn("Output sandbox file resolution failed:")
        self.log.warn(resolvedSandbox["Message"])
        self.__report("Failed", "Resolving Output Sandbox")
        # A failed result carries no usable "Value"; stop here instead of
        # failing on the lookups below.
        return resolvedSandbox

    self.fileList = resolvedSandbox["Value"]["Files"]
    missingFiles = resolvedSandbox["Value"]["Missing"]
    if missingFiles:
        self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

    if self.fileList and jobid:
        self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
        self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)
        result = self.sandboxClient.uploadFilesAsSandboxForJob(
            self.fileList, jobid, "Output", self.sandboxSizeLimit
        )  # 1024*1024*10
        if not result["OK"]:
            self.log.error("Output sandbox upload failed with message", result["Message"])
            if "SandboxFileName" in result:
                outputSandboxData = result["SandboxFileName"]
                self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                # NOTE(review): outputData is not defined in this method; unless it
                # is a module-level name this line raises NameError. Left untouched
                # because the intended failover target is not visible from here.
                outputData.append(outputSandboxData)
                self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                self.jobReport.setJobParameter(
                    "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                )
            else:
                self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                return S_ERROR(
                    "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                )
        else:
            # Do not overwrite in case of Error
            if not self.failedFlag:
                self.__report("Completed", "Output Sandbox Uploaded")
            self.log.info("Sandbox uploaded successfully")

    return "OK"
def __updateInteractiveSandBox(self, jobid, software, version, hll, hllversion, cli):
    """
    Retrieve the output of an interactive job and upload it as Output sandbox.

    The InputSandbox directory of the job is deleted through
    cli.delHadoopData, the "*_out" entries of the job directory are fetched
    with cli.getdata, and the files found in the job directory are resolved
    and uploaded with the sandbox client.

    :param jobid: job the sandbox belongs to; a false value skips the upload
    :param software: not used by this method
    :param version: not used by this method
    :param hll: not used by this method
    :param hllversion: not used by this method
    :param cli: client object providing delHadoopData and getdata
    :return: the string "OK" normally. The failed resolution result when the
             sandbox files cannot be resolved. An error structure when the
             upload fails and no 'SandboxFileName' is supplied for failover.
    """
    # The result of this query is not used here; the call is kept only because
    # the original issued it. NOTE(review): confirm it can be dropped.
    BigDataDB.getJobIDInfo(jobid)

    jobDir = self.__tmpSandBoxDir + str(jobid)
    source = jobDir + "/*_out"
    dest = jobDir

    # Delete content of InputSandbox
    result = cli.delHadoopData(jobDir + "/InputSandbox" + str(jobid))
    self.log.debug("ATENTION::Deleting InputSandBox Contain:", result)

    result = cli.getdata(dest, source)
    self.log.debug("Step 0:getting data from hadoop:", result)
    if not result["OK"]:
        # Best effort: the failure is logged and the upload is still attempted.
        self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

    self.log.debug("Step:1:GetFilePaths:")
    outputSandbox = self.get_filepaths(jobDir)
    self.log.debug("Step:2:OutputSandBox:", jobDir)
    self.log.debug("Step:2:OutputSandBox:", outputSandbox)
    resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)
    self.log.debug("Step:3:ResolveSandbox:", resolvedSandbox)
    if not resolvedSandbox["OK"]:
        self.log.warn("Output sandbox file resolution failed:")
        self.log.warn(resolvedSandbox["Message"])
        self.__report("Failed", "Resolving Output Sandbox")
        # A failed result carries no usable "Value"; stop here instead of
        # failing on the lookups below.
        return resolvedSandbox

    self.fileList = resolvedSandbox["Value"]["Files"]
    missingFiles = resolvedSandbox["Value"]["Missing"]
    if missingFiles:
        self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

    if self.fileList and jobid:
        self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
        self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)
        result = self.sandboxClient.uploadFilesAsSandboxForJob(
            self.fileList, jobid, "Output", self.sandboxSizeLimit
        )  # 1024*1024*10
        if not result["OK"]:
            self.log.error("Output sandbox upload failed with message", result["Message"])
            if "SandboxFileName" in result:
                outputSandboxData = result["SandboxFileName"]
                self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                # NOTE(review): outputData is not defined in this method; unless it
                # is a module-level name this line raises NameError. Left untouched
                # because the intended failover target is not visible from here.
                outputData.append(outputSandboxData)
                self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                self.jobReport.setJobParameter(
                    "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                )
            else:
                self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                return S_ERROR(
                    "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                )
        else:
            # Do not overwrite in case of Error
            if not self.failedFlag:
                self.__report("Completed", "Output Sandbox Uploaded")
            self.log.info("Sandbox uploaded successfully")

    return "OK"