def ProccessResults(self,Verbose=True): if Verbose:print "Processing the output" try: jobfilesDir=os.path.join(self.taskDir,"jobfiles") resultFile=os.path.join(self.taskDir,"result.xml") if hasattr(self, 'ReportFormat'):#i.e. fatal error and the last one is already in status/statusinfo if self.ReportFormat=="Error": self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) return datetime.timedelta(seconds=3) (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True) #get the performance data parserfilename=os.path.join(akrr.curdir,"appkernelsparsers",self.app['parser']) import imp with open(parserfilename, 'rb') as fp: thisAppKerParser = imp.load_module( 'thisAppKerParser', fp, parserfilename, ('.py', 'rb', imp.PY_SOURCE) ) appKerNResVars={} appKerNResVars['resource']=self.resource appKerNResVars['resource'].update(self.resourceParam) appKerNResVars['app']=self.app appKerNResVars['app'].update(self.appParam) appKerNResVars['taskId']=self.task_id appKerNResVars['subTasksId']=self.subTasksId performance=thisAppKerParser.processAppKerOutput(appstdout=appstdoutFile,geninfo=os.path.join(batchJobDir,"gen.info"),appKerNResVars=appKerNResVars) if performance==None: self.status="ERROR: Job have not finished successfully" self.statusinfo="" self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) else: fout=open(resultFile,"w") content=fout.write(performance) fout.close() self.status="Output was processed and found that kernel either exited with error or executed successfully." self.statusinfo="Done" self.ToDoNextString="PushToDB" return datetime.timedelta(seconds=3) except: print traceback.format_exc() self.status="ERROR: Error happens during processing of output." self.statusinfo=traceback.format_exc() self.FatalErrorsCount+=1 akrr.printException(self.status) self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) return datetime.timedelta(seconds=3)
def PushToDB(self,Verbose=True): db,cur=akrr.getExportDB() try: time_finished=None if hasattr(self,'TimeJobPossiblyCompleted'): time_finished=self.TimeJobPossiblyCompleted else: time_finished=datetime.datetime.today() self.PushToDBRaw(cur,self.task_id,time_finished,Verbose) db.commit() cur.close() del db self.ToDoNextString="IamDone" return None except: print traceback.format_exc() db.rollback() db.commit() cur.close() del db if hasattr(self, 'PushToDBAttemps'): self.PushToDBAttemps+=1 else: self.PushToDBAttemps=1 if self.PushToDBAttemps <= akrr.export_db_max_repeat_attempts: akrr.printException("AKRR server was not able to push to external DB.") self.status="ERROR: Can not push to external DB, will try again" self.statusinfo=traceback.format_exc() return akrr.export_db_repeat_attempt_in else: akrr.printException("AKRR server was not able to push to external DB will only update local.") self.status="ERROR: Can not push to external DB, will try again" self.statusinfo=traceback.format_exc() self.ToDoNextString="IamDone" return None
def CreateBatchJobScriptAndSubmitIt(self):
    """Generate the bundle batch job script, copy it to the remote
    resource, submit it to the remote scheduler's queue and record the
    job id (also propagated to every subtask's directory).

    Returns a datetime.timedelta until the next scheduler check, or a
    retry/give-up interval when submission fails.
    """
    self.JobScriptName=self.appName+".job"
    print "### Creating batch job script and submitting it to remote machine"
    #as a current bypass will create a job script remotely and copy it here
    #get ssh to remote resource
    sh=None
    try:
        sh=akrr.sshResource(self.resource)
        #Create remote directories if needed.
        #NOTE: nested helper deliberately takes self/sh explicitly.
        def CheckAndCreateDir(self,sh,d):
            #mkdir if missing, then verify the directory really exists
            cmd="if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
            akrr.sshCommand(sh,cmd)
            cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
            msg=akrr.sshCommand(sh,cmd)
            if msg.find("DOESNOTEXIST")>=0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))
        #akrrdata
        CheckAndCreateDir(self,sh,self.resource['akrrdata'])
        #dir for app
        CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
        #dir for task
        CheckAndCreateDir(self,sh,self.remoteTaskDir)
        #CheckAndCreateDir(self,sh,os.path.join(self.remoteTaskDir,"batchJob_pl"))
        #cd to remoteTaskDir
        akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))
        #get walltime from DB (per resource/app/params default); failures
        #are ignored and the config value is used instead
        dbdefaults={}
        try:
            db,cur=akrr.getDB()
            cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS WHERE task_id=%s ;''',(self.task_id,))
            raw=cur.fetchall()
            (resource,app,resource_param,app_param)=raw[0]
            cur.execute("""SELECT walllimit FROM akrr_default_walllimit WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
            raw=cur.fetchall()
            if len(raw)>0:
                dbdefaults['walllimit']=raw[0][0]
            #db.commit()
            cur.close()
            del db
        except Exception as e:
            pass
        #create job-script; later dicts override earlier ones
        batchvars={}
        #print "#"*80
        for di in [self.resource,self.app,dbdefaults,self.resourceParam, self.appParam]:
            batchvars.update(di)
        #stack the subtasks: build a shell fragment that runs each
        #subtask's job script sequentially inside this bundle job
        subTaskInfo=self.GetSubTaskInfo()
        if batchvars['shuffleSubtasks']:
            random.shuffle(subTaskInfo)
        subTasksExecution=""
        for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
            remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
            SubTaskJobScriptName=self.GetJobScriptName(subtask_app)
            SubTaskJobScriptPath=os.path.join(remoteSubTaskDir,SubTaskJobScriptName)
            subTasksExecution+="cd "+remoteSubTaskDir+"\n"
            #subTasksExecution+="cp "+os.path.join(self.remoteTaskDir,"job.id ")+"./\n"
            subTasksExecution+="echo Starting "+subtask_app+"\n"
            subTasksExecution+=self.resource['shell']+" "+SubTaskJobScriptPath+" > stdout 2> stderr\n"
            subTasksExecution+="echo Done with "+subtask_app+"\n"+"\n"
        batchvars['subTasksExecution']=subTasksExecution
        #calculate NNodes and NCores from either 'nnodes' or 'ncores'
        #(Python 2: / on ints is floor division, rounded up when uneven)
        tmpNNodes=None
        tmpNCores=None
        if batchvars.has_key('nnodes'):
            tmpNNodes=batchvars['nnodes']
            tmpNCores=tmpNNodes*batchvars['ppn']
        else:
            tmpNCores=batchvars['ncores']
            if tmpNCores%batchvars['ppn']==0:
                tmpNNodes=tmpNCores/batchvars['ppn']
            else:
                tmpNNodes=(tmpNCores/batchvars['ppn'])+1
        batchvars['akrrNCores']=tmpNCores
        batchvars['akrrNNodes']=tmpNNodes
        #Set batchvars remaps used by the job-script templates
        batchvars['akrrPPN']=batchvars['ppn']
        batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
        batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
        #walllimit is in minutes; format as HH:MM:00
        batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])/60,int(batchvars['walllimit'])%60)
        batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
        batchvars['akrrAppKerName']=self.app['name']
        batchvars['akrrResourceName']=self.resource['name']
        batchvars['akrrTimeStamp']= self.timeStamp
        if batchvars['akrrNNodes']==1:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
        else:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
        if 'nodeListSetterTemplate' not in batchvars:
            batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]
        #set AppKerLauncher
        #if self.resource['name'] in batchvars['runScript']:
        #    batchvars['akrrStartAppKer']=akrr.formatRecursively(batchvars['runScript'][self.resource['name']],batchvars,keepDoubleBrakets=True)
        #else:
        #    batchvars['akrrStartAppKer']=akrr.formatRecursively(batchvars['runScript']['default'],batchvars,keepDoubleBrakets=True)
        #process templates
        batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
        #batchvars['akrrCommonTests']=akrr.formatRecursively(batchvars['akrrCommonTestsTemplate'],batchvars,keepDoubleBrakets=True)
        #batchvars['akrrStartAppKer']=batchvars['akrrStartAppKerTemplate'].format(**batchvars)
        batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)
        #do parameters adjustment (optional user-supplied hook)
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)
        #generate job script locally, then copy it to the remote task dir
        jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
        fout=open(os.path.join(self.taskDir,"jobfiles",self.JobScriptName),"w")
        fout.write(jobScript)
        fout.close()
        msg=akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))
        ##akrr.sshCommandNoReturn(sh,"cat > %s << EOF1234567\n%s\nEOF1234567\n"%(self.JobScriptName,jobScript))
        akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))
        #send to queue and extract the job id from the scheduler's reply
        from string import Template
        sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
        msg=akrr.sshCommand(sh,sendToQueue)
        matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)
        JobID=None
        if matchObj:
            try:
                JobID=int(matchObj.group(1))
            except:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
        else:
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
        akrr.sshCommand(sh,"echo %d > job.id"%(JobID))
        #cp job id to subtasks so each subtask knows the bundle's job
        for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
            remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
            akrr.sshCommand(sh,"cp job.id %s"%(remoteSubTaskDir))
        self.RemoteJobID=JobID
        self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()
        #close the ssh session before the (potentially slow) scp below
        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh=None
        print "\nRemoteJobID=",self.RemoteJobID
        print "copying files from remote machine"
        msg=akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
        #update DB time_submitted_to_queue
        db,cur=akrr.getDB()
        cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
        cur.close()
        del db
        self.status="Created batch job script and have submitted it to remote queue."
        self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
        self.ToDoNextString="CheckTheJobOnRemoteMachine"
        #check first time in 1 minute
        return datetime.timedelta(days=0, hours=0, minutes=1)
    except Exception as e:
        #make sure the ssh session does not leak on failure
        if sh!=None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.status="ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo=traceback.format_exc()
        if akrr.max_fails_to_submit_to_the_queue>=0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue+=1
                if self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue:
                    #Stop execution of the task and submit results to db
                    self.ToDoNextString="PushToDB"
                    resultFile=os.path.join(self.taskDir,"result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue=1
        else:
            self.FatalErrorsCount+=1
        akrr.printException(self.status)
        return akrr.RepeateAfterFailsToSubmitToTheQueue
def CheckTheJobOnRemoteMachine(self):
    """Query the remote scheduler for the job's queue status.

    If the job is still queued/running: either keep waiting, or — when
    the configured MaxTimeInQueue is exceeded — terminate it and mark the
    task as an error.  If the job left the queue: pull all files back and
    proceed to subtask/result processing.  Returns a datetime.timedelta
    until the next check.
    """
    sh=None
    try:
        print "### Checking the job status on remote machine"
        from string import Template
        #waitExprs[scheduler] = (status-cmd template, match function,
        #regex template, regex flags) — see module-level waitExprs
        wE=waitExprs[self.resource['batchScheduler']]
        cmd =Template(wE[0]).substitute(jobId=str(self.RemoteJobID))
        rege=Template(wE[2]).substitute(jobId=str(self.RemoteJobID))
        sh=akrr.sshResource(self.resource)
        msg=akrr.sshCommand(sh,cmd)
        #done with ssh: close before any further processing
        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh=None
        #a match means the job id still shows up in the queue listing
        matchObj= wE[1](rege,msg,wE[3])
        if matchObj:
            print "Still in queue. Either waiting or running"
            if datetime.datetime.today()-self.TimeJobSubmetedToRemoteQueue>self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue):
                #job has been queued too long: kill it and report an error
                print "ERROR:"
                print "Job exceeds the maximal time in queue (%s). And will be terminated."%(str(self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue)))
                print "Removing job from remote queue."
                self.Terminate()
                print "copying files from remote machine"
                akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
                #print msg
                print "Deleting all files from remote machine"
                self.DeleteRemoteFolder()
                self.status="ERROR: Job exceeds the maximal time in queue (%s) and was terminated."%(str(self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue)))
                self.statusinfo="\nLast Status report:\n"+msg
                self.ReportFormat="Error"
                self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
                self.UpdateSubTasks()
                #del self.RemoteJobID
                return datetime.timedelta(seconds=3)
            self.status="Still in queue. Either waiting or running"
            self.statusinfo=msg
            return active_task_default_attempt_repeat
        else:
            #job no longer listed: collect output and clean up remote side
            print "Not in queue. Either exited with error or executed successfully."
            print "copying files from remote machine"
            msg=akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
            #print msg
            print "Deleting all files from remote machine"
            self.DeleteRemoteFolder()
            self.status="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
            self.statusinfo="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
            self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
            self.UpdateSubTasks()
            #del self.RemoteJobID
            self.TimeJobPossiblyCompleted=datetime.datetime.today()
            return datetime.timedelta(seconds=3)
        #print msg
    except:
        if sh!=None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.status="ERROR Can not check the status of the job on remote resource"
        self.statusinfo=traceback.format_exc()
        self.FatalErrorsCount+=1
        akrr.printException(self.status)
        self.ToDoNextString="CheckTheJobOnRemoteMachine"
        return active_task_default_attempt_repeat
    #NOTE(review): the statements below are unreachable — every path in
    #the try and except above returns. Kept as-is; consider removing.
    self.status="CheckTheJobOnRemoteMachine"
    self.statusinfo="CheckTheJobOnRemoteMachine"
    self.ToDoNextString="CheckTheJobOnRemoteMachine"
    return datetime.timedelta(days=0, hours=0, minutes=2)
def ProccessResultsOld(self,Verbose=True):
    """Legacy result processing for appkernels without a dedicated parser.

    Inspects the batch job stdout directly: detects walltime kills from
    stderr, rejects too-short output, requires a <rep:report XML fragment
    in stdout, and copies/trims that fragment into <taskDir>/result.xml.
    Always schedules "PushToDB" next and returns a datetime.timedelta.
    """
    if Verbose:print "Processing the output"
    try:
        jobfilesDir=os.path.join(self.taskDir,"jobfiles")
        resultFile=os.path.join(self.taskDir,"result.xml")
        if hasattr(self, 'ReportFormat'):#i.e. fatal error and the last one is already in status/statusinfo
            if self.ReportFormat=="Error":
                self.ToDoNextString="PushToDB"
                self.WriteErrorXML(resultFile)
                return datetime.timedelta(seconds=3)
        (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True)
        #now check if stdoutFile is empty or not
        fin=open(stdoutFile,"r")
        remstdout=fin.read()
        fin.close()
        if len(remstdout) < 5:
            #stdout essentially empty: look in stderr for a scheduler
            #walltime-kill message before declaring a premature exit
            fin=open(stderrFile,"r")
            remstderr=fin.readlines()
            fin.close()
            for l in remstderr:
                if re.search('job killed: walltime *\d+ *exceeded limit *\d+',l):
                    self.status="ERROR: Job was killed on remote resource due to walltime exceeded limit"
                    self.statusinfo=l
                    self.ToDoNextString="PushToDB"
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            if Verbose:print "stdout is too short meaning that application kernel exit prematurely"
            self.status="ERROR: stdout is too short meaning that application kernel exit prematurely"
            self.statusinfo="stdout is too short meaning that application kernel exit prematurely"
            self.WriteErrorXML(resultFile)
            self.ToDoNextString="PushToDB"
            return datetime.timedelta(seconds=3)
        #here we need to check file: stdout must contain the XML report
        if remstdout.count("<rep:report")==0:
            self.status="ERROR: unknown error"
            self.statusinfo="stdout:\n"+remstdout
            if appstdoutFile!=None:
                fin=open(appstdoutFile,"r")
                remappstdout=fin.read()
                fin.close()
                self.statusinfo+="\nappstdout:\n"+remappstdout
            self.WriteErrorXML(resultFile)
            self.ToDoNextString="PushToDB"
            return datetime.timedelta(seconds=3)
        self.status="Output was processed and found that kernel either exited with error or executed successfully."
        self.statusinfo="Done"
        self.ToDoNextString="PushToDB"
        import shutil
        shutil.copy2(stdoutFile,resultFile)
        #need to extract xml part of file, some resources put service
        #information above and below the report
        fin=open(resultFile,"r")
        content=fin.read()
        fin.close()
        if content[0]!='<' or content[-2]!='>':
            #need to reformat: keep only the <rep:report...>...</rep:report> span
            i0=content.find("<rep:report")
            i1=content.find("</rep:report>")
            fout=open(resultFile,"w")
            content=fout.write("<?xml version='1.0'?>\n"+content[i0:i1+len("</rep:report>")]+"\n")
            fout.close()
        return datetime.timedelta(seconds=3)
    except:
        self.status="ERROR: Error happens during processing of output."
        self.statusinfo=traceback.format_exc()
        self.FatalErrorsCount+=1
        akrr.printException(self.status)
        self.ToDoNextString="PushToDB"
        self.WriteErrorXML(resultFile)
        return datetime.timedelta(seconds=3)
def ProccessResults(self,Verbose=True): if not self.app.has_key('parser'): return self.ProccessResultsOld(Verbose) if Verbose:print "Processing the output" try: jobfilesDir=os.path.join(self.taskDir,"jobfiles") resultFile=os.path.join(self.taskDir,"result.xml") print resultFile #get job.id (from remote machine) of master node if self.RemoteJobID==0: #i.e. this is a subtask of a bundle if os.path.isfile(os.path.join(jobfilesDir,"job.id")): fin=open(os.path.join(jobfilesDir,"job.id"),"r") self.RemoteJobID=int(fin.read().strip()) print "Master task's RemoteJobID is ",self.RemoteJobID fin.close() if hasattr(self, 'ReportFormat'):#i.e. fatal error and the last one is already in status/statusinfo if self.ReportFormat=="Error": self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) return datetime.timedelta(seconds=3) (batchJobDir,stdoutFile,stderrFile,appstdoutFile,taskexeclogFile)=self.GetResultFiles(raiseError=True) #get the performance data parserfilename=os.path.join(akrr.curdir,"appkernelsparsers",self.app['parser']) import imp with open(parserfilename, 'rb') as fp: thisAppKerParser = imp.load_module( 'thisAppKerParser', fp, parserfilename, ('.py', 'rb', imp.PY_SOURCE) ) appKerNResVars={} appKerNResVars['resource']=self.resource appKerNResVars['resource'].update(self.resourceParam) appKerNResVars['app']=self.app appKerNResVars['app'].update(self.appParam) performance=thisAppKerParser.processAppKerOutput(appstdout=appstdoutFile, stdout=stdoutFile, stderr=stderrFile, geninfo=os.path.join(batchJobDir,"gen.info"), appKerNResVars=appKerNResVars) if performance==None: self.status="ERROR: Job have not finished successfully" self.statusinfo="" self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) else: fout=open(resultFile,"w") content=fout.write(performance) fout.close() self.status="Output was processed and found that kernel either exited with error or executed successfully." 
self.statusinfo="Done" self.ToDoNextString="PushToDB" if hasattr(performance,'nodeList'): self.nodesList=performance.nodeList else: self.nodesList=None return datetime.timedelta(seconds=3) except: print traceback.format_exc() self.status="ERROR: Error happens during processing of output." self.statusinfo=traceback.format_exc() self.FatalErrorsCount+=1 akrr.printException(self.status) self.ToDoNextString="PushToDB" self.WriteErrorXML(resultFile) return datetime.timedelta(seconds=3)
def GenerateBatchJobScript(self):
    """Generate the batch job script for this task and write it to
    <taskDir>/jobfiles/<JobScriptName>.

    Walltime limit comes from (in priority order) the per-combination
    default stored in akrr_default_walllimit, optionally auto-tuned from
    the last 20 successful runs when 'autoWalltimeLimit' is enabled,
    otherwise the configured value.  Raises on failure after recording
    the error in self.status/self.statusinfo.
    """
    if not hasattr(self, 'JobScriptName'):
        self.JobScriptName=self.GetJobScriptName(self.appName)
    #get walltime from DB (per resource/app/params default)
    dbdefaults={}
    try:
        db,cur=akrr.getDB()
        cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS WHERE task_id=%s ;''',(self.task_id,))
        raw=cur.fetchall()
        if len(raw)>0:
            (resource,app,resource_param,app_param)=raw[0]
            cur.execute("""SELECT walllimit FROM akrr_default_walllimit WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
            raw=cur.fetchall()
            if len(raw)>0:
                dbdefaults['walllimit']=raw[0][0]
        #db.commit()
        cur.close()
        del db
    except Exception as e:
        #NOTE(review): 'raise e' after 'pass' re-raises the lookup
        #failure, so a failed walltime lookup aborts script generation
        #here — unlike the submit-path variant, which swallows it.
        #Confirm this is intended (possibly leftover debugging).
        pass
        raise e
    #create job-script
    try:
        batchvars={}
        #per-resource appkernel overrides, falling back to 'default'
        appkernelOnResource={}
        if 'appkernelOnResource' in self.app:
            if self.resourceName in self.app['appkernelOnResource']:
                appkernelOnResource=self.app['appkernelOnResource'][self.resourceName]
            elif 'default' in self.app['appkernelOnResource']:
                appkernelOnResource=self.app['appkernelOnResource']['default']
        #print "#"*80
        #later dicts override earlier ones
        for di in [self.resource,self.app,appkernelOnResource,dbdefaults,self.resourceParam, self.appParam]:
            batchvars.update(di)
        #get autowalltime limit: estimate from recent run history;
        #any failure here silently keeps the configured walllimit
        try:
            if 'autoWalltimeLimit' in batchvars and batchvars['autoWalltimeLimit']==True:
                print "\nautoWalltimeLimit is on, trying to estimate walltime limit..."
                autoWalltimeLimitOverhead=1.2
                if 'autoWalltimeLimitOverhead' in batchvars:
                    autoWalltimeLimitOverhead=batchvars['autoWalltimeLimitOverhead']+1.0
                #query last 20 executions of this appkernel on that resource with that node count
                db,cur=akrr.getDB(True)
                cur.execute('''SELECT resource,reporter,reporternickname,collected,status,walltime FROM akrr_xdmod_instanceinfo WHERE `resource`=%s AND `reporternickname` = %s ORDER BY `akrr_xdmod_instanceinfo`.`collected` DESC LIMIT 0 , 20''',(self.resource['name'],"%s.%d"%(self.app['name'],batchvars['nnodes'])))
                raw=cur.fetchall()
                i=0
                lastFiveRunsSuccessfull=True
                maxwalltime=0.0
                for r in raw:
                    #require the 5 most recent runs to all have succeeded
                    if i<5 and r['status']==0:
                        lastFiveRunsSuccessfull=False
                    if r['status']==1 and r['walltime']>maxwalltime:
                        maxwalltime=r['walltime']
                    i+=1
                if i<5:
                    #NOTE(review): format string has no % argument — the
                    #literal "%d" is printed instead of the count i.
                    print "There are only %d previous run, need at least 5 for walltime limit autoset"
                else:
                    if lastFiveRunsSuccessfull == False:
                        print "One of last 5 runs have failed. Would not use autoset."
                    else:
                        #new limit = overhead factor * max observed walltime, in minutes, rounded up
                        print "Max walltime was %.1f s, will change walltime limit from %.1f minutes to %d minutes"%(maxwalltime,batchvars['walllimit'],int(autoWalltimeLimitOverhead*maxwalltime/60.0+0.99))
                        batchvars['walllimit']=int((autoWalltimeLimitOverhead*maxwalltime/60.0+0.99))
                print
                cur.close()
                del db
        except Exception as e:
            pass
        #calculate NNodes and NCores from either 'nnodes' or 'ncores'
        #(Python 2: / on ints is floor division, rounded up when uneven)
        tmpNNodes=None
        tmpNCores=None
        if batchvars.has_key('nnodes'):
            tmpNNodes=batchvars['nnodes']
            tmpNCores=tmpNNodes*batchvars['ppn']
        else:
            tmpNCores=batchvars['ncores']
            if tmpNCores%batchvars['ppn']==0:
                tmpNNodes=tmpNCores/batchvars['ppn']
            else:
                tmpNNodes=(tmpNCores/batchvars['ppn'])+1
        batchvars['akrrNCores']=tmpNCores
        batchvars['akrrNNodes']=tmpNNodes
        #Set batchvars remaps used by the job-script templates
        batchvars['akrrPPN']=batchvars['ppn']
        batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
        batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
        #walllimit is in minutes; format as HH:MM:00
        batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(int(batchvars['walllimit'])/60,int(batchvars['walllimit'])%60)
        #batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
        batchvars['akrrAppKerName']=self.app['name']
        batchvars['akrrResourceName']=self.resource['name']
        batchvars['akrrTimeStamp']= self.timeStamp
        if batchvars['akrrNNodes']==1:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
        else:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
        if 'nodeListSetterTemplate' not in batchvars:
            batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]
        #set AppKerLauncher
        #if 'runScript' in batchvars:
        #    if self.resource['name'] in batchvars['runScript']:
        #        batchvars['akrrStartAppKer']=akrr.formatRecursively(batchvars['runScript'][self.resource['name']],batchvars,keepDoubleBrakets=True)
        #    else:
        #        batchvars['akrrStartAppKer']=akrr.formatRecursively(batchvars['runScript']['default'],batchvars,keepDoubleBrakets=True)
        #process templates
        batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
        #batchvars['akrrCommonTests']=akrr.formatRecursively(batchvars['akrrCommonTestsTemplate'],batchvars,keepDoubleBrakets=True)
        #batchvars['akrrStartAppKer']=batchvars['akrrStartAppKerTemplate'].format(**batchvars)
        batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)
        #specially for IOR request two nodes for single node benchmark, one for read and one for write
        if batchvars['requestTwoNodesForOneNodeAppKer']==True and batchvars['akrrNNodes']==1 and 'batchJobHeaderTemplate' in batchvars:
            #render the header with doubled node/core counts
            batchvars2=copy.deepcopy(batchvars)
            batchvars2['akrrNCores']=2*batchvars['akrrNCores']
            batchvars2['akrrNNodes']=2*batchvars['akrrNNodes']
            batchvars2['akrrNCoresToBorder']=2*batchvars['akrrNCoresToBorder']
            batchvars2['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
            batchvars['batchJobHeaderTemplate']=akrr.formatRecursively(batchvars2['batchJobHeaderTemplate'],batchvars2)
            pass
        #do parameters adjustment (optional user-supplied hook)
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)
        #generate job script
        jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
        jobScriptFullPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
        fout=open(jobScriptFullPath,"w")
        fout.write(jobScript)
        fout.close()
    except Exception as e:
        self.status="ERROR: Can not created batch job script"
        self.statusinfo=traceback.format_exc()
        akrr.printException(self.status)
        raise e
def CreateBatchJobScriptAndSubmitIt(self,doNotSubmitToQueue=False):
    """Generate the batch job script, copy it to the remote resource and
    submit it to the remote scheduler's queue.

    Bundle subtasks (taskParam contains 'masterTaskID') are not submitted
    directly: their script is only staged and the master task runs it.
    When doNotSubmitToQueue is True, stop after copying the script over.
    Returns a datetime.timedelta until the next scheduler check (an
    effectively-infinite delta for subtasks), or a retry/give-up interval
    on failure.
    """
    self.JobScriptName=self.GetJobScriptName(self.appName)
    print "### Creating batch job script and submitting it to remote machine"
    #as a current bypass will create a job script remotely and copy it here
    #get ssh to remote resource
    sh=None
    try:
        sh=akrr.sshResource(self.resource)
        #Create remote directories if needed.
        #NOTE: nested helper deliberately takes self/sh explicitly.
        def CheckAndCreateDir(self,sh,d):
            #mkdir if missing, then verify the directory really exists
            cmd="if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
            akrr.sshCommand(sh,cmd)
            cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
            msg=akrr.sshCommand(sh,cmd)
            if msg.find("DOESNOTEXIST")>=0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))
        #akrrdata
        CheckAndCreateDir(self,sh,self.resource['akrrdata'])
        #dir for app
        CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
        #dir for task
        CheckAndCreateDir(self,sh,self.remoteTaskDir)
        #CheckAndCreateDir(self,sh,os.path.join(self.remoteTaskDir,"batchJob_pl"))
        #cd to remoteTaskDir
        akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))
        #GenerateBatchJobScript writes <taskDir>/jobfiles/<JobScriptName>
        self.GenerateBatchJobScript()
        msg=akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))
        if doNotSubmitToQueue:
            return
        ##akrr.sshCommandNoReturn(sh,"cat > %s << EOF1234567\n%s\nEOF1234567\n"%(self.JobScriptName,jobScript))
        akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))
        #send to queue
        from string import Template
        JobID=0
        if not 'masterTaskID' in self.taskParam:
            #i.e. submit to queue only if task is independent
            sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg=akrr.sshCommand(sh,sendToQueue)
            matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)
            if matchObj:
                try:
                    JobID=int(matchObj.group(1))
                except:
                    raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
            else:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
        #record the job id remotely (0 for a subtask waiting on its master)
        akrr.sshCommand(sh,"echo %d > job.id"%(JobID))
        self.RemoteJobID=JobID
        self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()
        #close the ssh session before the (potentially slow) scp below
        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh=None
        print "\nRemoteJobID=",self.RemoteJobID
        print "copying files from remote machine"
        msg=akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
        #update DB time_submitted_to_queue
        db,cur=akrr.getDB()
        cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
        cur.close()
        del db
        if not 'masterTaskID' in self.taskParam:
            #i.e. independent task
            self.status="Created batch job script and have submitted it to remote queue."
            self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
            self.ToDoNextString="CheckTheJobOnRemoteMachine"
            #check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        else:
            #i.e. this is a subtask
            self.status="Created batch job script."
            self.statusinfo="Created batch job script. Waiting for master task to execute it."
            self.ToDoNextString="CheckTheJobOnRemoteMachine"
            #master task will update the time when it will finish task execution
            return datetime.timedelta(days=111*365)
    except Exception as e:
        #make sure the ssh session does not leak on failure
        if sh!=None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.status="ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo=traceback.format_exc()
        if akrr.max_fails_to_submit_to_the_queue>=0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue+=1
                #test runs give up after 2 fails; others after the configured max
                if (self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue or
                    (self.taskParam['test_run']==True and self.FailsToSubmitToTheQueue>=2)):
                    #Stop execution of the task and submit results to db
                    self.ToDoNextString="PushToDB"
                    resultFile=os.path.join(self.taskDir,"result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue=1
        else:
            self.FatalErrorsCount+=1
        akrr.printException(self.status)
        return akrr.repeat_after_fails_to_submit_to_the_queue