def CheckTheJobOnRemoteMachine(self):
    """Check whether this task's batch job is still known to the remote scheduler.

    The scheduler-specific entry of ``waitExprs`` supplies, by index:
    0 - status command template, 1 - matcher callable, 2 - pattern template,
    3 - extra argument for the matcher (presumably regex flags - confirm).
    Both templates are filled with ``self.RemoteJobID``.

    Outcomes:

    * Job still listed, within ``MaxTimeInQueue``: the scheduler output is
      stored in ``self.statusinfo`` and ``active_task_default_attempt_repeat``
      is returned.
    * Job still listed, over ``MaxTimeInQueue``: the job is terminated, the
      remote task directory is copied to ``<taskDir>/jobfiles``, the remote
      folder is deleted, the task is flagged as an error and a 3 second
      ``timedelta`` is returned.
    * Job not listed: files are copied and the remote folder deleted as
      above, ``self.TimeJobPossiblyCompleted`` is set and a 3 second
      ``timedelta`` is returned.
    * Any exception: the traceback is stored in ``self.statusinfo``,
      ``self.FatalErrorsCount`` is incremented and
      ``active_task_default_attempt_repeat`` is returned so this step is
      tried again.

    Returns:
        ``datetime.timedelta`` or ``active_task_default_attempt_repeat``
        (presumably the delay before ``self.ToDoNextString`` is run; the
        caller is not visible here).
    """
    sh = None
    try:
        # Single-argument print(...) emits the same text as the Python 2
        # print statement used elsewhere in this file.
        print("### Checking the job status on remote machine")
        from string import Template

        wE = waitExprs[self.resource['batchScheduler']]
        jobId = str(self.RemoteJobID)
        cmd = Template(wE[0]).substitute(jobId=jobId)
        rege = Template(wE[2]).substitute(jobId=jobId)

        # Query the scheduler, then release the ssh session right away.
        sh = akrr.sshResource(self.resource)
        msg = akrr.sshCommand(sh, cmd)
        sh.sendline("exit")
        sh.close(force=True)
        sh = None

        def retrieveAndCleanRemoteFiles():
            """Copy the remote task directory locally, then delete it remotely."""
            print("copying files from remote machine")
            akrr.scpFromResource(
                self.resource,
                os.path.join(self.remoteTaskDir, "*"),
                os.path.join(self.taskDir, "jobfiles"),
                "-r")
            print("Deleting all files from remote machine")
            self.DeleteRemoteFolder()

        if wE[1](rege, msg, wE[3]):
            print("Still in queue. Either waiting or running")
            maxTimeInQueue = self.taskParam.get('MaxTimeInQueue', akrr.max_time_in_queue)
            timeInQueue = datetime.datetime.today() - self.TimeJobSubmetedToRemoteQueue
            if timeInQueue > maxTimeInQueue:
                print("ERROR:")
                print("Job exceeds the maximal time in queue (%s). And will be terminated." % (str(maxTimeInQueue)))
                print("Removing job from remote queue.")
                self.Terminate()
                retrieveAndCleanRemoteFiles()
                self.status = "ERROR: Job exceeds the maximal time in queue (%s) and was terminated." % (str(maxTimeInQueue))
                self.statusinfo = "\nLast Status report:\n" + msg
                self.ReportFormat = "Error"
                self.ToDoNextString = "CheckIfSubtasksDoneProccessingResults"
                self.UpdateSubTasks()
                return datetime.timedelta(seconds=3)

            self.status = "Still in queue. Either waiting or running"
            self.statusinfo = msg
            return active_task_default_attempt_repeat

        print("Not in queue. Either exited with error or executed successfully.")
        retrieveAndCleanRemoteFiles()
        self.status = "Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
        self.statusinfo = "Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
        self.ToDoNextString = "CheckIfSubtasksDoneProccessingResults"
        self.UpdateSubTasks()
        self.TimeJobPossiblyCompleted = datetime.datetime.today()
        return datetime.timedelta(seconds=3)
    except Exception:
        # Record the failure first: on Python 2 an exception handled later in
        # this handler would replace the one traceback.format_exc() reports.
        self.status = "ERROR Can not check the status of the job on remote resource"
        self.statusinfo = traceback.format_exc()
        self.FatalErrorsCount += 1
        akrr.printException(self.status)
        self.ToDoNextString = "CheckTheJobOnRemoteMachine"
        if sh is not None:
            try:
                sh.sendline("exit")
                sh.close(force=True)
            except Exception:
                # Best-effort cleanup: the session may already be dead, and a
                # failure here must not prevent the retry from being scheduled.
                pass
            sh = None
        return active_task_default_attempt_repeat
def CreateBatchJobScriptAndSubmitIt(self):
    """Build the batch job script that runs all subtasks, upload and submit it.

    Steps, all performed over one ssh session to ``self.resource``:

    1. Make sure the akrrdata, application and task directories exist
       remotely and change into the task directory.
    2. Look up a default ``walllimit`` in the database (best effort).
    3. Merge resource, app, DB-default, resource-parameter and app-parameter
       dictionaries (later ones win) into the template variables and add the
       derived ``akrr*`` variables.
    4. Render ``self.resource["batchJobTemplate"]``, write it to
       ``<taskDir>/jobfiles/<JobScriptName>`` and copy it to the remote task
       directory.
    5. Submit it with the scheduler-specific command, extract the job id,
       store it in ``job.id`` remotely and copy that file to every subtask
       directory.
    6. Copy the remote task directory back and record the submission time in
       ``ACTIVETASKS``.

    Returns:
        ``datetime.timedelta(minutes=1)`` on success. On failure, a 3 second
        ``timedelta`` once the allowed number of submission failures is
        exceeded (``ToDoNextString`` becomes ``"PushToDB"``), otherwise the
        retry delay configured in ``akrr``.

    Raises:
        Nothing is propagated for ``Exception`` subclasses; they are recorded
        in ``self.status`` / ``self.statusinfo``.
    """
    self.JobScriptName = self.appName + ".job"
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement used elsewhere in this file.
    print("### Creating batch job script and submitting it to remote machine")
    # as a current bypass will create a job script remotely and copy it here
    sh = None
    try:
        sh = akrr.sshResource(self.resource)

        def checkAndCreateDir(session, d):
            """Create remote directory ``d`` if missing; raise if it still does not exist."""
            akrr.sshCommand(session, "if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi" % (d, d))
            reply = akrr.sshCommand(session, "if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi" % (d))
            if reply.find("DOESNOTEXIST") >= 0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES, "Can not create directory %s on %s." % (d, self.resource['name']))

        checkAndCreateDir(sh, self.resource['akrrdata'])
        checkAndCreateDir(sh, os.path.join(self.resource['akrrdata'], self.appName))
        checkAndCreateDir(sh, self.remoteTaskDir)

        # Later commands (cat, submit, job.id) use paths relative to this dir.
        akrr.sshCommand(sh, "cd %s" % (self.remoteTaskDir))

        # Default walltime from DB. Best effort: a failure here must not stop
        # the submission, but it is reported instead of silently dropped.
        dbdefaults = {}
        try:
            db, cur = akrr.getDB()
            try:
                cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS WHERE task_id=%s ;''', (self.task_id,))
                raw = cur.fetchall()
                (resource, app, resource_param, app_param) = raw[0]
                cur.execute("""SELECT walllimit FROM akrr_default_walllimit WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """, (resource, app, resource_param, app_param))
                raw = cur.fetchall()
                if len(raw) > 0:
                    dbdefaults['walllimit'] = raw[0][0]
            finally:
                cur.close()
                del db
        except Exception:
            print("WARNING: Can not get default walllimit from DB:\n" + traceback.format_exc())

        # Template variables; dictionaries later in the list override earlier ones.
        batchvars = {}
        for di in [self.resource, self.app, dbdefaults, self.resourceParam, self.appParam]:
            batchvars.update(di)

        # Stack the subtasks: each one runs its own job script in its own dir.
        subTaskInfo = self.GetSubTaskInfo()
        if batchvars['shuffleSubtasks']:
            random.shuffle(subTaskInfo)

        executionParts = []
        for subtask_id, subtask_status, subtask_datetimestamp, subtask_resource, subtask_app, subtask_task_param in subTaskInfo:
            remoteSubTaskDir = self.GetRemoteTaskDir(self.resource['akrrdata'], subtask_app, subtask_datetimestamp)
            SubTaskJobScriptPath = os.path.join(remoteSubTaskDir, self.GetJobScriptName(subtask_app))
            executionParts.append("cd " + remoteSubTaskDir + "\n")
            executionParts.append("echo Starting " + subtask_app + "\n")
            executionParts.append(self.resource['shell'] + " " + SubTaskJobScriptPath + " > stdout 2> stderr\n")
            executionParts.append("echo Done with " + subtask_app + "\n" + "\n")
        batchvars['subTasksExecution'] = "".join(executionParts)

        # Node and core counts: either nnodes is given, or it is derived from
        # ncores by rounding up to whole nodes.
        if 'nnodes' in batchvars:
            tmpNNodes = batchvars['nnodes']
            tmpNCores = tmpNNodes * batchvars['ppn']
        else:
            tmpNCores = batchvars['ncores']
            tmpNNodes = tmpNCores // batchvars['ppn']
            if tmpNCores % batchvars['ppn'] != 0:
                tmpNNodes += 1

        batchvars['akrrNCores'] = tmpNCores
        batchvars['akrrNNodes'] = tmpNNodes

        # Set batchvars remaps
        batchvars['akrrPPN'] = batchvars['ppn']
        batchvars['akrrNCoresToBorder'] = batchvars['akrrPPN'] * batchvars['akrrNNodes']
        batchvars['akrrTaskWorkingDir'] = self.remoteTaskDir
        # walllimit is split into whole units of 60 and the remainder.
        batchvars['akrrWallTimeLimit'] = "%02d:%02d:00" % divmod(int(batchvars['walllimit']), 60)
        batchvars['localPATH'] = akrr.sshCommand(sh, "echo $PATH").strip()
        batchvars['akrrAppKerName'] = self.app['name']
        batchvars['akrrResourceName'] = self.resource['name']
        batchvars['akrrTimeStamp'] = self.timeStamp
        if batchvars['akrrNNodes'] == 1:
            batchvars['akrrPPN4NodesOrCores4OneNode'] = batchvars['akrrNCores']
        else:
            batchvars['akrrPPN4NodesOrCores4OneNode'] = batchvars['akrrPPN']
        if 'nodeListSetterTemplate' not in batchvars:
            batchvars['nodeListSetterTemplate'] = batchvars['nodeListSetter'][batchvars['batchScheduler']]

        # process templates
        batchvars['akrrCommonCommands'] = akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'], batchvars, keepDoubleBrakets=True)
        batchvars['akrrCommonCleanup'] = akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'], batchvars, keepDoubleBrakets=True)

        # do parameters adjustment
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)

        # generate job script
        jobScript = akrr.formatRecursively(self.resource["batchJobTemplate"], batchvars)
        localJobScriptPath = os.path.join(self.taskDir, "jobfiles", self.JobScriptName)
        with open(localJobScriptPath, "w") as fout:
            fout.write(jobScript)
        akrr.scpToResource(self.resource, localJobScriptPath, os.path.join(self.remoteTaskDir))

        akrr.sshCommand(sh, "cat %s " % (self.JobScriptName))

        # send to queue
        from string import Template
        scheduler = self.resource['batchScheduler']
        sendToQueue = Template(submitCommands[scheduler]).substitute(scriptPath=self.JobScriptName)
        msg = akrr.sshCommand(sh, sendToQueue)
        matchObj = re.search(jidExtractPatterns[scheduler], msg, re.M | re.S)
        if not matchObj:
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id. " + msg)
        try:
            JobID = int(matchObj.group(1))
        except (IndexError, TypeError, ValueError):
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id. " + msg)

        akrr.sshCommand(sh, "echo %d > job.id" % (JobID))

        # cp job id to subtasks
        for subtask_id, subtask_status, subtask_datetimestamp, subtask_resource, subtask_app, subtask_task_param in subTaskInfo:
            remoteSubTaskDir = self.GetRemoteTaskDir(self.resource['akrrdata'], subtask_app, subtask_datetimestamp)
            akrr.sshCommand(sh, "cp job.id %s" % (remoteSubTaskDir))

        self.RemoteJobID = JobID
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        sh = None
        # Same output as the two-argument Python 2 print statement.
        print("\nRemoteJobID= %s" % (self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource, os.path.join(self.remoteTaskDir, "*"), os.path.join(self.taskDir, "jobfiles"), "-r")

        # update DB time_submitted_to_queue
        # NOTE(review): no explicit commit is issued; presumably akrr.getDB()
        # returns an autocommit connection - confirm.
        db, cur = akrr.getDB()
        try:
            cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''', (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        finally:
            cur.close()
        del db

        self.status = "Created batch job script and have submitted it to remote queue."
        self.statusinfo = "Remote job ID is %d" % (self.RemoteJobID)
        self.ToDoNextString = "CheckTheJobOnRemoteMachine"

        # check first time in 1 minute
        return datetime.timedelta(days=0, hours=0, minutes=1)
    except Exception:
        # Capture the traceback before cleanup: on Python 2 an exception
        # handled inside this handler would replace the one reported by
        # traceback.format_exc().
        errorTrace = traceback.format_exc()
        if sh is not None:
            try:
                sh.sendline("exit")
                sh.close(force=True)
            except Exception:
                # Best-effort cleanup: the session may already be dead, and a
                # failure here must not hide the original error.
                pass
            sh = None
        self.status = "ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo = errorTrace
        if akrr.max_fails_to_submit_to_the_queue >= 0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue += 1
                if self.FailsToSubmitToTheQueue > akrr.max_fails_to_submit_to_the_queue:
                    # Stop execution of the task and submit results to db
                    self.ToDoNextString = "PushToDB"
                    resultFile = os.path.join(self.taskDir, "result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue = 1
        else:
            self.FatalErrorsCount += 1
        akrr.printException(self.status)
        # Other code in this file reads the snake_case setting name; prefer it
        # and fall back to the older CamelCase name this method used.
        retryDelay = getattr(akrr, "repeat_after_fails_to_submit_to_the_queue", None)
        if retryDelay is None:
            retryDelay = akrr.RepeateAfterFailsToSubmitToTheQueue
        return retryDelay
def CreateBatchJobScriptAndSubmitIt(self, doNotSubmitToQueue=False):
    """Generate the batch job script, upload it and (optionally) submit it.

    Over one ssh session to ``self.resource`` this makes sure the akrrdata,
    application and task directories exist remotely, calls
    ``self.GenerateBatchJobScript()`` and copies
    ``<taskDir>/jobfiles/<JobScriptName>`` to the remote task directory.

    The script is submitted to the scheduler only when ``'masterTaskID'`` is
    not in ``self.taskParam``; otherwise ``self.RemoteJobID`` is set to 0 and
    the task waits for its master task.

    Args:
        doNotSubmitToQueue: when true, stop right after the script has been
            uploaded; nothing is submitted and no status is updated.

    Returns:
        ``None`` when ``doNotSubmitToQueue`` is true. Otherwise a
        ``datetime.timedelta``: 1 minute after a submission, ``111*365`` days
        for a subtask, 3 seconds once the allowed number of submission
        failures is exceeded (``ToDoNextString`` becomes ``"PushToDB"``), or
        ``akrr.repeat_after_fails_to_submit_to_the_queue`` for a retry.

    Raises:
        Nothing is propagated for ``Exception`` subclasses; they are recorded
        in ``self.status`` / ``self.statusinfo``.
    """
    self.JobScriptName = self.GetJobScriptName(self.appName)
    # Single-argument print(...) emits the same text as the Python 2 print
    # statement used elsewhere in this file.
    print("### Creating batch job script and submitting it to remote machine")
    # as a current bypass will create a job script remotely and copy it here
    sh = None
    try:
        sh = akrr.sshResource(self.resource)

        def checkAndCreateDir(session, d):
            """Create remote directory ``d`` if missing; raise if it still does not exist."""
            akrr.sshCommand(session, "if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi" % (d, d))
            reply = akrr.sshCommand(session, "if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi" % (d))
            if reply.find("DOESNOTEXIST") >= 0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES, "Can not create directory %s on %s." % (d, self.resource['name']))

        checkAndCreateDir(sh, self.resource['akrrdata'])
        checkAndCreateDir(sh, os.path.join(self.resource['akrrdata'], self.appName))
        checkAndCreateDir(sh, self.remoteTaskDir)

        # Later commands (cat, submit, job.id) use paths relative to this dir.
        akrr.sshCommand(sh, "cd %s" % (self.remoteTaskDir))

        self.GenerateBatchJobScript()

        akrr.scpToResource(self.resource, os.path.join(self.taskDir, "jobfiles", self.JobScriptName), os.path.join(self.remoteTaskDir))
        if doNotSubmitToQueue:
            # Release the ssh session before leaving; returning directly from
            # here would leave it open.
            sh.sendline("exit")
            sh.close(force=True)
            sh = None
            return

        akrr.sshCommand(sh, "cat %s " % (self.JobScriptName))

        # send to queue
        from string import Template
        isIndependentTask = 'masterTaskID' not in self.taskParam
        JobID = 0
        if isIndependentTask:
            # i.e. submit to queue only if task is independent
            scheduler = self.resource['batchScheduler']
            sendToQueue = Template(submitCommands[scheduler]).substitute(scriptPath=self.JobScriptName)
            msg = akrr.sshCommand(sh, sendToQueue)
            matchObj = re.search(jidExtractPatterns[scheduler], msg, re.M | re.S)
            if not matchObj:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id:\n" + msg)
            try:
                JobID = int(matchObj.group(1))
            except (IndexError, TypeError, ValueError):
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB, "Can't get job id:\n" + msg)

            # NOTE(review): job.id is written only for independent tasks here;
            # the original nesting of this statement could not be recovered
            # from the collapsed source - confirm.
            akrr.sshCommand(sh, "echo %d > job.id" % (JobID))

        self.RemoteJobID = JobID
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        sh = None
        # Same output as the two-argument Python 2 print statement.
        print("\nRemoteJobID= %s" % (self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource, os.path.join(self.remoteTaskDir, "*"), os.path.join(self.taskDir, "jobfiles"), "-r")

        # update DB time_submitted_to_queue
        # NOTE(review): no explicit commit is issued; presumably akrr.getDB()
        # returns an autocommit connection - confirm.
        db, cur = akrr.getDB()
        try:
            cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''', (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        finally:
            cur.close()
        del db

        if isIndependentTask:
            # i.e. independent task
            self.status = "Created batch job script and have submitted it to remote queue."
            self.statusinfo = "Remote job ID is %d" % (self.RemoteJobID)
            self.ToDoNextString = "CheckTheJobOnRemoteMachine"

            # check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)

        # i.e. this is a subtask
        self.status = "Created batch job script."
        self.statusinfo = "Created batch job script. Waiting for master task to execute it."
        self.ToDoNextString = "CheckTheJobOnRemoteMachine"

        # master task will update the time when it will finish task execution
        return datetime.timedelta(days=111 * 365)
    except Exception:
        # Capture the traceback before cleanup: on Python 2 an exception
        # handled inside this handler would replace the one reported by
        # traceback.format_exc().
        errorTrace = traceback.format_exc()
        if sh is not None:
            try:
                sh.sendline("exit")
                sh.close(force=True)
            except Exception:
                # Best-effort cleanup: the session may already be dead, and a
                # failure here must not hide the original error.
                pass
            sh = None
        self.status = "ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo = errorTrace
        if akrr.max_fails_to_submit_to_the_queue >= 0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue += 1
                # .get() keeps a missing 'test_run' key from raising inside
                # the error handler; "== True" is kept so truthy-but-not-True
                # values compare exactly as before.
                isTestRun = self.taskParam.get('test_run', False) == True  # noqa: E712
                if (self.FailsToSubmitToTheQueue > akrr.max_fails_to_submit_to_the_queue
                        or (isTestRun and self.FailsToSubmitToTheQueue >= 2)):
                    # Stop execution of the task and submit results to db
                    self.ToDoNextString = "PushToDB"
                    resultFile = os.path.join(self.taskDir, "result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue = 1
        else:
            self.FatalErrorsCount += 1
        akrr.printException(self.status)
        return akrr.repeat_after_fails_to_submit_to_the_queue