예제 #1
0
 def CheckTheJobOnRemoteMachine(self):
     """Ask the remote batch scheduler about this task's job and advance the task.

     The scheduler-specific entry ``waitExprs[self.resource['batchScheduler']]``
     supplies the status command and the pattern used to recognise the job in
     the command output. The command is executed over a short-lived ssh
     session which is always closed before the output is inspected.

     Outcomes:

     * job found in the output, time in queue within the limit: status is
       updated and ``active_task_default_attempt_repeat`` is returned so the
       same check is repeated later;
     * job found but in the queue longer than ``MaxTimeInQueue`` (task
       parameter, falling back to ``akrr.max_time_in_queue``): the job is
       terminated, remote files are copied to ``<taskDir>/jobfiles``, the
       remote folder is deleted and the task is marked as an error;
     * job not found: remote files are copied back, the remote folder is
       deleted and ``TimeJobPossiblyCompleted`` is recorded.

     In the last two cases the next step is
     ``CheckIfSubtasksDoneProccessingResults`` and a 3 second delay is returned.

     Any ``Exception`` is recorded in ``self.status``/``self.statusinfo``,
     counted in ``self.FatalErrorsCount`` and the same step is scheduled again
     after ``active_task_default_attempt_repeat``.

     Returns the delay before the next step should be executed.
     """
     try:
         print("### Checking the job status on remote machine")
         from string import Template
         # Layout used below: [0] status command template, [1] matcher callable
         # invoked as matcher(pattern, output, wE[3]), [2] pattern template.
         # NOTE(review): wE[1]/wE[3] look like an re function and its flags - confirm.
         wE=waitExprs[self.resource['batchScheduler']]
         jobId=str(self.RemoteJobID)
         cmd=Template(wE[0]).substitute(jobId=jobId)
         rege=Template(wE[2]).substitute(jobId=jobId)

         sh=akrr.sshResource(self.resource)
         try:
             msg=akrr.sshCommand(sh,cmd)
         finally:
             # close the session whether or not the command succeeded
             sh.sendline("exit")
             sh.close(force=True)

         if wE[1](rege,msg,wE[3]):
             print("Still in queue. Either waiting or running")
             maxTimeInQueue=self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue)
             if datetime.datetime.today()-self.TimeJobSubmetedToRemoteQueue>maxTimeInQueue:
                 print("ERROR:")
                 print("Job exceeds the maximal time in queue (%s). And will be terminated."%(str(maxTimeInQueue),))
                 print("Removing job from remote queue.")
                 self.Terminate()
                 print("copying files from remote machine")
                 akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
                 print("Deleting all files from remote machine")
                 self.DeleteRemoteFolder()
                 self.status="ERROR: Job exceeds the maximal time in queue (%s) and was terminated."%(str(maxTimeInQueue),)
                 # msg still holds the scheduler output obtained above
                 self.statusinfo="\nLast Status report:\n"+msg
                 self.ReportFormat="Error"
                 self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"

                 self.UpdateSubTasks()
                 return datetime.timedelta(seconds=3)

             self.status="Still in queue. Either waiting or running"
             self.statusinfo=msg
             return active_task_default_attempt_repeat

         print("Not in queue. Either exited with error or executed successfully.")
         print("copying files from remote machine")
         akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
         print("Deleting all files from remote machine")
         self.DeleteRemoteFolder()
         self.status="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
         self.statusinfo="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
         self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
         self.UpdateSubTasks()
         self.TimeJobPossiblyCompleted=datetime.datetime.today()
         return datetime.timedelta(seconds=3)
     except Exception:
         # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
         # are not turned into a retry of this step.
         self.status="ERROR Can not check the status of the job on remote resource"
         self.statusinfo=traceback.format_exc()
         self.FatalErrorsCount+=1
         akrr.printException(self.status)
         self.ToDoNextString="CheckTheJobOnRemoteMachine"
         return active_task_default_attempt_repeat
예제 #2
0
    def CreateBatchJobScriptAndSubmitIt(self):
        """Build the batch job script of a task with subtasks, upload and submit it.

        Within one ssh session to ``self.resource`` the method:

        1. makes sure the remote akrrdata, application and task directories
           exist;
        2. collects template variables from, in increasing priority,
           ``self.resource``, ``self.app``, the wall-limit default stored in
           the DB, ``self.resourceParam`` and ``self.appParam``;
        3. builds ``subTasksExecution``, a shell fragment that runs the job
           script of every subtask one after another (optionally shuffled);
        4. derives node/core counts and the wall time string, renders
           ``self.resource["batchJobTemplate"]``, writes the script to
           ``<taskDir>/jobfiles/<JobScriptName>`` and copies it to the remote
           task directory;
        5. submits the script with the scheduler's submit command, extracts the
           job id from the output and stores it in ``job.id`` in the remote
           task directory and in every subtask directory.

        Afterwards the remote task directory is copied back and
        ``time_submitted_to_queue`` is updated in ``ACTIVETASKS``.

        Returns the delay before the next step:

        * ``timedelta(minutes=1)`` on success (next step
          ``CheckTheJobOnRemoteMachine``);
        * ``timedelta(seconds=3)`` when the number of failed submissions
          exceeded ``akrr.max_fails_to_submit_to_the_queue`` (an error XML is
          written and the next step is ``PushToDB``);
        * ``akrr.RepeateAfterFailsToSubmitToTheQueue`` after any other failure.
        """
        self.JobScriptName=self.appName+".job"
        print("### Creating batch job script and submitting it to remote machine")

        sh=None
        try:
            sh=akrr.sshResource(self.resource)

            def CheckAndCreateDir(self,sh,d):
                """Create remote directory d if it is missing; raise akrrError if it still does not exist."""
                akrr.sshCommand(sh,"if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d))
                msg=akrr.sshCommand(sh,"if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d))
                if "DOESNOTEXIST" in msg:
                    raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))

            # akrrdata, directory of the app, directory of the task - in that order
            for remoteDir in (self.resource['akrrdata'],
                              os.path.join(self.resource['akrrdata'],self.appName),
                              self.remoteTaskDir):
                CheckAndCreateDir(self,sh,remoteDir)

            #cd to remoteTaskDir, relative paths below depend on it
            akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

            #get walltime from DB (best effort: on failure only the other sources are used)
            dbdefaults={}
            try:
                db,cur=akrr.getDB()

                cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS
                WHERE task_id=%s ;''',(self.task_id,))
                raw=cur.fetchall()
                (resource,app,resource_param,app_param)=raw[0]

                cur.execute("""SELECT walllimit
                    FROM akrr_default_walllimit
                    WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
                raw=cur.fetchall()

                if raw:
                    dbdefaults['walllimit']=raw[0][0]

                cur.close()
                del db
            except Exception:
                # do not abort the submission, but do not hide the problem either
                print("WARNING: Can not get default walllimit from DB:\n"+traceback.format_exc())

            #variables for the job script template, later sources override earlier ones
            batchvars={}
            for di in [self.resource,self.app,dbdefaults,self.resourceParam, self.appParam]:
                batchvars.update(di)

            #stack the subtasks
            subTaskInfo=self.GetSubTaskInfo()
            if batchvars['shuffleSubtasks']:
                random.shuffle(subTaskInfo)
            subTaskCommands=[]
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                SubTaskJobScriptPath=os.path.join(remoteSubTaskDir,self.GetJobScriptName(subtask_app))

                subTaskCommands.append("cd "+remoteSubTaskDir+"\n")
                subTaskCommands.append("echo Starting "+subtask_app+"\n")
                subTaskCommands.append(self.resource['shell']+" "+SubTaskJobScriptPath+" > stdout 2> stderr\n")
                subTaskCommands.append("echo Done with "+subtask_app+"\n"+"\n")
            batchvars['subTasksExecution']="".join(subTaskCommands)

            #calculate NNodes and NCores
            if 'nnodes' in batchvars:
                tmpNNodes=batchvars['nnodes']
                tmpNCores=tmpNNodes*batchvars['ppn']
            else:
                tmpNCores=batchvars['ncores']
                # number of nodes is ncores/ppn rounded up
                tmpNNodes,coresOnPartialNode=divmod(tmpNCores,batchvars['ppn'])
                if coresOnPartialNode!=0:
                    tmpNNodes+=1

            batchvars['akrrNCores']=tmpNCores
            batchvars['akrrNNodes']=tmpNNodes

            #Set batchvars remaps
            batchvars['akrrPPN']=batchvars['ppn']
            batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
            batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
            # walllimit is split by 60 into the first two fields of an ??:??:00 string
            wallLimitHigh,wallLimitLow=divmod(int(batchvars['walllimit']),60)
            batchvars['akrrWallTimeLimit']="%02d:%02d:00"%(wallLimitHigh,wallLimitLow)
            # PATH as reported by the ssh session opened above
            batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
            batchvars['akrrAppKerName']=self.app['name']
            batchvars['akrrResourceName']=self.resource['name']
            batchvars['akrrTimeStamp']= self.timeStamp
            if batchvars['akrrNNodes']==1:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
            else:
                batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
            if 'nodeListSetterTemplate' not in batchvars:
                batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]

            #process templates
            batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
            batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)

            #do parameters adjustment
            if 'process_params' in batchvars:
                batchvars['process_params'](batchvars)

            #generate job script locally and copy it to the remote task directory
            jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
            localJobScriptPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
            with open(localJobScriptPath,"w") as fout:
                fout.write(jobScript)
            akrr.scpToResource(self.resource,localJobScriptPath,os.path.join(self.remoteTaskDir))

            akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

            #send to queue
            from string import Template
            sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg=akrr.sshCommand(sh,sendToQueue)
            matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)
            if not matchObj:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
            try:
                JobID=int(matchObj.group(1))
            except (IndexError,TypeError,ValueError):
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)

            akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

            #cp job id to subtasks
            for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
                remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
                akrr.sshCommand(sh,"cp job.id %s"%(remoteSubTaskDir))

            self.RemoteJobID=JobID
            self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

            sh.sendline("exit")
            sh.close(force=True)
            # mark the session as closed so the error handler does not touch it again
            sh=None
            print("\nRemoteJobID= %s"%(self.RemoteJobID,))
            # NOTE(review): a failure from here on is handled like a failed
            # submission although the job is already queued - confirm intended.
            print("copying files from remote machine")
            akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

            #update DB time_submitted_to_queue
            # NOTE(review): no explicit commit; presumably akrr.getDB() returns
            # an autocommit connection - confirm.
            db,cur=akrr.getDB()

            cur.execute('''UPDATE ACTIVETASKS
            SET time_submitted_to_queue=%s
            WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))

            cur.close()
            del db

            self.status="Created batch job script and have submitted it to remote queue."
            self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
            self.ToDoNextString="CheckTheJobOnRemoteMachine"

            #check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        except Exception:
            if sh is not None:
                sh.sendline("exit")
                sh.close(force=True)
            self.status="ERROR Can not created batch job script and submit it to remote queue"
            self.statusinfo=traceback.format_exc()
            if akrr.max_fails_to_submit_to_the_queue>=0:
                if hasattr(self, "FailsToSubmitToTheQueue"):
                    self.FailsToSubmitToTheQueue+=1
                    if self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue:
                        #Stop execution of the task and submit results to db
                        self.ToDoNextString="PushToDB"
                        resultFile=os.path.join(self.taskDir,"result.xml")
                        self.WriteErrorXML(resultFile)
                        return  datetime.timedelta(seconds=3)
                else:
                    # first failure: start counting, the limit is checked from the second one on
                    self.FailsToSubmitToTheQueue=1
            else:
                # negative limit: submission failures are counted as fatal errors instead
                self.FatalErrorsCount+=1

            akrr.printException(self.status)
            return akrr.RepeateAfterFailsToSubmitToTheQueue
예제 #3
0
 def CreateBatchJobScriptAndSubmitIt(self,doNotSubmitToQueue=False):
     """Generate the batch job script, upload it and (normally) submit it.

     Within one ssh session to ``self.resource`` the method makes sure the
     remote akrrdata, application and task directories exist, calls
     ``self.GenerateBatchJobScript()`` and copies
     ``<taskDir>/jobfiles/<JobScriptName>`` to the remote task directory.

     If ``doNotSubmitToQueue`` is true the method stops there and returns
     ``None``.

     Otherwise, for an independent task (no ``'masterTaskID'`` in
     ``self.taskParam``) the script is submitted with the scheduler's submit
     command and the job id is parsed from the output; for a subtask nothing
     is submitted and the job id is 0. The id is written to ``job.id`` in the
     remote task directory, the remote directory is copied back and
     ``time_submitted_to_queue`` is updated in ``ACTIVETASKS``.

     Returns the delay before the next step (``CheckTheJobOnRemoteMachine``):

     * ``timedelta(minutes=1)`` for an independent task;
     * ``timedelta(days=111*365)`` for a subtask (the in-code comment says
       the master task updates that time);
     * ``timedelta(seconds=3)`` when submission failed too often (more than
       ``akrr.max_fails_to_submit_to_the_queue`` times, or twice for a task
       with ``test_run`` set); an error XML is written and the next step is
       ``PushToDB``;
     * ``akrr.repeat_after_fails_to_submit_to_the_queue`` after any other
       failure.
     """
     self.JobScriptName=self.GetJobScriptName(self.appName)
     print("### Creating batch job script and submitting it to remote machine")

     sh=None
     try:
         sh=akrr.sshResource(self.resource)

         def CheckAndCreateDir(self,sh,d):
             """Create remote directory d if it is missing; raise akrrError if it still does not exist."""
             akrr.sshCommand(sh,"if [ ! -d  \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d))
             msg=akrr.sshCommand(sh,"if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d))
             if "DOESNOTEXIST" in msg:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))

         # akrrdata, directory of the app, directory of the task - in that order
         for remoteDir in (self.resource['akrrdata'],
                           os.path.join(self.resource['akrrdata'],self.appName),
                           self.remoteTaskDir):
             CheckAndCreateDir(self,sh,remoteDir)

         #cd to remoteTaskDir, relative paths below depend on it
         akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

         #GenerateBatchJobScript
         self.GenerateBatchJobScript()

         akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))
         if doNotSubmitToQueue:
             # close the session explicitly, as every other exit path does
             sh.sendline("exit")
             sh.close(force=True)
             sh=None
             return
         akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

         #send to queue, but only if the task is independent
         from string import Template
         isSubtask='masterTaskID' in self.taskParam
         JobID=0
         if not isSubtask:
             sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
             msg=akrr.sshCommand(sh,sendToQueue)
             matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)
             if not matchObj:
                 raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
             try:
                 JobID=int(matchObj.group(1))
             except (IndexError,TypeError,ValueError):
                 raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)

         akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

         self.RemoteJobID=JobID
         self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

         sh.sendline("exit")
         sh.close(force=True)
         # mark the session as closed so the error handler does not touch it again
         sh=None
         print("\nRemoteJobID= %s"%(self.RemoteJobID,))
         print("copying files from remote machine")
         akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

         #update DB time_submitted_to_queue
         # NOTE(review): no explicit commit; presumably akrr.getDB() returns
         # an autocommit connection - confirm.
         db,cur=akrr.getDB()

         cur.execute('''UPDATE ACTIVETASKS
         SET time_submitted_to_queue=%s
         WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))

         cur.close()
         del db

         if isSubtask:
             self.status="Created batch job script."
             self.statusinfo="Created batch job script. Waiting for master task to execute it."
             self.ToDoNextString="CheckTheJobOnRemoteMachine"

             #master task will update the time when it will finish task execution
             return datetime.timedelta(days=111*365)

         #independent task
         self.status="Created batch job script and have submitted it to remote queue."
         self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
         self.ToDoNextString="CheckTheJobOnRemoteMachine"

         #check first time in 1 minute
         return datetime.timedelta(days=0, hours=0, minutes=1)
     except Exception:
         if sh is not None:
             sh.sendline("exit")
             sh.close(force=True)
         self.status="ERROR Can not created batch job script and submit it to remote queue"
         self.statusinfo=traceback.format_exc()
         if akrr.max_fails_to_submit_to_the_queue>=0:
             if hasattr(self, "FailsToSubmitToTheQueue"):
                 self.FailsToSubmitToTheQueue+=1
                 # .get(): a missing 'test_run' must not raise inside the error handler
                 if (self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue or
                         (self.taskParam.get('test_run')==True and self.FailsToSubmitToTheQueue>=2)):
                     #Stop execution of the task and submit results to db
                     self.ToDoNextString="PushToDB"
                     resultFile=os.path.join(self.taskDir,"result.xml")
                     self.WriteErrorXML(resultFile)
                     return  datetime.timedelta(seconds=3)
             else:
                 # first failure: start counting, the limit is checked from the second one on
                 self.FailsToSubmitToTheQueue=1
         else:
             # negative limit: submission failures are counted as fatal errors instead
             self.FatalErrorsCount+=1

         akrr.printException(self.status)
         return akrr.repeat_after_fails_to_submit_to_the_queue