def CheckAndCreateDir(self,sh,d):
    """
    Make sure directory ``d`` exists on the remote side of the session ``sh``.

    A shell snippet creates the directory when it is missing (plain ``mkdir``,
    so the parent must already exist), then a second snippet probes for it.

    Raises akrr.akrrError(akrr.ERROR_REMOTE_FILES, ...) when the probe still
    reports the directory as absent.
    """
    # Create only if absent.
    create_script = "if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi" % (d, d)
    akrr.sshCommand(sh, create_script)

    # Probe again: the mkdir above may have failed silently.
    probe_script = "if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi" % (d)
    probe_output = akrr.sshCommand(sh, probe_script)
    if "DOESNOTEXIST" not in probe_output:
        return

    raise akrr.akrrError(
        akrr.ERROR_REMOTE_FILES,
        "Can not create directory %s on %s." % (d, self.resource['name']))
def CheckDirSimple(sh,d):
    """
    Check that directory ``d`` exists on the remote host and is read/writable.

    All commands are run through ``akrr.sshCommand`` on the session ``sh``.
    After the existence test, a small probe file ``akrrtestwrite`` is written
    into the directory, read back, and removed if the read-back matched.

    Returns a ``(status, message)`` tuple:
        (None,  message)  -- the directory does not exist
        (True,  message)  -- the probe file could be written and read back
        (False, message)  -- the directory exists but the probe failed
    """
    cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
    msg=akrr.sshCommand(sh,cmd)
    if msg.find("DOESNOTEXIST")>=0:
        return (None,"Directory %s:%s does not exists!"%(sh.remotemachine,d))

    # The probe path is double-quoted exactly like the existence test above,
    # so a directory whose name contains spaces is handled consistently
    # instead of passing the first check and then breaking the redirect.
    probeFile=os.path.join(d,'akrrtestwrite')

    akrr.sshCommand(sh,"echo test > \"%s\""%(probeFile))
    msg=akrr.sshCommand(sh,"cat \"%s\""%(probeFile))

    if msg.strip()=="test":
        # Clean up only when the probe is known to exist.
        akrr.sshCommand(sh,"rm \"%s\""%(probeFile))
        return (True,"Directory exist and accessible for read/write")

    return (False,"Directory %s:%s is NOT accessible for read/write!"%(sh.remotemachine,d))
def CheckDir(sh, d,exitOnFail=True,tryToCreate=True):
    """
    Check a remote directory, optionally creating it and optionally exiting.

    The directory is first examined with CheckDirSimple. When it is missing
    and ``tryToCreate`` is set, ``mkdir -p`` is run remotely and the check is
    repeated.

    With ``exitOnFail`` equal to False the ``(status, message)`` pair from
    CheckDirSimple is returned as is. Otherwise ``(True, message)`` is
    returned on success, and on failure the problem is reported through
    ``logerr`` and ``exit()`` is called.
    """
    outcome, report = CheckDirSimple(sh, d)

    if outcome is None and tryToCreate == True:
        log("Directory %s:%s does not exists, will try to create it"%(sh.remotemachine,d))
        akrr.sshCommand(sh, "mkdir -p \"%s\"" % (d))
        outcome, report = CheckDirSimple(sh, d)

    # Caller wants to deal with failures itself.
    if exitOnFail == False:
        return outcome, report

    if outcome == True:
        return (True, report)

    if outcome is None:
        logerr("Directory %s:%s does not exists!"%(sh.remotemachine,d))
    else:
        logerr("Directory %s:%s is NOT accessible for read/write!"%(sh.remotemachine,d))
    exit()
def CreateBatchJobScriptAndSubmitIt(self):
    """
    Render the batch job script for this task, upload it and submit it.

    Inside a single ssh session to ``self.resource`` this method:
      1. creates the remote akrrdata, application and task directories;
      2. merges resource, app, DB-default and per-task parameters into the
         template variables, including one execution stanza per subtask
         returned by ``self.GetSubTaskInfo()``;
      3. renders ``self.resource["batchJobTemplate"]``, writes it to
         ``<taskDir>/jobfiles/<JobScriptName>`` and copies it to the remote
         task directory;
      4. runs the scheduler's submit command, extracts the job id from its
         output, stores it in a remote ``job.id`` file and copies that file
         into every subtask directory;
      5. copies the remote task directory back and records
         ``time_submitted_to_queue`` in ACTIVETASKS.

    Returns the delay before the task's next step:
      * one minute after a successful submission;
      * three seconds when the number of failed submissions exceeds
        ``akrr.max_fails_to_submit_to_the_queue`` (next step is "PushToDB");
      * ``akrr.RepeateAfterFailsToSubmitToTheQueue`` after any other failure.
    Exceptions raised while submitting are caught and recorded in
    ``self.status`` / ``self.statusinfo``.
    """
    self.JobScriptName=self.appName+".job"
    print("### Creating batch job script and submitting it to remote machine")

    #get ssh to remote resource
    sh=None
    try:
        sh=akrr.sshResource(self.resource)

        #Create remote directories if needed
        def CheckAndCreateDir(self,sh,d):
            cmd="if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
            akrr.sshCommand(sh,cmd)
            cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
            msg=akrr.sshCommand(sh,cmd)
            if msg.find("DOESNOTEXIST")>=0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))

        #akrrdata, then dir for app, then dir for task (mkdir is single level,
        #so the order matters)
        CheckAndCreateDir(self,sh,self.resource['akrrdata'])
        CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
        CheckAndCreateDir(self,sh,self.remoteTaskDir)

        #cd to remoteTaskDir; later relative paths (job script, job.id) rely on it
        akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

        #get walltime from DB
        dbdefaults={}
        try:
            db,cur=akrr.getDB()
            cur.execute('''SELECT resource,app,resource_param,app_param FROM ACTIVETASKS WHERE task_id=%s ;''',(self.task_id,))
            raw=cur.fetchall()
            (resource,app,resource_param,app_param)=raw[0]
            cur.execute("""SELECT walllimit FROM akrr_default_walllimit WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,(resource,app,resource_param,app_param))
            raw=cur.fetchall()
            if len(raw)>0:
                dbdefaults['walllimit']=raw[0][0]
            cur.close()
            del db
        except Exception:
            # Best effort: when the lookup fails, no DB default is added and
            # 'walllimit' must come from the resource/app/task parameters.
            pass

        #create job-script variables; later dictionaries override earlier ones
        batchvars={}
        for di in [self.resource,self.app,dbdefaults,self.resourceParam, self.appParam]:
            batchvars.update(di)

        #stack the subtasks
        subTaskInfo=self.GetSubTaskInfo()
        if batchvars['shuffleSubtasks']:
            random.shuffle(subTaskInfo)

        execLines=[]
        for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
            remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
            SubTaskJobScriptName=self.GetJobScriptName(subtask_app)
            SubTaskJobScriptPath=os.path.join(remoteSubTaskDir,SubTaskJobScriptName)

            execLines.append("cd "+remoteSubTaskDir+"\n")
            execLines.append("echo Starting "+subtask_app+"\n")
            execLines.append(self.resource['shell']+" "+SubTaskJobScriptPath+" > stdout 2> stderr\n")
            execLines.append("echo Done with "+subtask_app+"\n"+"\n")
        batchvars['subTasksExecution']="".join(execLines)

        #calculate NNodes and NCores
        if 'nnodes' in batchvars:
            tmpNNodes=batchvars['nnodes']
            tmpNCores=tmpNNodes*batchvars['ppn']
        else:
            tmpNCores=batchvars['ncores']
            # Round up to whole nodes; floor division keeps the count integral.
            tmpNNodes=tmpNCores//batchvars['ppn']
            if tmpNCores%batchvars['ppn']!=0:
                tmpNNodes+=1

        batchvars['akrrNCores']=tmpNCores
        batchvars['akrrNNodes']=tmpNNodes

        #Set batchvars remaps
        batchvars['akrrPPN']=batchvars['ppn']
        batchvars['akrrNCoresToBorder']=batchvars['akrrPPN']*batchvars['akrrNNodes']
        batchvars['akrrTaskWorkingDir']=self.remoteTaskDir
        # walllimit is split into a quotient and a remainder by 60 and
        # rendered as "QQ:RR:00".
        batchvars['akrrWallTimeLimit']="%02d:%02d:00"%divmod(int(batchvars['walllimit']),60)
        batchvars['localPATH']=akrr.sshCommand(sh,"echo $PATH").strip()
        batchvars['akrrAppKerName']=self.app['name']
        batchvars['akrrResourceName']=self.resource['name']
        batchvars['akrrTimeStamp']= self.timeStamp
        if batchvars['akrrNNodes']==1:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrNCores']
        else:
            batchvars['akrrPPN4NodesOrCores4OneNode']=batchvars['akrrPPN']
        if 'nodeListSetterTemplate' not in batchvars:
            batchvars['nodeListSetterTemplate']=batchvars['nodeListSetter'][batchvars['batchScheduler']]

        #process templates
        batchvars['akrrCommonCommands']=akrr.formatRecursively(batchvars['akrrCommonCommandsTemplate'],batchvars,keepDoubleBrakets=True)
        batchvars['akrrCommonCleanup']=akrr.formatRecursively(batchvars['akrrCommonCleanupTemplate'],batchvars,keepDoubleBrakets=True)

        #do parameters adjustment
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)

        #generate job script
        jobScript=akrr.formatRecursively(self.resource["batchJobTemplate"],batchvars)
        localJobScriptPath=os.path.join(self.taskDir,"jobfiles",self.JobScriptName)
        # 'with' closes the file even if the write fails.
        with open(localJobScriptPath,"w") as fout:
            fout.write(jobScript)
        akrr.scpToResource(self.resource,localJobScriptPath,os.path.join(self.remoteTaskDir))

        akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

        #send to queue
        from string import Template
        sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
        msg=akrr.sshCommand(sh,sendToQueue)
        matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)

        if not matchObj:
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)
        try:
            JobID=int(matchObj.group(1))
        except Exception:
            raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id. "+msg)

        akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

        #cp job id to subtasks
        for subtask_id,subtask_status,subtask_datetimestamp,subtask_resource,subtask_app,subtask_task_param in subTaskInfo:
            remoteSubTaskDir=self.GetRemoteTaskDir(self.resource['akrrdata'],subtask_app,subtask_datetimestamp)
            akrr.sshCommand(sh,"cp job.id %s"%(remoteSubTaskDir))

        self.RemoteJobID=JobID
        self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        # Mark the session as closed so the error handler does not close it again.
        sh=None
        print("\nRemoteJobID= %s"%(self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

        #update DB time_submitted_to_queue
        db,cur=akrr.getDB()
        cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
        cur.close()
        del db

        self.status="Created batch job script and have submitted it to remote queue."
        self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
        self.ToDoNextString="CheckTheJobOnRemoteMachine"

        #check first time in 1 minute
        return datetime.timedelta(days=0, hours=0, minutes=1)
    except Exception:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            sh=None
        self.status="ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo=traceback.format_exc()

        if akrr.max_fails_to_submit_to_the_queue>=0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue+=1
                if self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue:
                    #Stop execution of the task and submit results to db
                    self.ToDoNextString="PushToDB"
                    resultFile=os.path.join(self.taskDir,"result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue=1
        else:
            # A negative limit means failures are counted as fatal errors.
            self.FatalErrorsCount+=1

        akrr.printException(self.status)
        return akrr.RepeateAfterFailsToSubmitToTheQueue
def CheckTheJobOnRemoteMachine(self):
    """
    Ask the remote scheduler whether the job is still queued or running.

    The scheduler-specific entry ``waitExprs[batchScheduler]`` supplies:
    ``[0]`` the status command template, ``[1]`` the matching function,
    ``[2]`` the pattern template and ``[3]`` the extra argument passed to
    the matching function. Both templates are filled with the remote job id.

    Outcomes:
      * job still listed and within the time limit -- status is updated and
        ``active_task_default_attempt_repeat`` is returned;
      * job still listed but older than ``MaxTimeInQueue`` (task parameter,
        default ``akrr.max_time_in_queue``) -- the job is terminated, files
        are copied back, the remote folder is deleted, subtasks are updated
        and three seconds is returned;
      * job no longer listed -- files are copied back, the remote folder is
        deleted, subtasks are updated and three seconds is returned;
      * any exception -- recorded in ``self.status`` / ``self.statusinfo``,
        ``FatalErrorsCount`` is incremented and
        ``active_task_default_attempt_repeat`` is returned.
    """
    sh=None
    try:
        print("### Checking the job status on remote machine")
        from string import Template
        wE=waitExprs[self.resource['batchScheduler']]
        cmd =Template(wE[0]).substitute(jobId=str(self.RemoteJobID))
        rege=Template(wE[2]).substitute(jobId=str(self.RemoteJobID))

        sh=akrr.sshResource(self.resource)
        msg=akrr.sshCommand(sh,cmd)
        sh.sendline("exit")
        sh.close(force=True)
        # Mark the session as closed so the error handler does not close it again.
        sh=None

        matchObj= wE[1](rege,msg,wE[3])
        if matchObj:
            print("Still in queue. Either waiting or running")
            maxTimeInQueue=self.taskParam.get('MaxTimeInQueue',akrr.max_time_in_queue)
            if datetime.datetime.today()-self.TimeJobSubmetedToRemoteQueue>maxTimeInQueue:
                print("ERROR:")
                print("Job exceeds the maximal time in queue (%s). And will be terminated."%(str(maxTimeInQueue)))
                print("Removing job from remote queue.")
                self.Terminate()
                print("copying files from remote machine")
                akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
                print("Deleting all files from remote machine")
                self.DeleteRemoteFolder()
                self.status="ERROR: Job exceeds the maximal time in queue (%s) and was terminated."%(str(maxTimeInQueue))
                self.statusinfo="\nLast Status report:\n"+msg
                self.ReportFormat="Error"
                self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"

                self.UpdateSubTasks()
                return datetime.timedelta(seconds=3)

            self.status="Still in queue. Either waiting or running"
            self.statusinfo=msg
            return active_task_default_attempt_repeat

        print("Not in queue. Either exited with error or executed successfully.")
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")
        print("Deleting all files from remote machine")
        self.DeleteRemoteFolder()
        self.status="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
        self.statusinfo="Not in queue. Either exited with error or executed successfully. Copied all files to local machine. Deleted all files from remote machine"
        self.ToDoNextString="CheckIfSubtasksDoneProccessingResults"
        self.UpdateSubTasks()
        self.TimeJobPossiblyCompleted=datetime.datetime.today()
        return datetime.timedelta(seconds=3)
    except Exception:
        # 'except Exception' (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being recorded as a task error.
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            sh=None
        self.status="ERROR Can not check the status of the job on remote resource"
        self.statusinfo=traceback.format_exc()
        self.FatalErrorsCount+=1
        akrr.printException(self.status)
        self.ToDoNextString="CheckTheJobOnRemoteMachine"
        return active_task_default_attempt_repeat
sys.stdout=sys.__stdout__ sys.stderr=sys.__stderr__ except Exception,e: msg2=str_io.getvalue() msg2+="\n"+traceback.format_exc() sys.stdout=sys.__stdout__ sys.stderr=sys.__stderr__ logerr("Can not connect to """+resource['name']+"\n"+ "Probably invalid credential, see full error report below",msg2) exit() print "="*80 log("Successfully connected to %s\n\n"%(resource['name']),highlight="ok") ############################################################################################### log("Checking if shell is BASH\n") msg=akrr.sshCommand(rsh,"echo $BASH") if msg.count("bash")>0: log("Shell is BASH\n",highlight="ok") else: logerr("Shell on headnode of %s is not BASH, change it to bash and try again.\n"%(resource_name,)) exit() ############################################################################################### log("Checking directory locations\n") d=resource['akrrData'] log("Checking: %s:%s"%(resource['remoteAccessNode'],d)) status,msg=CheckDir(rsh, d,exitOnFail=True,tryToCreate=True) log(msg+"\n",highlight="ok") d=resource['appKerDir']
def CreateBatchJobScriptAndSubmitIt(self,doNotSubmitToQueue=False):
    """
    Generate the batch job script, upload it and (optionally) submit it.

    Inside one ssh session to ``self.resource`` the remote akrrdata,
    application and task directories are created, the script is produced by
    ``self.GenerateBatchJobScript()`` and copied to the remote task directory.

    Parameters:
        doNotSubmitToQueue -- when true, stop after uploading the script;
            the ssh session is closed and None is returned.

    A task whose ``taskParam`` has no 'masterTaskID' is submitted with the
    scheduler's submit command and its job id is parsed from the output.
    A task that has 'masterTaskID' is not submitted here and gets job id 0.

    Returns the delay before the task's next step:
      * one minute after an independent task was submitted;
      * ``timedelta(days=111*365)`` for a subtask (the source comment says
        the master task updates the time when it finishes);
      * three seconds when submission failures exceed
        ``akrr.max_fails_to_submit_to_the_queue``, or reach two on a test
        run (next step is "PushToDB");
      * ``akrr.repeat_after_fails_to_submit_to_the_queue`` otherwise.
    Exceptions raised while submitting are caught and recorded in
    ``self.status`` / ``self.statusinfo``.
    """
    self.JobScriptName=self.GetJobScriptName(self.appName)
    print("### Creating batch job script and submitting it to remote machine")

    #get ssh to remote resource
    sh=None
    try:
        sh=akrr.sshResource(self.resource)

        #Create remote directories if needed
        def CheckAndCreateDir(self,sh,d):
            cmd="if [ ! -d \"%s\" ]\n then mkdir \"%s\"\n fi"%(d,d)
            akrr.sshCommand(sh,cmd)
            cmd="if [ -d \"%s\" ]\n then \necho EXIST\n else echo DOESNOTEXIST\n fi"%(d)
            msg=akrr.sshCommand(sh,cmd)
            if msg.find("DOESNOTEXIST")>=0:
                raise akrr.akrrError(akrr.ERROR_REMOTE_FILES,"Can not create directory %s on %s."%(d,self.resource['name']))

        #akrrdata, then dir for app, then dir for task (mkdir is single level,
        #so the order matters)
        CheckAndCreateDir(self,sh,self.resource['akrrdata'])
        CheckAndCreateDir(self,sh,os.path.join(self.resource['akrrdata'],self.appName))
        CheckAndCreateDir(self,sh,self.remoteTaskDir)

        #cd to remoteTaskDir; later relative paths (job script, job.id) rely on it
        akrr.sshCommand(sh,"cd %s"%(self.remoteTaskDir))

        #GenerateBatchJobScript
        self.GenerateBatchJobScript()

        akrr.scpToResource(self.resource,os.path.join(self.taskDir,"jobfiles",self.JobScriptName),os.path.join(self.remoteTaskDir))
        if doNotSubmitToQueue:
            # Close the ssh session before leaving; returning with it open
            # would leak the connection.
            sh.sendline("exit")
            sh.close(force=True)
            sh=None
            return

        akrr.sshCommand(sh,"cat %s "%(self.JobScriptName))

        #send to queue
        from string import Template
        JobID=0
        isIndependentTask='masterTaskID' not in self.taskParam
        if isIndependentTask:
            #i.e. submit to queue only if task is independent
            sendToQueue=Template(submitCommands[self.resource['batchScheduler']]).substitute(scriptPath=self.JobScriptName)
            msg=akrr.sshCommand(sh,sendToQueue)
            matchObj=re.search(jidExtractPatterns[self.resource['batchScheduler']],msg,re.M|re.S)

            if not matchObj:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)
            try:
                JobID=int(matchObj.group(1))
            except Exception:
                raise akrr.akrrError(akrr.ERROR_REMOTE_JOB,"Can't get job id:\n"+msg)

            # NOTE(review): job.id is written only for independent tasks here;
            # the nesting was inferred from a flattened source -- confirm.
            akrr.sshCommand(sh,"echo %d > job.id"%(JobID))

        self.RemoteJobID=JobID
        self.TimeJobSubmetedToRemoteQueue=datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        # Mark the session as closed so the error handler does not close it again.
        sh=None
        print("\nRemoteJobID= %s"%(self.RemoteJobID,))
        print("copying files from remote machine")
        akrr.scpFromResource(self.resource,os.path.join(self.remoteTaskDir,"*"),os.path.join(self.taskDir,"jobfiles"),"-r")

        #update DB time_submitted_to_queue
        db,cur=akrr.getDB()
        cur.execute('''UPDATE ACTIVETASKS SET time_submitted_to_queue=%s WHERE task_id=%s ;''',(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"),self.task_id))
        cur.close()
        del db

        if isIndependentTask:
            self.status="Created batch job script and have submitted it to remote queue."
            self.statusinfo="Remote job ID is %d"%(self.RemoteJobID)
            self.ToDoNextString="CheckTheJobOnRemoteMachine"

            #check first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)

        #i.e. this is a subtask
        self.status="Created batch job script."
        self.statusinfo="Created batch job script. Waiting for master task to execute it."
        self.ToDoNextString="CheckTheJobOnRemoteMachine"

        #master task will update the time when it will finish task execution
        return datetime.timedelta(days=111*365)
    except Exception:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            sh=None
        self.status="ERROR Can not created batch job script and submit it to remote queue"
        self.statusinfo=traceback.format_exc()

        if akrr.max_fails_to_submit_to_the_queue>=0:
            if hasattr(self, "FailsToSubmitToTheQueue"):
                self.FailsToSubmitToTheQueue+=1
                # .get avoids a KeyError inside this handler (which would hide
                # the original failure) when 'test_run' was never set.
                isTestRun=self.taskParam.get('test_run')==True
                if (self.FailsToSubmitToTheQueue>akrr.max_fails_to_submit_to_the_queue or
                        (isTestRun and self.FailsToSubmitToTheQueue>=2)):
                    #Stop execution of the task and submit results to db
                    self.ToDoNextString="PushToDB"
                    resultFile=os.path.join(self.taskDir,"result.xml")
                    self.WriteErrorXML(resultFile)
                    return datetime.timedelta(seconds=3)
            else:
                self.FailsToSubmitToTheQueue=1
        else:
            # A negative limit means failures are counted as fatal errors.
            self.FatalErrorsCount+=1

        akrr.printException(self.status)
        return akrr.repeat_after_fails_to_submit_to_the_queue
def getFileSytemAccessPoints():
    """
    Interactively collect the remote directory locations for a resource.

    Fills the module globals ``localScratch``, ``networkScratch``,
    ``appKerDir`` and ``akrrData``. For each one the user is prompted (an
    empty answer selects the default shown in brackets), the directory is
    checked over the ``rsh`` session, and the accepted value is finally
    passed through a remote ``echo`` so that shell variables are expanded.

    * local scratch   -- asked once; a failed check only produces warnings.
    * network scratch -- re-asked only while the answer is empty; a failed
      check produces a warning and the value is still kept.
    * app kernel dir and akrr data dir -- re-asked until the check succeeds.
    """
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir

    remoteHome=akrr.sshCommand(rsh,"echo $HOME").strip()
    remoteScratchEnv=akrr.sshCommand(rsh,"echo $SCRATCH").strip()

    #localScratch
    defaultLocal="/tmp"
    logging.input("Enter location of local scratch (visible only to single node):")
    localScratch=raw_input("[%s]"%defaultLocal)
    if localScratch.strip()=="":
        localScratch=defaultLocal
    ok,report=resource_validation_and_deployment.CheckDirSimple(rsh, localScratch)
    if ok:
        logging.info(report)
    else:
        logging.warning(report)
        logging.warning('local scratch might be have a different location on head node, so if it is by design it is ok')
    print("")
    localScratch=akrr.sshCommand(rsh,"echo %s"%(localScratch,)).strip()

    #networkScratch; default is $SCRATCH from the remote side, if it is set
    defaultNetwork=remoteScratchEnv
    networkScratchVisible=False
    while True:
        logging.input("Enter location of network scratch (visible only to all nodes), used for temporary storage of app kernel input/output:")
        if defaultNetwork!="":
            networkScratch=raw_input("[%s]"%defaultNetwork)
            if networkScratch.strip()=="":
                networkScratch=defaultNetwork
        else:
            networkScratch=raw_input("")

        if networkScratch=="":
            logging.error("Incorrect value for networkScratch, try again")
            continue

        ok,report=resource_validation_and_deployment.CheckDir(rsh, networkScratch,exitOnFail=False,tryToCreate=True)
        if ok:
            logging.info(report)
            networkScratchVisible=True
            print("")
        else:
            logging.warning(report)
        # Either way the answer is accepted after a single check.
        break
    networkScratch=akrr.sshCommand(rsh,"echo %s"%(networkScratch,)).strip()

    #appKerDir
    defaultAppKer=os.path.join(remoteHome,"appker",resourceName)
    ok=False
    while not ok:
        logging.input("Enter future location of app kernels input and executable files:")
        appKerDir=raw_input("[%s]"%defaultAppKer)
        if appKerDir.strip()=="":
            appKerDir=defaultAppKer
        ok,report=resource_validation_and_deployment.CheckDir(rsh, appKerDir,exitOnFail=False,tryToCreate=True)
        if ok:
            logging.info(report)
            print("")
        else:
            logging.error(report)
    appKerDir=akrr.sshCommand(rsh,"echo %s"%(appKerDir,)).strip()

    #akrrData; prefer the network scratch when its check succeeded
    if networkScratchVisible:
        defaultAkrrData=os.path.join(networkScratch,"akrrdata",resourceName)
    else:
        defaultAkrrData=os.path.join(remoteHome,"akrrdata",resourceName)
    ok=False
    while not ok:
        logging.input("Enter future locations for app kernels working directories (can or even should be on scratch space):")
        akrrData=raw_input("[%s]"%defaultAkrrData)
        if akrrData.strip()=="":
            akrrData=defaultAkrrData
        ok,report=resource_validation_and_deployment.CheckDir(rsh, akrrData,exitOnFail=False,tryToCreate=True)
        if ok:
            logging.info(report)
            print("")
        else:
            logging.error(report)
    akrrData=akrr.sshCommand(rsh,"echo %s"%(akrrData,)).strip()