def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
                                     'calibration spinup for the National Water Model')
    parser.add_argument('jobID',metavar='jobID',type=str,nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('--optDbPath',type=str,nargs='?',
                        help='Optional alternative path to SQLite DB file.')

    args = parser.parse_args()

    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)

    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()

    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    if staticData.coldStart == 1:
        print "ERROR: User has specified a cold-start option for calibration. Exiting...."
        sys.exit(0)
    if staticData.optSpinFlag == 1:
        print "ERROR: User has specified an optional spinup file. Exiting...."
        sys.exit(0)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # Establish LOCK file to secure this Python program to make sure
    # no other instances over-step here. This is mostly designed to deal
    # with nohup processes being kicked off Yellowstone/Cheyenne/Crontabs arbitrarily.
    # Just another check/balance here.
    pyLockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(pyLockPath):
        # Either a job is still running, or was running and was killed.
        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(pyLockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
            print "JOB: " + str(pidCheck) + \
                  " Is still running."
            sys.exit(0)
        else:
            print "JOB: " + str(pidCheck) + \
                  " Has Failed. Removing LOCK " + \
                  " file."
            os.remove(pyLockPath)
            fileObj = open(pyLockPath,'w')
            fileObj.write('\"PID\"\n')
            fileObj.write(str(os.getpid()))
            fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 15 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(pyLockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system.
        #    # Exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(pyLockPath)
        #    fileObj = open(pyLockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        # Write a LOCK file for this program.
        fileObj = open(pyLockPath,'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()

    # Some housekeeping here. If the spinup is already complete, throw an error.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) == 1:
        jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)

    if userTmp != jobData.owner:
        print "User: " + str(userTmp) + " is requesting to take over jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0

        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS

        # TEMPORARY FOR VERSION 1.2 NWM CALIBRATION!!!!
        # If a new owner takes over, simply change the owner, but keep all
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''
            newSlackUName = ''

        try:
            db.updateJobOwner(jobData,userTmp,newEmail,newSlackChannel,newSlackToken,newSlackUName,changeFlag)
        except:
            errMod.errOut(jobData)

        jobData.genMsg = "MSG: User: " + str(userTmp) + " Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)

    # Begin an "infinite" do loop. This loop will continue to loop through all
    # the basins until spinups are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user.
    completeStatus = False

    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    #  0.0 - Initial value
    #  0.5 - Model simulation in progress
    #  1.0 - Model simulation complete
    # -0.5 - Model simulation failed once and a restart has been attempted
    # -1.0 - Model has failed twice. A LOCK file has been created.
    # Once all array elements are 1.0, then completeStatus goes to True, an entry into
    # the database occurs, and the program will complete.
    keySlot = np.empty(len(jobData.gages))
    keySlot[:] = 0.0
    entryValue = float(len(jobData.gages))

    # Create an array to hold system job ID values. This will only be used for
    # PBS as qstat has demonstrated slow behavior when doing a full qstat command.
    # We will track job ID values and do a qstat <jobID> and populate this array
    # to keep track of things.
    pbsJobId = np.empty([len(jobData.gages)],np.int64)
    pbsJobId[:] = -9999

    while not completeStatus:
        # Walk through the spinup directory for each basin. Determine the status of
        # the model runs by the files available. If restarting, modify the
        # namelist files appropriately. Then, restart the model. Once all
        # basins have been accounted for, fire off the monitoring program through
        # nohup to keep track of the models. If anything goes wrong, notifications
        # will either be emailed per the user's info, or piped to Slack for group
        # notification.
        # Loop through each basin. Perform the following steps:
        # 1.) If status is -0.5, 0.0, or 0.5, check to see if the model is running
        #     for this basin.
        # 2.) If the model is not running, check for expected output and perform
        #     necessary logistics. Continue to the next basin.
        # If the status goes to -1.0, a LOCK file is created and must be manually
        # removed by the user. Once the program detects this, it will restart the
        # model and the status goes back to 0.5.
        # If the status is -0.5 and no job is running, output must be complete, or
        # status goes to -1.0.
        # If output is not complete, the model is still running, status stays at 0.5.
        # If no job is running, and output has been completed, status goes to 1.0.
        # This continues indefinitely until statuses for ALL basins go to 1.0.
        for basin in range(0,len(jobData.gages)):
            try:
                spinupMod.runModel(jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],keySlot,basin,pbsJobId)
            except:
                errMod.errOut(jobData)

        # TEMPORARY FOR CHEYENNE
        # Check to make sure program hasn't passed a prescribed time limit. If it has,
        # exit gracefully.
        #timeCheckStamp = datetime.datetime.now()
        #programDtCheck = timeCheckStamp - begTimeStamp
        #if programDtCheck.seconds/60.0 > 90.0:
        #    # 90-minute limit
        #    try:
        #        fileObj = open(pyLockPath,'a')
        #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
        #        fileObj.close()
        #    except:
        #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + pyLockPath
        #        errMod.errOut(jobData)

        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue:
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "SPINUP FOR JOB ID: " + str(jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True

        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(pyLockPath,'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + pyLockPath
            errMod.errOut(jobData)

    # Remove LOCK file
    os.remove(pyLockPath)
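
# The LOCK file handling above depends on errMod.check_pid() to decide whether the
# PID recorded in PYTHON.LOCK still belongs to a live process; that helper is not
# shown in this section. The sketch below is a hypothetical illustration of how such
# a check is commonly written with os.kill(pid, 0); it is not necessarily how errMod
# implements it, and the function name check_pid_sketch is made up for this example.
import errno
import os

def check_pid_sketch(pid):
    # Hypothetical stand-in for errMod.check_pid(). Signal 0 delivers no signal,
    # but raises OSError if the PID cannot be signalled, which lets us test
    # whether the process recorded in the LOCK file is still alive.
    try:
        os.kill(pid, 0)
    except OSError as err:
        if err.errno == errno.ESRCH:
            # No process with this PID exists.
            return False
        # EPERM: the process exists but is owned by another user.
        return err.errno == errno.EPERM
    return True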
def main(argv):
    # Parse arguments. User must input a job name and directory.
    parser = argparse.ArgumentParser(description='Utility program to report the position' + \
                                     ' of a calibration job.')
    parser.add_argument('jobID', metavar='jobID', type=str, nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('contactFlag', metavar='ctFlag', type=int, nargs='+',
                        help='1 = send to job contact, 0 = print to screen.')
    parser.add_argument('inDB', metavar='inDB', type=str, nargs='+',
                        help='Required path to SQLite3 DB file.')
    parser.add_argument('--email', nargs='?', help='Optional email to pipe output to.')
    #parser.add_argument('--hostname',type=str,nargs='?',
    #                    help='Optional hostname MySQL DB resides on. Will use localhost if not passed.')
    #parser.add_argument('--pwd',metavar='pwd',type=str,nargs='?',help='Password to the Database.')

    args = parser.parse_args()

    # If the SQLite DB file does not exist, throw an error to the user.
    if not os.path.isfile(args.inDB[0]):
        print "ERROR: Unable to locate DB file: " + args.inDB[0]
        sys.exit(1)

    # Create dictionary of specified status messages.
    msgDict = {'-1.0': 'MODEL RUN LOCKED.',
               '-0.75': 'MAIN CALIBRATION PROGRAM LOCKED',
               '-0.5': 'MODEL FAILED ONCE - RUNNING AGAIN',
               '-0.25': 'MODEL FAILED ONCE - WAITING',
               '-0.1': 'CALIBRATION PROGRAM FOR DEFAULT PARAMETERS LOCKED',
               '0.0': 'NOT STARTED',
               '0.25': 'CALIBRATION PROGRAM FOR DEFAULT PARAMETERS RUNNING',
               '0.5': 'MODEL CURRENTLY RUNNING',
               '0.75': 'MODEL COMPLETE READY FOR PARAMETER ESTIMATION',
               '0.9': 'PARAMETER ESTIMATION OCCURRING',
               '1.0': 'MODEL ITERATION COMPLETE'}

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = args.inDB[0]

    # Lookup database username/login credentials based on username
    # running program.
    #if not args.pwd:
    #    try:
    #        pwdTmp = getpass.getpass('Enter Database Password: ')
    #        jobData.dbUName = 'WH_Calib_rw'
    #        jobData.dbPwd = str(pwdTmp)
    #    except:
    #        print "ERROR: Unable to authenticate credentials for database."
    #        sys.exit(1)
    #else:
    #    jobData.dbPwd = args.pwd
    #jobData.dbUName= 'WH_Calib_rw'
    #jobData.port = 5432
    #if not args.hostname:
    #    # We will assume localhost for Postgres DB
    #    hostTmp = 'localhost'
    #else:
    #    hostTmp = str(args.hostname)
    #jobData.host = hostTmp

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # If an optional email was passed to the program, update the job object to
    # reflect this for information dissemination.
    if args.email:
        jobData.slackObj = None
        jobData.email = str(args.email)

    # Loop through each basin. Determine which iteration we are on, then report the
    # status of the job for this basin.
    msgOut = ''
    iterArray = np.empty([int(jobData.nIter)], np.int)
    completeArray = np.empty([int(jobData.nIter)], np.float)
    meanSum = 0.0
    for basin in range(0, len(jobData.gages)):
        iterArray[:] = 0
        completeArray[:] = 0.0
        keyStatus = 0.0
        keyStatusPrev = 0.0
        domainID = jobData.gageIDs[basin]
        iterComplete = 1
        statusData = db.iterationStatus(jobData, domainID, str(jobData.gages[basin]))
        for iteration in range(0, int(jobData.nIter)):
            keyStatus = float(statusData[iteration][1])
            iterationTmp = int(statusData[iteration][0])
            iterArray[iteration] = iterationTmp
            completeArray[iteration] = keyStatus
        indComplete = np.where(completeArray == 1)
        indCheck1 = np.where(completeArray != 1.0)
        indCheck2 = np.where(completeArray == 0.0)
        meanSum = meanSum + len(indComplete[0])
        if len(indComplete[0]) == int(jobData.nIter):
            msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                     ": CALIBRATION COMPLETE.\n"
        else:
            if len(indCheck2[0]) == int(jobData.nIter):
                msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                         " - HAS NOT BEGUN CALIBRATION.\n"
            else:
                iterLastComplete = len(indComplete[0])
                iterCurrent = iterLastComplete + 1
                indCurrent = np.where(iterArray == iterCurrent)
                statusCurrent = completeArray[indCurrent[0][0]]
                ind2 = np.where(completeArray != 0.0) and np.where(completeArray != 1.0)
                iterTmp = iterArray[ind2[0][0]]
                msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                         ": " + str(msgDict[str(statusCurrent)]) + \
                         " - ITERATION: " + str(iterCurrent) + "\n"

    print "MEAN COMPLETENESS = " + str(float(meanSum) / len(jobData.gages))

    jobData.genMsg = msgOut
    if int(args.contactFlag[0]) == 0:
        print jobData.genMsg
    else:
        errMod.sendMsg(jobData)
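
# The status report above maps each basin's current keyStatus value onto a
# human-readable message through msgDict, whose keys are the string forms of the
# discrete status codes. Below is a minimal, self-contained sketch of that lookup
# using a trimmed-down table (not the full dictionary); the function name
# statusMessage is made up for this example, and it assumes the database returns
# exactly the discrete codes listed in msgDict.
msgDictExample = {'0.0': 'NOT STARTED',
                  '0.5': 'MODEL CURRENTLY RUNNING',
                  '0.9': 'PARAMETER ESTIMATION OCCURRING',
                  '1.0': 'MODEL ITERATION COMPLETE'}

def statusMessage(keyStatus, msgTable):
    # str(0.5) -> '0.5' and str(0.90) -> '0.9', matching the key format above.
    key = str(float(keyStatus))
    return msgTable.get(key, 'UNKNOWN STATUS: ' + key)

print statusMessage(0.5, msgDictExample)    # MODEL CURRENTLY RUNNING
print statusMessage(0.90, msgDictExample)   # PARAMETER ESTIMATION OCCURRING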
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
                                     'sensitivity analysis for WRF-Hydro')
    parser.add_argument('jobID',metavar='jobID',type=str,nargs='+',
                        help='Job ID specific to your sensitivity/calibration workflow job.')
    parser.add_argument('--optDbPath',type=str,nargs='?',
                        help='Optional alternative path to SQLite DB file.')

    args = parser.parse_args()

    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)

    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()

    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # If the sensitivity flag is 0, simply exit gracefully as the user specified
    # not to run sensitivity analysis.
    if jobData.sensFlag != 1:
        print "ERROR: Sensitivity flag was set to 0 for this workflow."
        sys.exit(1)

    # Establish LOCK file to secure this Python program to make sure
    # no other instances over-step here. This is mostly designed to deal
    # with nohup processes being kicked off Yellowstone/Cheyenne/Crontabs arbitrarily.
    # Just another check/balance here.
    lockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(lockPath):
        # Either a job is still running, or was running and was killed.
        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(lockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
            print "JOB: " + str(pidCheck) + \
                  " Is still running."
            sys.exit(0)
        else:
            print "JOB: " + str(pidCheck) + \
                  " Has Failed. Removing LOCK " + \
                  " file."
            os.remove(lockPath)
            fileObj = open(lockPath,'w')
            fileObj.write('\"PID\"\n')
            fileObj.write(str(os.getpid()))
            fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 15 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(lockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system.
        #    # Exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(lockPath)
        #    fileObj = open(lockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        # Write a LOCK file for this program.
        fileObj = open(lockPath,'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # Some housekeeping here. If the sensitivity is already complete, throw an error.
    # Also ensure the spinup has been entered as complete. This is necessary for the
    # sensitivity to run.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) != 1:
        # Check to see if optional spinup options were enabled. If so, update the spinup status.
        if staticData.coldStart == 1 or staticData.optSpinFlag != 0:
            print "Found optional spinup alternatives"
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
        else:
            jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                             " is NOT complete. You must complete the spinup in order" + \
                             " to run calibration."
            errMod.errOut(jobData)

    if int(jobData.sensComplete) == 1:
        jobData.errMsg = "ERROR: Sensitivity for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)

    if userTmp != jobData.owner:
        print "User: " + str(userTmp) + " is requesting to take over jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0

        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS

        jobData.genMsg = "MSG: User: " + str(userTmp) + " Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)

        # TEMPORARY FOR VERSION 1.2 NWM CALIBRATION!!!!
        # If a new owner takes over, simply change the owner, but keep all
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''
            newSlackUName = ''

        try:
            db.updateJobOwner(jobData,userTmp,newEmail,newSlackChannel,newSlackToken,newSlackUName,changeFlag)
        except:
            errMod.errOut(jobData)

    # Begin an "infinite" do loop. This loop will continue to loop through all
    # the basins until sensitivity jobs are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user.
    completeStatus = False

    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    #  0.0  - Initial value
    #  0.10 - Job to generate parameter grids for each model job is being run.
    #  0.25 - Job to generate parameter grids is complete. Ready to run models.
    #  0.5  - Model simulations are in progress
    #  0.75 - Job to read in model output and run sensitivity analysis is ready to be run.
    #  0.90 - Job to read in model output and run sensitivity analysis is running.
    #  1.0  - Sensitivity analysis complete
    # -0.1  - Parameter generation failed. A LOCK file has been created.
    # -0.5  - Model simulation failed once and a restart has been attempted
    # -0.90 - Sensitivity analysis job has failed. A LOCK file has been created.
    # -1.0  - Model has failed twice. A LOCK file has been created.
    # Once all array elements reach their final value (2.0 once sensitivity statistics
    # are logged), completeStatus goes to True, an entry into the database occurs,
    # and the program will complete.
    keySlot = np.empty([len(jobData.gages),int(jobData.nSensIter)])
    keySlot[:,:] = 0.0
    entryValue = float(len(jobData.gages)*int(jobData.nSensIter)*2.0)

    # Create an array to hold system job ID values. This will only be used for
    # PBS as qstat has demonstrated slow behavior when doing a full qstat command.
    # We will track job ID values and do a qstat <jobID> and populate this array
    # to keep track of things.
    pbsJobId = np.empty([len(jobData.gages),int(jobData.nSensIter)],np.int64)
    pbsJobId[:,:] = -9999
    pbsCollectId = np.empty([len(jobData.gages),int(jobData.nSensIter)],np.int64)
    pbsCollectId[:,:] = -9999
    pbsPreId = np.empty([len(jobData.gages)],np.int64)
    pbsPreId[:] = -9999
    pbsPostId = np.empty([len(jobData.gages)],np.int64)
    pbsPostId[:] = -9999

    # Pull all the status values into the keySlot array.
    for basin in range(0,len(jobData.gages)):
        domainID = jobData.gageIDs[basin]

        if domainID == -9999:
            jobData.errMsg = "ERROR: Unable to locate domainID for gage: " + str(jobData.gages[basin])
            errMod.errOut(jobData)

        # We are going to pull all values for one basin, then place them into the array.
        # This is faster than looping over each iteration at a time.
        statusData = db.sensIterationStatus(jobData,domainID,str(jobData.gages[basin]))
        statusData = [list(item) for item in statusData]
        for iteration in range(0,int(jobData.nSensIter)):
            for iteration2 in range(0,int(jobData.nSensIter)):
                if statusData[iteration2][0] == iteration+1:
                    keySlot[basin,iteration] = float(statusData[iteration2][1])

    if len(np.where(keySlot != 0.0)[0]) == 0:
        # We need to either check to see if pre-processing has taken place, or
        # run it.
        preProcStatus = False

    while not completeStatus:
        # Walk through each basin undergoing sensitivity analysis.
        for basin in range(0,len(jobData.gages)):
            print "GAGE: " + jobData.gages[basin]
            # Establish a status value for pre-processing the parameter values from R/Python code.
            preProcStatus = False

            # Establish a status value for post-processing the model output and running sensitivity
            # analysis.
            postProcStatus = False

            # Calculate the number of "batches" we are going to run
            nBatches = int(jobData.nSensIter/jobData.nSensBatch)
            entryValueBatch = float(jobData.nSensBatch)

            # If we have a pre-processing complete file, set our pre-proc status to True.
            # Also, log parameter values generated if the log file hasn't been created.
            preProcComplete = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/preProc.COMPLETE"
            parmsLogged = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/PARAMS_LOGGED.COMPLETE"
            parmTxtFile = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/params_new.txt"
            sensLogged = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/SENS_LOGGED.COMPLETE"
            sensStats = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/stat_sensitivity.txt"
            missingFlag = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/CALC_STATS_MISSING"
            if os.path.isfile(preProcComplete):
                preProcStatus = True
                print "PRE PROCESSING COMPLETE!"
                if not os.path.isfile(parmsLogged):
                    # Log parameter values generated by pre-processing.
                    print "LOGGING PRE-PROC PARAM FILES."
                    try:
                        db.insertSensParms(jobData,parmsLogged,parmTxtFile,jobData.gageIDs[basin])
                    except:
                        jobData.errMsg = ("WARNING: Unable to log sensitivity parameters for basin: " + str(basin) + \
                                          " Job: " + str(jobData.jobID))
                        errMod.errOut(jobData)
            if not preProcStatus:
                try:
                    sensitivityMod.preProc(preProcStatus,jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],pbsPreId,basin)
                except:
                    errMod.errOut(jobData)
            else:
                # The goal here is to only operate on a fixed number of model runs at a time.
                # If you have a large parameter sample size, it's possible to have hundreds,
                # if not thousands, of model permutations. This workflow allows for
                # only batches of model runs to be run at a time so as to not bog down the system.
                for batchIter in range(0,nBatches):
                    time.sleep(30)
                    batchCheck = keySlot[basin,(batchIter*jobData.nSensBatch):((batchIter+1)*jobData.nSensBatch)]
                    if batchIter == 0:
                        batchCheckPrev = entryValueBatch
                    else:
                        batchCheckPrev = keySlot[basin,((batchIter-1)*jobData.nSensBatch):(batchIter*jobData.nSensBatch)]
                        batchCheckPrev = batchCheckPrev.sum()
                    if batchCheck.sum() != entryValueBatch and batchCheckPrev == entryValueBatch:
                        for iterTmp in range(0,jobData.nSensBatch):
                            iteration = batchIter*jobData.nSensBatch + iterTmp
                            keyCheck1 = keySlot[basin,iteration]
                            if keyCheck1 < 1:
                                # This model iteration has not completed.
                                try:
                                    sensitivityMod.runModel(jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],keySlot,basin,iteration,pbsJobId,pbsCollectId)
                                except:
                                    errMod.errOut(jobData)

                                if keySlot[basin,iteration] == 0.0 and keyCheck1 == 0.5:
                                    # Put some spacing between launching model simulations to keep the queue
                                    # from getting overloaded.
                                    time.sleep(3)

                                # Update the temporary status array as it will be checked for this batch of model runs.
                                batchCheck[iterTmp] = keySlot[basin,iteration]

            # Run post-processing ONLY when all model simulations are finished.
            if not postProcStatus and preProcStatus and len(np.where(batchCheck != 1.0)[0]) == 0:
                print "READY FOR POST PROCESSING"
                try:
                    sensitivityMod.postProc(postProcStatus,jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],pbsPostId,basin)
                except:
                    errMod.errOut(jobData)

                postProcComplete = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/postProc.COMPLETE"
                if os.path.isfile(postProcComplete):
                    if not os.path.isfile(sensLogged) and not os.path.isfile(missingFlag):
                        # Log sensitivity statistics into the database.
                        if not os.path.isfile(sensStats):
                            jobData.errMsg = "ERROR: Expected to find: " + sensStats + " after post-processing. Not found."
                            errMod.errOut(jobData)
                        else:
                            try:
                                db.logSensStats(jobData,sensStats,jobData.gageIDs[basin],sensLogged)
                            except:
                                errMod.errOut(jobData)
                        # Check for complete flag on logging sensitivity statistics.
                        if os.path.isfile(sensLogged):
                            postProcStatus = True
                            # Upgrade key status values as necessary
                            for iterTmp in range(0,jobData.nSensIter):
                                keySlot[basin,iterTmp] = 2.0
                    elif os.path.isfile(sensLogged):
                        # Post-processing complete and statistics were successfully logged.
                        postProcStatus = True
                        # Upgrade key status values as necessary
                        for iterTmp in range(0,jobData.nSensIter):
                            keySlot[basin,iterTmp] = 2.0
                    elif os.path.isfile(missingFlag):
                        # Missing obs were found. We will default to making this basin complete.
                        for iterTmp in range(0,jobData.nSensIter):
                            keySlot[basin,iterTmp] = 2.0

        # TEMPORARY FOR CHEYENNE
        # Check to make sure program hasn't passed a prescribed time limit. If it has,
        # exit gracefully.
        #timeCheckStamp = datetime.datetime.now()
        #programDtCheck = timeCheckStamp - begTimeStamp
        #if programDtCheck.seconds/60.0 > 90.0:
        #    # 90-minute limit
        #    try:
        #        fileObj = open(lockPath,'a')
        #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
        #        fileObj.close()
        #    except:
        #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
        #        errMod.errOut(jobData)

        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue and postProcStatus:
            jobData.sensComplete = 1
            try:
                db.updateSensStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "SENSITIVITY FOR JOB ID: " + str(jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True

        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(lockPath,'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
            errMod.errOut(jobData)

    # Remove LOCK file
    os.remove(lockPath)
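
# The batch loop above throttles how many of the nSensIter sensitivity runs are active
# at once: each basin's iterations are walked in slices of nSensBatch, and a slice only
# launches once the previous slice has fully completed. The following is a minimal,
# self-contained sketch of that slicing and gating logic with made-up dimensions; it
# assumes, as the loop above does, that nSensIter divides evenly by nSensBatch.
import numpy as np

nSensIter = 12
nSensBatch = 4
keySlotBasin = np.zeros(nSensIter)   # one status value per iteration for a single basin

nBatches = int(nSensIter / nSensBatch)
entryValueBatch = float(nSensBatch)
for batchIter in range(0, nBatches):
    batchCheck = keySlotBasin[(batchIter * nSensBatch):((batchIter + 1) * nSensBatch)]
    if batchIter == 0:
        batchCheckPrev = entryValueBatch
    else:
        batchCheckPrev = keySlotBasin[((batchIter - 1) * nSensBatch):(batchIter * nSensBatch)].sum()
    # A batch only launches model runs if it is not yet complete AND the previous
    # batch has fully completed (every status in that slice equals 1.0).
    readyToRun = (batchCheck.sum() != entryValueBatch and batchCheckPrev == entryValueBatch)
    print "BATCH " + str(batchIter) + " covers iterations " + \
          str(batchIter * nSensBatch + 1) + "-" + str((batchIter + 1) * nSensBatch) + \
          ", ready to run: " + str(readyToRun)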
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
                                     'calibration for WRF-Hydro')
    parser.add_argument('jobID', metavar='jobID', type=str, nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('--optDbPath', type=str, nargs='?',
                        help='Optional alternative path to SQLite DB file.')

    args = parser.parse_args()

    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)

    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()

    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # If the calibration flag is 0, simply exit gracefully as the user specified
    # not to run calibration.
    if jobData.calibFlag != 1:
        print "ERROR: Calibration flag was set to 0 for this workflow."
        sys.exit(1)

    # Establish LOCK file to secure this Python program to make sure
    # no other instances over-step here. This is mostly designed to deal
    # with nohup processes being kicked off Yellowstone/Cheyenne/Crontabs arbitrarily.
    # Just another check/balance here.
    lockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(lockPath):
        # Either a job is still running, or was running and was killed.
        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(lockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
            print "JOB: " + str(pidCheck) + \
                  " Is still running."
            sys.exit(0)
        else:
            print "JOB: " + str(pidCheck) + \
                  " Has Failed. Removing LOCK " + \
                  " file."
            os.remove(lockPath)
            fileObj = open(lockPath, 'w')
            fileObj.write('\"PID\"\n')
            fileObj.write(str(os.getpid()))
            fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 15 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(lockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system.
        #    # Exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(lockPath)
        #    fileObj = open(lockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        print 'LOCK FILE NOT FOUND.'
        # Write a LOCK file for this program.
        fileObj = open(lockPath, 'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # Some housekeeping here. If the calibration is already complete, throw an error.
    # Also ensure the spinup has been entered as complete. This is necessary for the
    # calibration to run.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) != 1:
        # Check to see if optional spinup options were enabled. If so, update the spinup status.
        if staticData.coldStart == 1 or staticData.optSpinFlag != 0:
            print "Found optional spinup alternatives"
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
        else:
            jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                             " is NOT complete. You must complete the spinup in order" + \
                             " to run calibration."
            errMod.errOut(jobData)

    if int(jobData.calibComplete) == 1:
        jobData.errMsg = "ERROR: Calibration for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)

    if userTmp != jobData.owner:
        print "User: " + str(userTmp) + " is requesting to take over jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)

        # V1.2 NOTE!!!!!
        # Given the automation of the workflow on Yellowstone, we are simply
        # keeping contact information the same, but only changing the ownership
        # of the workflow.
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0

        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS

        jobData.genMsg = "MSG: User: " + str(userTmp) + " Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)

        # If a new owner takes over, simply change the owner, but keep all
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''
            newSlackUName = ''

        try:
            db.updateJobOwner(jobData, userTmp, newEmail, newSlackChannel, newSlackToken, newSlackUName, changeFlag)
        except:
            errMod.errOut(jobData)

    # Begin an "infinite" do loop. This loop will continue to loop through all
    # the basins until calibrations are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user. Additionally, if the R calibration
    # code fails, a separate LOCK file will be placed into the directory, and the user
    # will be notified about the failure.
    completeStatus = False

    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    #  0.0  - Initial value
    #  0.25 - This is a special value for the first iteration. The initial default
    #         parameter values specified in the parameter table by the user are being
    #         applied and entered into the DB.
    #  0.5  - Model simulation in progress
    #  0.75 - The model simulation has completed. We are ready to run the R code to
    #         generate the next set of parameter values and enter evaluation statistics
    #         into the DB.
    #  0.90 - The R code is running to generate new parameter estimates. Python is
    #         also generating new files.
    #  1.0  - R/Python code is complete and params/stats have been entered into the DB. Ready to
    #         run the next model iteration.
    # -0.1  - The R code to generate the initial parameter values has failed. CALIB.LOCK
    #         is put into place.
    # -0.25 - The workflow has found the model simulation to have failed.
    # -0.5  - Model simulation failed once and a restart is being run.
    # -0.75 - The R/Python code to generate new parameters/stats has failed. CALIB.LOCK
    #         is put into place.
    # -1.0  - Model has failed twice. A RUN.LOCK file has been created.
    # Once all array elements are 1.0, then completeStatus goes to True, an entry into
    # the database occurs, and the program will complete.
    keySlot = np.empty([len(jobData.gages), int(jobData.nIter)])
    keySlot[:, :] = 0.0

    # Create an array to hold system job ID values. This will only be used for
    # PBS as qstat has demonstrated slow behavior when doing a full qstat command.
    # We will track job ID values and do a qstat <jobID> and populate this array
    # to keep track of things.
    pbsJobId = np.empty([len(jobData.gages)], np.int64)
    pbsJobId[:] = -9999

    # NOTE this is different from the spinup. We have a 2D array of values to account
    # for all the iterations.
    entryValue = float(len(jobData.gages) * int(jobData.nIter))

    # Pull all the status values into the keySlot array.
    for basin in range(0, len(jobData.gages)):
        domainID = jobData.gageIDs[basin]

        if domainID == -9999:
            jobData.errMsg = "ERROR: Unable to locate domainID for gage: " + str(jobData.gages[basin])
            errMod.errOut(jobData)

        # We are going to pull all values for one basin, then place them into the array.
        # This is faster than looping over each iteration at a time.
        statusData = db.iterationStatus(jobData, domainID, str(jobData.gages[basin]))
        statusData = [list(item) for item in statusData]
        for iteration in range(0, int(jobData.nIter)):
            for iteration2 in range(0, int(jobData.nIter)):
                if statusData[iteration2][0] == iteration + 1:
                    keySlot[basin, iteration] = float(statusData[iteration2][1])

    while not completeStatus:
        # Walk through calibration directories for each basin. Determine the status of
        # the model runs by the files available. If restarting, modify the
        # namelist files appropriately. Then, restart the model. If anything goes wrong, notifications
        # will either be emailed per the user's info, or piped to Slack for group
        # notification. A simulation is deemed complete when all expected RESTART
        # files are present and there are no jobs running for the basin. The parameter
        # estimation is deemed complete when CALIB_ITER.COMPLETE is present and
        # no calibration jobs for this basin are running.
        # Loop through each basin. Perform the following steps:
        # 1.) If status is -0.5, 0.0, or 0.5, check to see if the model is running
        #     for this basin or if parameter estimation is occurring.
        # 2.) If the model is not running, check for expected output and perform
        #     necessary logistics. Continue to parameter estimation. Note that
        #     for the first iteration, R needs to be run before the model to get
        #     initial default parameters.
        # 3.) Once the model is complete, the status goes to 0.75.
        # 4.) Fire off a job to run R/Python code for parameter estimation, generation,
        #     plot generation, and generation of model eval statistics to be
        #     entered into the DB. Status goes to 0.90.
        # 5.) Once the calibration job is complete, the status goes to 1.0 and the
        #     workflow is ready for the next iteration.
        # If the status goes to -1.0, a LOCK file is created and must be manually
        # removed by the user. Once the program detects this, it will restart the
        # model and the status goes back to 0.5.
        # If the status goes to -0.75, a LOCK file is created and needs to be removed
        # manually by the user before the workflow can continue.
        for basin in range(0, len(jobData.gages)):
            for iteration in range(0, int(jobData.nIter)):
                # Hold onto the status value before the workflow iterates, for checking below.
                keyStatusCheck1 = keySlot[basin, iteration]
                # If the status is already 1.0, then continue the loop as no work needs to be done.
                if keyStatusCheck1 == 1.0:
                    continue
                else:
                    try:
                        calibMod.runModel(jobData, staticData, db, jobData.gageIDs[basin], jobData.gages[basin], keySlot, basin, iteration, pbsJobId)
                    except:
                        errMod.errOut(jobData)

                    # Temporary for Cheyenne to slow down the strain on PBS.
                    keyStatusCheck2 = keySlot[basin, iteration]
                    # Put some spacing between launching model simulations to keep the queue
                    # from getting overloaded.
                    if keyStatusCheck1 == 0.25 and keyStatusCheck2 == 0.5:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.0 and keyStatusCheck2 == 0.5:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.5 and keyStatusCheck2 == 0.5:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.0 and keyStatusCheck2 == 0.25:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.5 and keyStatusCheck2 == 0.75:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.75 and keyStatusCheck2 == 0.9:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.5 and keyStatusCheck2 == 0.9:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.9 and keyStatusCheck2 == 0.9:
                        time.sleep(15)
                    if keyStatusCheck1 == 0.9 and keyStatusCheck2 == 1.0:
                        time.sleep(15)

        # TEMPORARY FOR CHEYENNE
        # Check to make sure program hasn't passed a prescribed time limit.
        # If it has, exit gracefully.
        #timeCheckStamp = datetime.datetime.now()
        #programDtCheck = timeCheckStamp - begTimeStamp
        #if programDtCheck.seconds/60.0 > 90.0:
        #    # 90-minute limit
        #    try:
        #        fileObj = open(lockPath,'a')
        #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
        #        fileObj.close()
        #    except:
        #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
        #        errMod.errOut(jobData)

        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue:
            jobData.calibComplete = 1
            try:
                db.updateCalibStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "CALIBRATION FOR JOB ID: " + str(jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True

        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(lockPath, 'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
            errMod.errOut(jobData)

    # Remove LOCK file
    os.remove(lockPath)
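
# For reference, below is a minimal sketch (illustration only, with made-up dimensions,
# not part of the workflow) of the completion bookkeeping the loop above relies on:
# the 2D keySlot array holds one status value per basin and iteration, and calibration
# is declared finished only when every element has reached 1.0, i.e. when the array
# sums to the number of gages times the number of iterations.
import numpy as np

nGages = 3
nIter = 4
keySlot = np.zeros([nGages, nIter])
entryValue = float(nGages * nIter)

keySlot[:, :] = 1.0               # pretend every iteration of every basin has finished
if keySlot.sum() == entryValue:
    print "CALIBRATION FOR ALL BASINS COMPLETE."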