Beispiel #1
0
	def run_geogrid(self):
		"""Submit geogrid.job and wait for its log to report an outcome.

		The geogrid namelist is moved into the run directory
		(wrfDir/<first 8 characters of startTime>) as namelist.wps, the job
		script is made executable and handed to qsub, and two Wait conditions
		are held in turn: one until a geogrid.log* file can be listed, one
		until the tail of that log contains the success banner or a failure
		keyword.

		Returns False when a failure keyword (retCode 2) is matched. Every
		other path that returns falls off the end of the method, so the
		caller receives None. A Wait.TimeExpiredException from either wait
		ends the program through sys.exit().
		"""
		Tools.Process.instance().Lock()
		self.logger.write("run_geogrid(): Enter")
		runDir = self.wrfDir + '/' + self.startTime[0:8]
		Tools.popen(self.aSet, "mv namelist.wps.geogrid " + runDir + "/namelist.wps")
		with Tools.cd(runDir):
			for jobCommand in ("chmod +x geogrid.job", "qsub geogrid.job"):
				Tools.popen(self.aSet, jobCommand)
			# Stage 1: hold until the scheduler has produced a log file.
			try:
				logListed = {
					"waitCommand": '(ls geogrid.log* && echo "yes") || echo "no"',
					"contains": "yes",
					"retCode": 1,
				}
				Wait.Wait([logListed], timeDelay = 25).hold()
			except Wait.TimeExpiredException:
				sys.exit("geogrid.exe job not completed, abort.")
			self.logger.write("Log file detected, waiting for completion.")
			# Stage 2: hold until the log tail shows success (1) or failure (2).
			tailCommand = "tail -n 3 geogrid.log*"
			markers = (
				("Successful completion of program geogrid.exe", 1),
				("fatal", 2),
				("runtime", 2),
				("error", 2),
			)
			try:
				conditions = [
					{"waitCommand": tailCommand, "contains": text, "retCode": code}
					for text, code in markers
				]
				outcome = Wait.Wait(conditions, timeDelay = 25).hold()
				if outcome == 2:
					self.logger.write("run_geogrid(): Exit (Failed, Code 2)")
					Tools.Process.instance().Unlock()
					return False
				if outcome == 1:
					# Success condition, proceed to the next.
					self.logger.write("Geogrid process sucessfully completed.")
			except Wait.TimeExpiredException:
				sys.exit("geogrid.exe job not completed, abort.")
		self.logger.write("run_geogrid(): Exit")
		Tools.Process.instance().Unlock()
Beispiel #2
0
	def run_wrf(self):
		"""Submit wrf.job and wait for the rsl logs to report an outcome.

		Inside the run directory (wrfDir/<first 8 characters of startTime>)
		the method checks that output/wrfinput_d01 and output/wrfbdy_d01 can
		be listed, removes old rsl logs, submits the job with qsub and then
		holds two Wait conditions: one until output/rsl.out.0000 exists, one
		until a success or failure marker shows up in the last log line.

		Returns:
			False when the input files are missing (and the "debugmode"
			setting is not '1') or when a failure marker (retCode 2) is
			matched; True when debug mode is active or the wait returns any
			other code, after the rsl logs have been moved to wrf_log.txt /
			wrf_error_log.txt.

		A Wait.TimeExpiredException from either wait ends the program through
		sys.exit().
		"""
		Tools.Process.instance().Lock()
		self.logger.write("run_wrf(): Enter")
		with Tools.cd(self.wrfDir + '/' + self.startTime[0:8]):
			# Do a quick file check to ensure wrf can run
			inputListing = os.popen('(ls output/wrfinput_d01 && echo "yes") || echo "no"').read()
			boundaryListing = os.popen('(ls output/wrfbdy_d01 && echo "yes") || echo "no"').read()
			inputsPresent = "yes" in inputListing and "yes" in boundaryListing
			if not inputsPresent and not self.aSet.fetch("debugmode") == '1':
				self.logger.write("run_wrf(): Exit (Failed, cannot run wrf.exe without wrfinput_d01 and wrfbdy_d01)")
				Tools.Process.instance().Unlock()
				return False
			# Drop the stale logs, then chmod the job and submit it.
			for shellCommand in (
				"rm output/rsl.out.*",
				"rm output/rsl.error.*",
				"chmod +x wrf.job",
				"qsub wrf.job",
			):
				Tools.popen(self.aSet, shellCommand)
			self.logger.write("Job has been submitted to the queue, waiting for log file to appear.")
			if self.aSet.fetch("debugmode") == '1':
				self.logger.write("Debug mode is active, skipping")
				Tools.Process.instance().Unlock()
				return True
			# Hold until the first rsl log file appears.
			try:
				logListed = {
					"waitCommand": '(ls output/rsl.out.0000 && echo "yes") || echo "no"',
					"contains": "yes",
					"retCode": 1,
				}
				Wait.Wait([logListed], timeDelay = 25).hold()
			except Wait.TimeExpiredException:
				sys.exit("wrf.exe job not completed, abort.")
			self.logger.write("Log file detected, waiting for completion.")
			# Now wait for the output file to be completed (Note: Allow 7 days from the output file first appearing to run)
			errorTail = "tail -n 1 output/rsl.error.0000"
			markers = (
				("tail -n 1 output/rsl.out.0000", "SUCCESS COMPLETE WRF", 1),
				(errorTail, "fatal", 2),
				(errorTail, "runtime", 2),
				(errorTail, "error", 2),
			)
			try:
				conditions = [
					{"waitCommand": command, "contains": text, "retCode": code}
					for command, text, code in markers
				]
				# Checked once every three minutes so calls do not stack up rapidly.
				outcome = Wait.Wait(conditions, timeDelay = 180).hold()
				if outcome == 2:
					self.logger.write("run_wrf(): Exit (Failed, Code 2)")
					Tools.Process.instance().Unlock()
					return False
				# Any other code: keep rank 0's logs and delete the rest.
				for shellCommand in (
					"mv output/rsl.out.0000 wrf_log.txt",
					"mv output/rsl.error.0000 wrf_error_log.txt",
					"rm output/rsl.out.*",
					"rm output/rsl.error.*",
				):
					Tools.popen(self.aSet, shellCommand)
				self.logger.write("run_wrf(): Exit")
				Tools.Process.instance().Unlock()
				return True
			except Wait.TimeExpiredException:
				sys.exit("wrf.exe job not completed, abort.")
		self.logger.write("run_wrf(): Failed to enter run directory")
		Tools.Process.instance().Unlock()
		return False
Beispiel #3
0
    def run_postprocessing_upp(self):
        """Generate, submit and verify the UPP (unipost) batch job.

        A single ``upp.job`` script is written into ``self.postDir``. It
        holds a COBALT header, the environment exports, and one
        ``itag`` + ``aprun ./unipost.exe`` stanza per wrfout file found under
        ``wrfDir/<first 8 characters of startTime>/output``. The script is
        submitted with qsub and the method waits until the job's ``.output``
        file contains "Job Complete". It then checks that one ``WRFPRS*``
        file exists per wrfout file and converts each to a GrADS ``.ctl``
        file with grib2ctl.pl ("grib") or g2ctl.pl ("grib2").

        The process lock taken on entry is released exactly once, in a
        ``finally`` clause, on every exit path (return, exception or
        sys.exit()).

        Returns:
            True when the job completed and the file count matched, False
            when the number of WRFPRS* files differs from the number of
            wrfout files.

        Raises:
            SystemExit: when the "unipost_out" setting is neither "grib" nor
                "grib2" while building a stanza, or when the wait for the job
                output raises Wait.TimeExpiredException.
        """
        # Unipost needs to be run across multiple jobs that are broken up 24 hours of forecast per job.
        #  this is done to prevent the job time limit from expiring while UPP is running.
        Tools.Process.instance().Lock()
        try:
            uppDir = self.aSet.fetch("headdir") + "post/UPP/"
            outFormat = self.aSet.fetch("unipost_out")
            fList = sorted(
                glob.glob(self.wrfDir + '/' + self.startTime[0:8] +
                          "/output/wrfout*"))
            fileCount = len(fList)
            self.logger.write("  5.b. Running UPP on " + str(fileCount) +
                              " wrfout files")

            # The script is assembled as a list of fragments and joined once.
            jobParts = [
                "#!/bin/bash\n",
                "#COBALT -t " + self.aSet.fetch("upp_walltime") + "\n",
                "#COBALT -n " + self.aSet.fetch("num_upp_nodes") + "\n",
                "#COBALT -q default\n",
                "#COBALT -A climate_severe\n\n",
                "source " + self.aSet.fetch("sourcefile") + "\n",
                "ulimit -s unlimited\n\n",
                "export n_nodes=" +
                str(self.aSet.fetch("upp_ensemble_nodes_per_hour")) + "\n",
                "export n_mpi_ranks_per_node=32\n",
                "export n_mpi_ranks=$(($n_nodes * $n_mpi_ranks_per_node))\n",
                "export n_openmp_threads_per_rank=" +
                self.aSet.fetch("mpi_threads_per_rank") + "\n",
                "export n_hardware_threads_per_core=2\n",
                "export n_hardware_threads_skipped_between_ranks=4\n\n",
                "cd " + self.aSet.fetch("wrfdir") + '/' +
                self.aSet.fetch("starttime")[0:8] + "/postprd" + "\n\n",
            ]

            with Tools.cd(self.postDir):
                for iFile in fList:
                    # The slices assume the path ends in
                    # "dNN_YYYY-MM-DD_HH:MM:SS" (23 characters).
                    dNum = iFile[-23:-20]
                    year = iFile[-19:-15]
                    month = iFile[-14:-12]
                    day = iFile[-11:-9]
                    hour = iFile[-8:-6]
                    minute = iFile[-5:-3]
                    second = iFile[-2:]
                    clock = hour + ":" + minute + ":" + second
                    logName = ("unipost_log_" + dNum + "_" + year + "_" +
                               month + "_" + day + "_" + clock + ".log")
                    stamp = year + "-" + month + "-" + day + "_" + clock

                    if outFormat == "grib":
                        itagBody = iFile + "\nnetcdf\n" + stamp
                    elif outFormat == "grib2":
                        itagBody = iFile + "\nnetcdf\ngrib2\n" + stamp
                    else:
                        #You should never end up here...
                        sys.exit(
                            "  5.b. Error: grib/grib2 not defined in control.txt")

                    jobParts.append("cat > itag <<EOF\n" + itagBody +
                                    "\nNCAR\nEOF")
                    jobParts.append("\nrm fort.*")
                    if outFormat == "grib":
                        jobParts.append("\nln -sf " + uppDir +
                                        "parm/wrf_cntrl.parm fort.14")
                    # Each unipost run is backgrounded; the script's final
                    # `wait` collects them all.
                    jobParts.append(
                        "\n"
                        "aprun -n $n_mpi_ranks -N $n_mpi_ranks_per_node \\\n"
                        "--env OMP_NUM_THREADS=$n_openmp_threads_per_rank -cc depth \\\n"
                        "-d $n_hardware_threads_skipped_between_ranks \\\n"
                        "-j $n_hardware_threads_per_core \\\n"
                        "./unipost.exe > " + logName + " &\n"
                        "sleep 5\n"
                        "\n\n")

                jobParts.append("wait\necho \"Job Complete\"")
                with open("upp.job", 'w') as target_file:
                    target_file.write("".join(jobParts))
                Tools.popen(self.aSet, "chmod +x upp.job")
                self.logger.write("   -> Submitting upp job to the queue")
                jobSub = Tools.popen(
                    self.aSet, "qsub upp.job -q default -t " +
                    str(self.aSet.fetch("upp_walltime")) + " -n " +
                    str(self.aSet.fetch("num_upp_nodes")) + " --mode script")
                # The first entry of the qsub output names the job; its
                # stdout is written to "<that name>.output".
                jobOutput = jobSub.fetch()[0].rstrip("\n\r") + ".output"
                self.logger.write("   -> Job file submitted, wait for " +
                                  jobOutput)
                # Wait for the job script to print its completion marker.
                try:
                    wCond = [{
                        "waitCommand": "tail -n 2 " + jobOutput,
                        "contains": "Job Complete",
                        "retCode": 1
                    }]
                    Wait.Wait(wCond, timeDelay=60).hold()
                except Wait.TimeExpiredException:
                    sys.exit("unipost.exe job not completed, abort.")
                self.logger.write(
                    "   -> Unipost Job Completed, Verifying files.")

                # Count the produced files directly instead of parsing
                # `ls -l` text: one WRFPRS* file is expected per wrfout file,
                # so the highest forecast index is fileCount - 1.
                prsFiles = glob.glob(os.path.join(self.postDir, "WRFPRS*"))
                lastHour = len(prsFiles) - 1
                self.logger.write("  5.b. All UPP jobs completed (F" +
                                  str(lastHour) + " found).")
                if lastHour != fileCount - 1:
                    self.logger.write(
                        "  5.b. Error: Number of expected files (" +
                        str(fileCount) +
                        ") does not match actual count (" +
                        str(lastHour + 1) + ").")
                    return False

                # Now that we have our PRS files, we can convert those to CTL files
                self.logger.write("  5.b. Running GRIB to CTL process.")
                ctlScript = {
                    "grib": "scripts/grib2ctl.pl",
                    "grib2": "scripts/g2ctl.pl",
                }.get(outFormat)
                if ctlScript is not None:
                    for fHour in range(fileCount):
                        fStr = "%02d" % fHour
                        inFile = "WRFPRS.GrbF" + fStr
                        Tools.popen(
                            self.aSet, uppDir + ctlScript + " " +
                            self.postDir + '/' + inFile + " > " +
                            self.postDir + "/wrfprs_f" + fStr + ".ctl")
                #To-Do Note: Fork off to GrADS here...
                self.logger.write("  5.b. GRIB to CTL processes completed.")
                return True
        finally:
            # Single release point; the original unlocked twice on the
            # success path and not at all on error paths.
            Tools.Process.instance().Unlock()
Beispiel #4
0
 def run_preprocessing(self):
     """Drive the ungrib, metgrid and real stages launched by prerun.job.

     The Vtable and namelist.wps* files are copied/moved into the run
     directory (wrfDir/<first 8 characters of startTime>), prerun.job is
     submitted with the scheduler command taken from scheduleParms, and
     each stage is then followed in order: wait for its log file to be
     listable, then wait for a success or failure marker in the log tail.

     Returns:
         True when real finished without a failure marker and both
         output/wrfinput_d01 and output/wrfbdy_d01 can be listed; False
         when any stage matches a failure marker (retCode 2), when those
         files are missing, or when the ungrib or metgrid wait returns a
         code other than 1 or 2.

     A Wait.TimeExpiredException from any wait ends the program through
     sys.exit().
     """

     def logListed(pattern):
         # Wait condition met once `ls <pattern>` succeeds.
         return [{
             "waitCommand": '(ls ' + pattern + ' && echo "yes") || echo "no"',
             "contains": "yes",
             "retCode": 1
         }]

     def tailRules(*rules):
         # Expand (command, marker, retCode) triples into Wait condition dicts.
         return [{
             "waitCommand": command,
             "contains": marker,
             "retCode": code
         } for command, marker, code in rules]

     #ungrib.exe needs to run in the data directory
     Tools.Process.instance().Lock()
     self.logger.write("run_preprocessing(): Enter")
     vtableSource = self.aSet.fetch(
         "headdir") + "vtables/Vtable." + self.aSet.fetch("modeldata") + "* "
     runDir = self.wrfDir + '/' + self.startTime[0:8]
     Tools.popen(self.aSet, "cp " + vtableSource + runDir)
     Tools.popen(self.aSet, "mv namelist.wps* " + runDir)
     mParms = self.modelParms.fetch()  # fetched as before; not used below
     with Tools.cd(runDir):
         Tools.popen(self.aSet, "chmod +x prerun.job")
         submitLine = self.scheduleParms.fetch()[
             "subcmd"] + " prerun.job " + self.scheduleParms.fetch()["cmdline"]
         Tools.popen(self.aSet, submitLine, storeOutput=False)
         self.logger.write(
             "Job has been submitted to the queue, waiting for log file to appear."
         )

         # ---- Stage 1: ungrib ----
         try:
             Wait.Wait(logListed("ungrib.log*"), timeDelay=25).hold()
         except Wait.TimeExpiredException:
             sys.exit("ungrib.exe job not completed, abort.")
         self.logger.write("Log file detected, waiting for completion.")
         ungribTail = "tail -n 3 ungrib.log*"
         try:
             ungribRC = Wait.Wait(tailRules(
                 (ungribTail, "Successful completion of program ungrib.exe", 1),
                 (ungribTail, "fatal", 2),
                 (ungribTail, "runtime", 2),
                 (ungribTail, "error", 2),
             ), timeDelay=25).hold()
         except Wait.TimeExpiredException:
             sys.exit("ungrib.exe job not completed, abort.")
         if ungribRC == 2:
             self.logger.write(
                 "run_preprocessing(): Exit (Failed at ungrib, Code 2)")
             Tools.Process.instance().Unlock()
             return False

         if ungribRC == 1:
             # ---- Stage 2: metgrid ----
             self.logger.write(
                 "Ungrib process sucessfully completed, starting metgrid process."
             )
             try:
                 Wait.Wait(logListed("metgrid.log*"), timeDelay=25).hold()
             except Wait.TimeExpiredException:
                 sys.exit("metgrid.exe job not completed, abort.")
             self.logger.write("Log file detected, waiting for completion.")
             metgridTail = "tail -n 3 metgrid.log.0000"
             try:
                 metgridRC = Wait.Wait(tailRules(
                     (metgridTail,
                      "Successful completion of program metgrid.exe", 1),
                     (metgridTail, "fatal", 2),
                     (metgridTail, "runtime", 2),
                     (metgridTail, "error", 2),
                     (metgridTail, "ERROR:", 2),
                 ), timeDelay=25).hold()
             except Wait.TimeExpiredException:
                 sys.exit("metgrid.exe job not completed, abort.")
             if metgridRC == 2:
                 self.logger.write(
                     "run_preprocessing(): Exit (Failed at metgrid, Code 2)")
                 Tools.Process.instance().Unlock()
                 return False

             if metgridRC == 1:
                 # ---- Stage 3: real ----
                 self.logger.write(
                     "Metgrid process sucessfully completed, starting real process."
                 )
                 Tools.popen(self.aSet, "mv metgrid.log.0000 metgrid_log.txt")
                 Tools.popen(self.aSet, "rm metgrid.log.*")
                 try:
                     Wait.Wait(logListed("output/rsl.out.0000"),
                               timeDelay=25).hold()
                 except Wait.TimeExpiredException:
                     sys.exit("real.exe job not completed, abort.")
                 self.logger.write(
                     "Log file detected, waiting for completion.")
                 realErrors = "tail -n 5 output/rsl.error.0000"
                 try:
                     realRC = Wait.Wait(tailRules(
                         ("tail -n 3 output/rsl.out.0000",
                          "SUCCESS COMPLETE REAL_EM", 1),
                         (realErrors, "FATAL CALLED", 2),
                         (realErrors, "FATAL", 2),
                         (realErrors, "RUNTIME", 2),
                         (realErrors, "runtime", 2),
                         (realErrors, "error", 2),
                         (realErrors, "ERROR", 2),
                     ), timeDelay=60).hold()
                 except Wait.TimeExpiredException:
                     sys.exit("real.exe job not completed, abort.")
                 if realRC == 2:
                     self.logger.write(
                         "run_preprocessing(): Exit (Failed at real, Code 2)")
                     Tools.Process.instance().Unlock()
                     return False

                 # Keep rank 0's logs under stable names.
                 Tools.popen(self.aSet, "mv output/rsl.out.0000 real_log.txt")
                 Tools.popen(self.aSet,
                             "mv output/rsl.error.0000 real_error_log.txt")
                 # Validate the presence of the two files real must produce.
                 inputListing = os.popen(
                     '(ls output/wrfinput_d01 && echo "yes") || echo "no"'
                 ).read()
                 boundaryListing = os.popen(
                     '(ls output/wrfbdy_d01 && echo "yes") || echo "no"'
                 ).read()
                 if "yes" in inputListing and "yes" in boundaryListing:
                     self.logger.write("run_preprocessing(): Exit")
                     Tools.Process.instance().Unlock()
                     return True
                 self.logger.write(
                     "run_preprocessing(): Exit (Failed at real, did not find wrfinput_d01 and wrfbdy_d01"
                 )
                 Tools.Process.instance().Unlock()
                 return False
     # Reached when the ungrib or metgrid wait returned neither 1 nor 2.
     self.logger.write("run_preprocessing(): Failed to enter run directory")
     Tools.Process.instance().Unlock()
     return False
Beispiel #5
0
    def prepare_job(self):
        """Write python_post.job, run it, and wait for pypost.log to settle.

        The job script exports the PYTHON_POST_* environment variables,
        changes into ``<postdir>/Python`` and launches PythonPost.py with the
        interpreter named by the "condainstallation" setting. It is written
        into ``self.targetDir``, executed, and the method then waits until
        the tail of ``pypost.log`` contains ``***SUCCESS***`` or
        ``***FAIL***``.

        The process lock taken on entry is released exactly once, in a
        ``finally`` clause, on every exit path. Previously the "no wrfout
        files" early return and the fall-through ``return True`` left it held.

        Returns:
            False when no wrfout files exist in ``self.wrfOutDir`` or when
            the log reports ``***FAIL***`` (retCode 2); True otherwise.

        Raises:
            SystemExit: when the wait raises Wait.TimeExpiredException.
        """
        Tools.Process.instance().Lock()
        try:
            self.logger.write(
                "  5.b. Entering prepare_job(), constructing job file.")
            fileCount = len(glob.glob(self.wrfOutDir + "/wrfout*"))
            self.logger.write("  5.b. " + str(fileCount) +
                              " wrfout files have been found.")
            if fileCount <= 0:
                # Something went wrong.
                self.logger.write(
                    "  No files found, something is wrong, please check the output directory to ensure the wrfout* files are present."
                )
                return False

            jobParts = [
                "#!/bin/bash\n",
                "source " + self.aSet.fetch("sourcefile") + "\n",
                "ulimit -s unlimited\n\n",
                "export PYTHON_POST_DIR=" + self.wrfOutDir + "/\n",
                "export PYTHON_POST_TARG_DIR=" + self.targetDir + "/\n",
                "export PYTHON_POST_NODES=" +
                self.aSet.fetch("num_python_nodes") + "\n",
                "export PYTHON_POST_THREADS=" +
                self.aSet.fetch("python_threads_per_rank") + "\n",
                "export PYTHON_POST_FIRSTTIME=" +
                self.aSet.fetch("starttime") + "\n",
                "export PYTHON_POST_LOG_DIR=" + self.targetDir + "/\n\n",
                "cd " + self.aSet.fetch("postdir") + "/Python\n\n",
                # Run in the background and wait on the PID.
                self.aSet.fetch("condainstallation") + " PythonPost.py&\n",
                "PID_PyPost=$!\n",
                "wait $PID_PyPost\n\n",
            ]

            with Tools.cd(self.targetDir):
                with open("python_post.job", 'w') as target_file:
                    target_file.write("".join(jobParts))
                Tools.popen(self.aSet, "chmod +x python_post.job")
                self.logger.write(
                    "   -> Starting Python Post Processing Script, moving this script to holding pattern"
                )
                Tools.popen(self.aSet, "./python_post.job")

                logTail = "tail -n 3 pypost.log"
                try:
                    wCond = [{
                        "waitCommand": logTail,
                        "contains": "***SUCCESS***",
                        "retCode": 1
                    }, {
                        "waitCommand": logTail,
                        "contains": "***FAIL***",
                        "retCode": 2
                    }]
                    wRC = Wait.Wait(wCond, timeDelay=60).hold()
                except Wait.TimeExpiredException:
                    sys.exit(
                        "Python post processing job not completed, abort.")
                if wRC == 2:
                    self.logger.write(
                        "PreparePyJob(): Exit (Failed at python, Code 2)")
                    return False
                return True
        finally:
            # Single release point for every return, exception and exit path.
            Tools.Process.instance().Unlock()