def clear_rundir(self):
    """Reset the run directory to an empty state.

    Any existing run directory is removed (a warning is logged when there
    was nothing to remove, e.g. on a fresh setup) and then recreated empty.
    """
    try:
        sh.rm(['-r', self.rundir])
    except sh.ErrorReturnCode:
        # Removal is best-effort: the directory may simply not exist yet.
        self.log.warning(
            'Tried to remove run directory but it doesnt exist')
    mkdir(self.rundir)
    self.log.info('Emptied run directory %r' % self.rundir)
def run(self, i, restart_file=None, use_restart=True, multi_node=False, num_cores=8, overwrite_data=False, save_run=False, run_idb=False, nice_score=0):
    """Run the model for iteration `i` and archive its output.

    `i`: Iteration number; names the output directory and, together with
        `use_restart`, locates the previous restart archive.
    `restart_file` (optional): A path to a valid restart archive.  If None
        and `use_restart=True`, restart file (i-1) will be used.
    `use_restart`: If True, initialise the model from a restart archive.
    `multi_node`: If True, add mpirun options to distribute over PBS nodes.
    `num_cores`: Number of mpi cores to distribute over.
    `overwrite_data`: If True, delete and regenerate existing output for
        run `i`; otherwise stop and return False when output exists.
    `save_run`: If True, copy the entire working directory over to
        GFDL_DATA so that the run can rerun without the python script.
        (This uses a lot of data storage!)
    `run_idb`: Passed through to the run.sh template (debugger launch).
    `nice_score`: Passed through to the run.sh template (process niceness).

    Returns True when the run completed and was archived, False when output
    already existed and `overwrite_data` was False.  Raises IOError if the
    restart file is missing, FailedRunError if the model exits non-zero.
    """
    self.clear_rundir()

    indir = P(self.rundir, 'INPUT')
    outdir = P(self.datadir, self.runfmt % i)
    resdir = P(self.rundir, 'RESTART')

    if os.path.isdir(outdir):
        if overwrite_data:
            self.log.warning(
                'Data for run %d already exists and overwrite_data is True. Overwriting.' % i)
            sh.rm('-r', outdir)
        else:
            # FIX: was self.log.warn, a deprecated logging alias; use
            # warning() for consistency with the branch above.
            self.log.warning(
                'Data for run %d already exists but overwrite_data is False. Stopping.' % i)
            return False

    # make the output run folder and copy over the input files
    mkdir([indir, resdir, self.restartdir])

    self.codebase.write_source_control_status(
        P(self.rundir, 'git_hash_used.txt'))
    self.write_namelist(self.rundir)
    self.write_field_table(self.rundir)
    self.write_diag_table(self.rundir)

    for filename in self.inputfiles:
        sh.cp([filename, P(indir, os.path.split(filename)[1])])

    mpirun_opts = ''
    if multi_node:
        mpirun_opts += ' -bootstrap pbsdsh -f $PBS_NODEFILE'

    if use_restart:
        if not restart_file:
            # get the restart from previous iteration
            restart_file = self.get_restart_file(i - 1)
        if not os.path.isfile(restart_file):
            self.log.error('Restart file not found, expecting file %r' %
                           restart_file)
            raise IOError('Restart file not found, expecting file %r' %
                          restart_file)
        else:
            self.log.info('Using restart file %r' % restart_file)
            self.extract_restart_archive(restart_file, indir)
    else:
        self.log.info('Running without restart file')
        restart_file = None

    # FIX: renamed from `vars`, which shadowed the builtin.
    template_vars = {
        'rundir': self.rundir,
        'execdir': self.codebase.builddir,
        'executable': self.codebase.executable_name,
        'env_source': self.env_source,
        'mpirun_opts': mpirun_opts,
        'num_cores': num_cores,
        'run_idb': run_idb,
        'nice_score': nice_score
    }

    runscript = self.templates.get_template('run.sh')
    # employ the template to create a runscript; dump() writes the file,
    # its return value is not needed (the old unused `t =` binding is gone)
    runscript.stream(**template_vars).dump(P(self.rundir, 'run.sh'))

    def _outhandler(line):
        handled = self.emit('run:output', self, line)
        if not handled:
            # only log the output when no event handler is used
            self.log_output(line)

    self.emit('run:ready', self, i)
    self.log.info("Beginning run %d" % i)

    proc = None
    try:
        proc = sh.bash(P(self.rundir, 'run.sh'),
                       _bg=True,
                       _out=_outhandler,
                       _err_to_out=True)
        self.log.info('process running as {}'.format(proc.process.pid))
        proc.wait()
    except KeyboardInterrupt:
        self.log.error("Manual interrupt, killing process.")
        # FIX: guard against `proc` being unbound when the interrupt
        # arrives before sh.bash() returns.
        if proc is not None:
            proc.process.terminate()
            proc.wait()
        # FIX: bare raise preserves the original traceback (was `raise e`).
        raise
    except sh.ErrorReturnCode as e:
        self.log.error("Run %d failed. See log for details." % i)
        self.log.error("Error: %r" % e)
        self.emit('run:failed', self)
        # FIX: chain the sh error so the cause stays in the traceback.
        raise FailedRunError() from e

    self.emit('run:completed', self, i)
    self.log.info('Run %d complete' % i)
    mkdir(outdir)

    if num_cores > 1:
        # use postprocessing tool to combine the output from several cores
        codebase_combine_script = P(self.codebase.builddir,
                                    'mppnccombine_run.sh')
        if not os.path.exists(codebase_combine_script):
            self.log.warning(
                'combine script does not exist in the commit you are running Isca from. Falling back to using $GFDL_BASE mppnccombine_run.sh script')
            sh.ln('-s',
                  P(GFDL_BASE, 'postprocessing', 'mppnccombine_run.sh'),
                  codebase_combine_script)
        combinetool = sh.Command(codebase_combine_script)
        # FIX: renamed loop variable from `file`, which shadowed the builtin.
        for diag_file in self.diag_table.files:
            netcdf_file = '%s.nc' % diag_file
            filebase = P(self.rundir, netcdf_file)
            combinetool(self.codebase.builddir, filebase)
            # copy the combined netcdf file into the data archive directory
            sh.cp(filebase, P(outdir, netcdf_file))
            # remove all netcdf fragments from the run directory
            sh.rm(glob.glob(filebase + '*'))
            self.log.debug('%s combined and copied to data directory' %
                           netcdf_file)
        for restart in glob.glob(P(resdir, '*.res.nc.0000')):
            restartfile = restart.replace('.0000', '')
            combinetool(self.codebase.builddir, restartfile)
            sh.rm(glob.glob(restartfile + '.????'))
            self.log.debug("Restart file %s combined" % restartfile)
        self.emit('run:combined', self)

    # make the restart archive and delete the restart files
    self.make_restart_archive(self.get_restart_file(i), resdir)
    sh.rm('-r', resdir)

    if save_run:
        # copy the complete run directory to GFDL_DATA so that the run can
        # be recreated without the python script if required
        mkdir(resdir)
        sh.cp(['-a', self.rundir, outdir])
    else:
        # just save some useful diagnostic information
        self.write_namelist(outdir)
        self.write_field_table(outdir)
        self.write_diag_table(outdir)
        self.codebase.write_source_control_status(
            P(outdir, 'git_hash_used.txt'))

    self.clear_rundir()
    return True
def clear_workdir(self):
    """Reset the working directory: delete it, then recreate it empty."""
    self.rm_workdir()
    mkdir(self.workdir)
    self.log.info('Emptied working directory %r' % self.workdir)