Example #1
def runcmd(model_type, config_path, lab_path):

    lab = Laboratory(model_type, config_path, lab_path)
    expt = Experiment(lab)
    runlog = Runlog(expt)

    runlog.push()
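
Example #1 is a complete entry point apart from its imports. Below is a minimal runnable sketch, assuming that Laboratory, Experiment, and Runlog live in payu's laboratory, experiment, and runlog modules; the import paths and the invocation values are assumptions for illustration, not part of the original example.

from payu.laboratory import Laboratory   # assumed module path
from payu.experiment import Experiment   # assumed module path
from payu.runlog import Runlog           # assumed module path

def runcmd(model_type, config_path, lab_path):
    # Build the laboratory and experiment, then push the run log
    lab = Laboratory(model_type, config_path, lab_path)
    expt = Experiment(lab)
    runlog = Runlog(expt)
    runlog.push()

# Hypothetical invocation: the model type and paths are placeholders.
runcmd('mom', 'config.yaml', '/scratch/mylab')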
Example #2
    def __init__(self, lab):
        self.lab = lab

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Configuration
        self.expand_shell_vars = True  # TODO: configurable

        # Model run time
        self.runtime = None
        if ('calendar' in self.config
                and 'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        # Logging
        if self.config.get('runlog', True):
            self.runlog = Runlog(self)
        else:
            self.runlog = None
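
The constructor above pulls everything through self.config.get(...). As a hedged illustration of what read_config() must return, here is a dictionary whose keys mirror those lookups; the values (and the nested shape of calendar/runtime) are placeholders, not payu defaults.

# Illustrative only: keys are taken from the .get() calls in __init__ above;
# values and the calendar/runtime structure are assumptions.
example_config = {
    'debug': False,
    'postscript': 'post.sh',                # hypothetical script name
    'repeat': False,
    'calendar': {'runtime': {'years': 1}},  # assumed runtime structure
    'stacksize': 'unlimited',
    'runlog': True,
    'userscripts': {'init': 'init.sh'},     # hypothetical script name
}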
Example #3
    def __init__(self, lab):
        self.lab = lab

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Configuration
        self.expand_shell_vars = True   # TODO: configurable

        # Model run time
        self.runtime = None
        if ('calendar' in self.config and
                'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        self.runlog = Runlog(self)

        # XXX: Temporary spot for the payu path
        #      This is horrible; payu/cli.py does this much more safely!
        #      But also does not even store it in os.environ!
        default_payu_bin = os.path.dirname(sys.argv[0])
        payu_bin = os.environ.get('PAYU_PATH', default_payu_bin)

        self.payu_path = os.path.join(payu_bin, 'payu')
Example #4
    def __init__(self, lab):
        self.lab = lab

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Model run time
        self.runtime = None
        if ('calendar' in self.config and
                'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        # Logging
        if self.config.get('runlog', True):
            self.runlog = Runlog(self)
        else:
            self.runlog = None
Example #5
class Experiment(object):
    def __init__(self, lab):
        self.lab = lab

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Configuration
        self.expand_shell_vars = True  # TODO: configurable

        # Model run time
        self.runtime = None
        if ('calendar' in self.config
                and 'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        # Logging
        if self.config.get('runlog', True):
            self.runlog = Runlog(self)
        else:
            self.runlog = None

    def init_models(self):

        self.model_name = self.config.get('model')
        assert self.model_name

        model_fields = [
            'model', 'exe', 'input', 'ncpus', 'npernode', 'build', 'mpthreads'
        ]

        # TODO: Rename this to self.submodels
        self.models = []

        submodels = self.config.get('submodels', [])

        solo_model = self.config.get('model')
        if not solo_model:
            sys.exit('payu: error: Unknown model configuration.')

        submodel_config = {f: self.config[f] for f in model_fields
                           if f in self.config}
        submodel_config['name'] = solo_model

        submodels.append(submodel_config)

        for m_config in submodels:
            ModelType = model_index[m_config['model']]
            self.models.append(ModelType(self, m_config['name'], m_config))

        # Load the top-level model
        if self.model_name:
            ModelType = model_index[self.model_name]
            model_config = {
                f: self.config[f]
                for f in model_fields if f in self.config
            }
            self.model = ModelType(self, self.model_name, model_config)
            self.model.top_level_model = True
        else:
            self.model = None

    def set_counters(self):
        # Assume that ``set_paths`` has already been called
        assert self.archive_path

        current_counter = os.environ.get('PAYU_CURRENT_RUN')
        if current_counter:
            self.counter = int(current_counter)
        else:
            self.counter = None

        self.n_runs = int(os.environ.get('PAYU_N_RUNS', 1))

        # Initialize counter if unset
        if self.counter is None:
            # TODO: this logic can probably be streamlined
            try:
                restart_dirs = [
                    d for d in os.listdir(self.archive_path)
                    if d.startswith('restart')
                ]
            except OSError as exc:
                if exc.errno == errno.ENOENT:
                    restart_dirs = None
                else:
                    raise

            # First test for restarts
            if restart_dirs:
                self.counter = 1 + max([
                    int(d.lstrip('restart'))
                    for d in restart_dirs if d.startswith('restart')
                ])
            else:
                # Repeat runs do not generate restart files, so check outputs
                try:
                    output_dirs = [
                        d for d in os.listdir(self.archive_path)
                        if d.startswith('output')
                    ]
                except OSError as exc:
                    if exc.errno == errno.ENOENT:
                        output_dirs = None
                    else:
                        raise

                # Now test for output directories
                if output_dirs:
                    self.counter = 1 + max([
                        int(d.lstrip('output'))
                        for d in output_dirs if d.startswith('output')
                    ])
                else:
                    self.counter = 0

    def set_stacksize(self, stacksize):

        if stacksize == 'unlimited':
            stacksize = resource.RLIM_INFINITY
        else:
            assert type(stacksize) is int

        resource.setrlimit(resource.RLIMIT_STACK,
                           (stacksize, resource.RLIM_INFINITY))

    def load_modules(self):

        # Scheduler
        sched_modname = self.config.get('scheduler', 'pbs')
        self.modules.add(sched_modname)

        # MPI library
        mpi_config = self.config.get('mpi', {})
        mpi_modname = mpi_config.get('module', 'openmpi')
        self.modules.add(mpi_modname)

        # Unload non-essential modules
        loaded_mods = os.environ.get('LOADEDMODULES', '').split(':')

        for mod in loaded_mods:
            mod_base = mod.split('/')[0]
            if mod_base not in core_modules:
                envmod.module('unload', mod)

        # Now load model-dependent modules
        for mod in self.modules:
            envmod.module('load', mod)

        # User-defined modules
        user_modules = self.config.get('modules', [])
        for mod in user_modules:
            envmod.module('load', mod)

        envmod.module('list')

        for prof in self.profilers:
            prof.load_modules()

        # TODO: Consolidate this profiling stuff
        c_ipm = self.config.get('ipm', False)
        if c_ipm:
            if isinstance(c_ipm, str):
                ipm_mod = os.path.join('ipm', c_ipm)
            else:
                ipm_mod = 'ipm/2.0.2'

            envmod.module('load', ipm_mod)
            os.environ['IPM_LOGDIR'] = self.work_path

        if self.config.get('mpiP', False):
            envmod.module('load', 'mpiP')

        if self.config.get('hpctoolkit', False):
            envmod.module('load', 'hpctoolkit')

        if self.debug:
            envmod.module('load', 'totalview')

    def set_expt_pathnames(self):

        # Local "control" path
        self.control_path = self.config.get('control', os.getcwd())

        # Experiment name
        self.name = self.config.get('experiment',
                                    os.path.basename(self.control_path))

        # Experiment subdirectories
        self.archive_path = os.path.join(self.lab.archive_path, self.name)
        self.work_path = os.path.join(self.lab.work_path, self.name)

        # Symbolic link paths to output
        self.work_sym_path = os.path.join(self.control_path, 'work')
        self.archive_sym_path = os.path.join(self.control_path, 'archive')

        for model in self.models:
            model.set_model_pathnames()

        # Stream output filenames
        # TODO: per-model output streams?
        self.stdout_fname = self.lab.model_type + '.out'
        self.stderr_fname = self.lab.model_type + '.err'

    def set_output_paths(self):

        # Local archive paths

        # Check to see if we've provided a hard coded path -- valid for collate
        dir_path = os.environ.get('PAYU_DIR_PATH')
        if dir_path is not None:
            self.output_path = os.path.normpath(dir_path)
        else:
            output_dir = 'output{:03}'.format(self.counter)
            self.output_path = os.path.join(self.archive_path, output_dir)

        # TODO: check case counter == 0
        prior_output_dir = 'output{:03}'.format(self.counter - 1)
        prior_output_path = os.path.join(self.archive_path, prior_output_dir)
        if os.path.exists(prior_output_path):
            self.prior_output_path = prior_output_path
        else:
            self.prior_output_path = None

        # Local restart paths
        restart_dir = 'restart{:03}'.format(self.counter)
        self.restart_path = os.path.join(self.archive_path, restart_dir)

        prior_restart_dir = 'restart{:03}'.format(self.counter - 1)
        prior_restart_path = os.path.join(self.archive_path, prior_restart_dir)
        if os.path.exists(prior_restart_path):
            self.prior_restart_path = prior_restart_path
        else:
            self.prior_restart_path = None
            if self.counter > 0 and not self.repeat_run:
                # TODO: This warning should be replaced with an abort in setup
                print('payu: warning: No restart files found.')

        for model in self.models:
            model.set_model_output_paths()

    def build_model(self):

        self.load_modules()

        for model in self.models:
            model.get_codebase()

        for model in self.models:
            model.build_model()

    def setup(self, force_archive=False):

        # Confirm that no output path already exists
        if os.path.exists(self.output_path):
            sys.exit('payu: error: Output path already exists.')

        mkdir_p(self.work_path)

        if force_archive:
            mkdir_p(self.archive_path)
            make_symlink(self.archive_path, self.archive_sym_path)

        # Archive the payu config
        # TODO: This just copies the existing config.yaml file, but we should
        #       reconstruct a new file including default values
        config_src = os.path.join(self.control_path, 'config.yaml')
        config_dst = self.work_path
        shutil.copy(config_src, config_dst)

        # Stripe directory in Lustre
        # TODO: Make this more configurable
        do_stripe = self.config.get('stripedio', False)
        if do_stripe:
            cmd = 'lfs setstripe -c 8 -s 8m {}'.format(self.work_path)
            sp.check_call(shlex.split(cmd))

        make_symlink(self.work_path, self.work_sym_path)

        for model in self.models:
            model.setup()

        # Call the macro-model setup
        if len(self.models) > 1:
            self.model.setup()

        setup_script = self.userscripts.get('setup')
        if setup_script:
            self.run_userscript(setup_script)

        # Profiler setup
        expt_profs = self.config.get('profilers', [])
        if not isinstance(expt_profs, list):
            expt_profs = [expt_profs]

        for prof_name in expt_profs:
            ProfType = payu.profilers.index[prof_name]
            prof = ProfType(self)
            self.profilers.append(prof)

            # Testing
            prof.setup()

    def run(self, *user_flags):

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

            model_prog = []

            # Our MPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{}:socket'
                                     ''.format(model_npernode // 2))
                else:
                    npernode_flag = ('-map-by ppr:{}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)
            model_prog.append(model.exec_path)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{} {} {}'.format(mpi_runcmd, ' '.join(mpi_flags),
                                ' : '.join(mpi_progs))

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd),
                            stdout=f_out,
                            stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # NOTE: This assumes standard .out/.err extensions
                dest = os.path.join(error_log_dir,
                                    fname[:-4] + '.' + job_id + fname[-4:])
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)

    def archive(self):

        if not self.config.get('archive', True):
            print('payu: not archiving due to config.yaml setting.')
            return

        # Check there is a work directory, otherwise bail
        if not os.path.exists(self.work_sym_path):
            sys.exit('payu: error: No work directory to archive.')

        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

        # Remove work symlink
        if os.path.islink(self.work_sym_path):
            os.remove(self.work_sym_path)

        mkdir_p(self.restart_path)

        for model in self.models:
            model.archive()

        # Postprocess the model suite
        if len(self.models) > 1:
            self.model.archive()

        # Double-check that the run path does not exist
        if os.path.exists(self.output_path):
            sys.exit('payu: error: Output path already exists.')

        cmd = 'mv {} {}'.format(self.work_path, self.output_path)
        sp.check_call(shlex.split(cmd))

        # Remove old restart files
        # TODO: Move to subroutine
        restart_freq = self.config.get('restart_freq', default_restart_freq)
        restart_history = self.config.get('restart_history',
                                          default_restart_history)

        # Remove any outdated restart files
        prior_restart_dirs = [
            d for d in os.listdir(self.archive_path) if d.startswith('restart')
        ]

        for res_dir in prior_restart_dirs:

            res_idx = int(res_dir.lstrip('restart'))
            if (self.repeat_run or
                    (res_idx % restart_freq != 0 and
                     res_idx <= self.counter - restart_history)):

                res_path = os.path.join(self.archive_path, res_dir)

                # Only delete real directories; ignore symbolic restart links
                if os.path.isdir(res_path):
                    shutil.rmtree(res_path)

        if self.config.get('collate', True):
            cmd = 'payu collate -i {}'.format(self.counter)
            sp.check_call(shlex.split(cmd))

        if self.config.get('hpctoolkit', False):
            cmd = 'payu profile -i {}'.format(self.counter)
            sp.check_call(shlex.split(cmd))

        archive_script = self.userscripts.get('archive')
        if archive_script:
            self.run_userscript(archive_script)

    def collate(self):

        for model in self.models:
            model.collate()

    def profile(self):
        for model in self.models:
            model.profile()

    def postprocess(self):
        """Submit a postprocessing script after collation"""
        assert self.postscript

        cmd = 'qsub {}'.format(self.postscript)

        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0, 'Postprocessing script submission failed.'

    def remote_archive(self,
                       config_name,
                       archive_url=None,
                       max_rsync_attempts=1,
                       rsync_protocol=None):

        if not archive_url:
            archive_url = default_archive_url

        archive_address = '{usr}@{url}'.format(usr=getpass.getuser(),
                                               url=archive_url)

        ssh_key_path = os.path.join(os.getenv('HOME'), '.ssh',
                                    'id_rsa_file_transfer')

        # Top-level path is implicitly set by the SSH key
        # (Usually /projects/[group])

        # Remote mkdir is currently not possible, so any new subdirectories
        # must be created before auto-archival

        remote_path = os.path.join(self.model_name, config_name, self.name)
        remote_url = '{addr}:{path}'.format(addr=archive_address,
                                            path=remote_path)

        # Rsync output and restart files
        rsync_cmd = ('rsync -a --safe-links -e "ssh -i {}" '
                     ''.format(ssh_key_path))

        if rsync_protocol:
            rsync_cmd += '--protocol={} '.format(rsync_protocol)

        run_cmd = rsync_cmd + '{src} {dst}'.format(src=self.output_path,
                                                   dst=remote_url)
        rsync_calls = [run_cmd]

        if (self.counter % 5) == 0 and os.path.isdir(self.restart_path):
            # Tar restart files before rsyncing
            restart_tar_path = self.restart_path + '.tar.gz'

            cmd = ('tar -C {} -czf {} {}'
                   ''.format(self.archive_path, restart_tar_path,
                             os.path.basename(self.restart_path)))
            sp.check_call(shlex.split(cmd))

            restart_cmd = ('{} {} {}'
                           ''.format(rsync_cmd, restart_tar_path, remote_url))
            rsync_calls.append(restart_cmd)
        else:
            restart_tar_path = None

        for model in self.models:
            for input_path in model.input_paths:
                # Using explicit path separators to rename the input directory
                input_cmd = rsync_cmd + '{} {}'.format(
                    input_path + os.path.sep,
                    os.path.join(remote_url, 'input') + os.path.sep)
                rsync_calls.append(input_cmd)

        for cmd in rsync_calls:
            cmd = shlex.split(cmd)

            for rsync_attempt in range(max_rsync_attempts):
                rc = sp.Popen(cmd).wait()
                if rc == 0:
                    break
                else:
                    print('rsync failed, reattempting')
            assert rc == 0

        # TODO: Temporary; this should be integrated with the rsync call
        if restart_tar_path and os.path.exists(restart_tar_path):
            os.remove(restart_tar_path)

    def resubmit(self):
        next_run = self.counter + 1
        cmd = 'payu run -i {} -n {}'.format(next_run, self.n_runs)
        cmd = shlex.split(cmd)
        sp.call(cmd)

    def run_userscript(self, script_cmd):

        # First try to interpret the argument as a full command:
        try:
            sp.check_call(shlex.split(script_cmd))
        except (OSError, sp.CalledProcessError) as exc:
            # Now try to run the script explicitly
            if type(exc) == OSError and exc.errno == errno.ENOENT:
                cmd = os.path.join(self.control_path, script_cmd)
                # Simplistic recursion check
                assert os.path.isfile(cmd)
                self.run_userscript(cmd)

            # If we get a "non-executable" error, then guess the type
            elif type(exc) == OSError and exc.errno == errno.EACCES:

                # TODO: Move outside
                ext_cmd = {
                    '.py': 'python',
                    '.sh': '/bin/bash',
                    '.csh': '/bin/tcsh'
                }

                _, f_ext = os.path.splitext(script_cmd)
                shell_name = ext_cmd.get(f_ext)
                if shell_name:
                    print('payu: warning: Assuming that {} is a {} script '
                          'based on the filename extension.'
                          ''.format(os.path.basename(script_cmd),
                                    os.path.basename(shell_name)))
                    cmd = ' '.join([shell_name, script_cmd])
                    self.run_userscript(cmd)
                else:
                    # If we can't guess the shell, then abort
                    raise

            # If the script runs but the output is bad, then warn the user
            elif type(exc) == sp.CalledProcessError:
                print('payu: warning: user script \'{}\' failed (error {}).'
                      ''.format(script_cmd, exc.returncode))

            # If all else fails, raise an error
            else:
                raise

    def sweep(self, hard_sweep=False):
        # TODO: Fix the IO race conditions!

        if hard_sweep:
            if os.path.isdir(self.archive_path):
                print('Removing archive path {}'.format(self.archive_path))
                cmd = 'rm -rf {}'.format(self.archive_path)
                cmd = shlex.split(cmd)
                rc = sp.call(cmd)
                assert rc == 0

            if os.path.islink(self.archive_sym_path):
                print('Removing symlink {}'.format(self.archive_sym_path))
                os.remove(self.archive_sym_path)

        if os.path.isdir(self.work_path):
            print('Removing work path {}'.format(self.work_path))
            cmd = 'rm -rf {}'.format(self.work_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.work_sym_path):
            print('Removing symlink {}'.format(self.work_sym_path))
            os.remove(self.work_sym_path)

        # TODO: model outstreams and pbs logs need to be handled separately
        default_job_name = os.path.basename(os.getcwd())
        short_job_name = str(self.config.get('jobname', default_job_name))[:15]

        logs = [
            f for f in os.listdir(os.curdir)
            if os.path.isfile(f) and (
                f.startswith(short_job_name + '.o') or
                f.startswith(short_job_name + '.e') or
                f.startswith(short_job_name[:13] + '_c.o') or
                f.startswith(short_job_name[:13] + '_c.e') or
                f.startswith(short_job_name[:13] + '_p.o') or
                f.startswith(short_job_name[:13] + '_p.e')
            )
        ]

        pbs_log_path = os.path.join(self.archive_path, 'pbs_logs')
        legacy_pbs_log_path = os.path.join(self.control_path, 'pbs_logs')

        if os.path.isdir(legacy_pbs_log_path):
            # TODO: New path may still exist!
            assert not os.path.isdir(pbs_log_path)
            print('payu: Moving pbs_logs to {}'.format(pbs_log_path))
            shutil.move(legacy_pbs_log_path, pbs_log_path)
        else:
            mkdir_p(pbs_log_path)

        for f in logs:
            print('Moving log {}'.format(f))
            shutil.move(f, os.path.join(pbs_log_path, f))

        # Remove stdout/err
        for f in (self.stdout_fname, self.stderr_fname):
            if os.path.isfile(f):
                os.remove(f)
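
Taken together, set_counters and set_output_paths above define payu's archive layout: zero-padded outputNNN and restartNNN directories, with the run counter resuming one past the newest restart. A small self-contained sketch of that numbering rule follows; the archive listing is hypothetical.

# Hypothetical archive listing, mirroring what set_counters scans for.
archive_listing = ['output000', 'output001', 'restart000', 'restart001']

# Same rule as set_counters: resume one past the newest restart, else zero.
restart_indices = [int(d[len('restart'):]) for d in archive_listing
                   if d.startswith('restart')]
counter = 1 + max(restart_indices) if restart_indices else 0

# Same zero-padded naming as set_output_paths.
print('output{0:03}'.format(counter))   # -> output002
print('restart{0:03}'.format(counter))  # -> restart002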
Example #6
class Experiment(object):

    def __init__(self, lab):
        self.lab = lab

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Configuration
        self.expand_shell_vars = True   # TODO: configurable

        # Model run time
        self.runtime = None
        if ('calendar' in self.config and
                'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        self.runlog = Runlog(self)

        # XXX: Temporary spot for the payu path
        #      This is horrible; payu/cli.py does this much more safely!
        #      But also does not even store it in os.environ!
        default_payu_bin = os.path.dirname(sys.argv[0])
        payu_bin = os.environ.get('PAYU_PATH', default_payu_bin)

        self.payu_path = os.path.join(payu_bin, 'payu')

    def init_models(self):

        self.model_name = self.config.get('model')
        assert self.model_name

        model_fields = ['model', 'exe', 'input', 'ncpus', 'npernode', 'build',
                        'mpthreads', 'exe_prefix']

        # TODO: Rename this to self.submodels
        self.models = []

        submodels = self.config.get('submodels', [])

        solo_model = self.config.get('model')
        if not solo_model:
            sys.exit('payu: error: Unknown model configuration.')

        submodel_config = dict((f, self.config[f]) for f in model_fields
                               if f in self.config)
        submodel_config['name'] = solo_model

        submodels.append(submodel_config)

        for m_config in submodels:
            ModelType = model_index[m_config['model']]
            self.models.append(ModelType(self, m_config['name'], m_config))

        # Load the top-level model
        if self.model_name:
            ModelType = model_index[self.model_name]
            model_config = dict((f, self.config[f]) for f in model_fields
                                if f in self.config)
            self.model = ModelType(self, self.model_name, model_config)
            self.model.top_level_model = True
        else:
            self.model = None

    def set_counters(self):
        # Assume that ``set_paths`` has already been called
        assert self.archive_path

        current_counter = os.environ.get('PAYU_CURRENT_RUN')
        if current_counter:
            self.counter = int(current_counter)
        else:
            self.counter = None

        self.n_runs = int(os.environ.get('PAYU_N_RUNS', 1))

        # Initialize counter if unset
        if self.counter is None:
            # TODO: this logic can probably be streamlined
            try:
                restart_dirs = [d for d in os.listdir(self.archive_path)
                                if d.startswith('restart')]
            except OSError as exc:
                if exc.errno == errno.ENOENT:
                    restart_dirs = None
                else:
                    raise

            # First test for restarts
            if restart_dirs:
                self.counter = 1 + max([int(d.lstrip('restart'))
                                        for d in restart_dirs
                                        if d.startswith('restart')])
            else:
                # Repeat runs do not generate restart files, so check outputs
                try:
                    output_dirs = [d for d in os.listdir(self.archive_path)
                                   if d.startswith('output')]
                except OSError as exc:
                    if exc.errno == errno.ENOENT:
                        output_dirs = None
                    else:
                        raise

                # Now test for output directories
                if output_dirs:
                    self.counter = 1 + max([int(d.lstrip('output'))
                                            for d in output_dirs
                                            if d.startswith('output')])
                else:
                    self.counter = 0

    def set_stacksize(self, stacksize):

        if stacksize == 'unlimited':
            stacksize = resource.RLIM_INFINITY
        else:
            assert type(stacksize) is int

        resource.setrlimit(resource.RLIMIT_STACK,
                           (stacksize, resource.RLIM_INFINITY))

    def load_modules(self):
        # NOTE: This function is increasingly irrelevant, and may be removable.

        # Scheduler
        sched_modname = self.config.get('scheduler', 'pbs')
        self.modules.add(sched_modname)

        # MPI library
        mpi_config = self.config.get('mpi', {})

        # Assign MPI module paths
        mpi_modpath = mpi_config.get('modulepath', None)
        if mpi_modpath:
            envmod.module('use', mpi_modpath)

        mpi_modname = mpi_config.get('module', 'openmpi')
        self.modules.add(mpi_modname)

        # Unload non-essential modules
        loaded_mods = os.environ.get('LOADEDMODULES', '').split(':')

        for mod in loaded_mods:
            mod_base = mod.split('/')[0]
            if mod_base not in core_modules:
                envmod.module('unload', mod)

        # Now load model-dependent modules
        for mod in self.modules:
            envmod.module('load', mod)

        # User-defined modules
        user_modules = self.config.get('modules', [])
        for mod in user_modules:
            envmod.module('load', mod)

        envmod.module('list')

        for prof in self.profilers:
            prof.load_modules()

        # TODO: Consolidate this profiling stuff
        c_ipm = self.config.get('ipm', False)
        if c_ipm:
            if isinstance(c_ipm, str):
                ipm_mod = os.path.join('ipm', c_ipm)
            else:
                ipm_mod = 'ipm/2.0.2'

            envmod.module('load', ipm_mod)
            os.environ['IPM_LOGDIR'] = self.work_path

        if self.config.get('mpiP', False):
            envmod.module('load', 'mpiP')

        if self.config.get('hpctoolkit', False):
            envmod.module('load', 'hpctoolkit')

        if self.debug:
            envmod.module('load', 'totalview')

    def set_expt_pathnames(self):

        # Local "control" path
        self.control_path = self.config.get('control', os.getcwd())

        # Experiment name
        self.name = self.config.get('experiment',
                                    os.path.basename(self.control_path))

        # Experiment subdirectories
        self.archive_path = os.path.join(self.lab.archive_path, self.name)
        self.work_path = os.path.join(self.lab.work_path, self.name)

        # Symbolic link paths to output
        self.work_sym_path = os.path.join(self.control_path, 'work')
        self.archive_sym_path = os.path.join(self.control_path, 'archive')

        for model in self.models:
            model.set_model_pathnames()

        # Stream output filenames
        # TODO: per-model output streams?
        self.stdout_fname = self.lab.model_type + '.out'
        self.stderr_fname = self.lab.model_type + '.err'

    def set_output_paths(self):

        # Local archive paths

        # Check to see if we've provided a hard coded path -- valid for collate
        dir_path = os.environ.get('PAYU_DIR_PATH')
        if dir_path is not None:
            self.output_path = os.path.normpath(dir_path)
        else:
            output_dir = 'output{0:03}'.format(self.counter)
            self.output_path = os.path.join(self.archive_path, output_dir)

        # TODO: check case counter == 0
        prior_output_dir = 'output{0:03}'.format(self.counter - 1)
        prior_output_path = os.path.join(self.archive_path, prior_output_dir)
        if os.path.exists(prior_output_path):
            self.prior_output_path = prior_output_path
        else:
            self.prior_output_path = None

        # Local restart paths
        restart_dir = 'restart{0:03}'.format(self.counter)
        self.restart_path = os.path.join(self.archive_path, restart_dir)

        prior_restart_dir = 'restart{0:03}'.format(self.counter - 1)
        prior_restart_path = os.path.join(self.archive_path, prior_restart_dir)
        if os.path.exists(prior_restart_path):
            self.prior_restart_path = prior_restart_path
        else:
            self.prior_restart_path = None
            if self.counter > 0 and not self.repeat_run:
                # TODO: This warning should be replaced with an abort in setup
                print('payu: warning: No restart files found.')

        for model in self.models:
            model.set_model_output_paths()

    def build_model(self):

        self.load_modules()

        for model in self.models:
            model.get_codebase()

        for model in self.models:
            model.build_model()

    def setup(self, force_archive=False):

        # Confirm that no output path already exists
        if os.path.exists(self.output_path):
            sys.exit('payu: error: Output path already exists.')

        mkdir_p(self.work_path)

        if force_archive:
            mkdir_p(self.archive_path)
            make_symlink(self.archive_path, self.archive_sym_path)

        # Archive the payu config
        # TODO: This just copies the existing config.yaml file, but we should
        #       reconstruct a new file including default values
        config_src = os.path.join(self.control_path, 'config.yaml')
        config_dst = self.work_path
        shutil.copy(config_src, config_dst)

        # Stripe directory in Lustre
        # TODO: Make this more configurable
        do_stripe = self.config.get('stripedio', False)
        if do_stripe:
            cmd = 'lfs setstripe -c 8 -s 8m {0}'.format(self.work_path)
            sp.check_call(shlex.split(cmd))

        make_symlink(self.work_path, self.work_sym_path)

        for model in self.models:
            model.setup()

        # Call the macro-model setup
        if len(self.models) > 1:
            self.model.setup()

        setup_script = self.userscripts.get('setup')
        if setup_script:
            self.run_userscript(setup_script)

        # Profiler setup
        expt_profs = self.config.get('profilers', [])
        if not isinstance(expt_profs, list):
            expt_profs = [expt_profs]

        for prof_name in expt_profs:
            ProfType = payu.profilers.index[prof_name]
            prof = ProfType(self)
            self.profilers.append(prof)

            # Testing
            prof.setup()

    def run(self, *user_flags):

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

            model_prog = []

            # Our MPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode // 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)
            model_prog.append(model.exec_path)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(
            runcmd=mpi_runcmd,
            flags=' '.join(mpi_flags),
            exes=' : '.join(mpi_progs)
        )

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # NOTE: This assumes standard .out/.err extensions
                dest = os.path.join(error_log_dir,
                                    fname[:-4] + '.' + job_id + fname[-4:])
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)

    def archive(self):
        if not self.config.get('archive', True):
            print('payu: not archiving due to config.yaml setting.')
            return

        # Check there is a work directory, otherwise bail
        if not os.path.exists(self.work_sym_path):
            sys.exit('payu: error: No work directory to archive.')

        mkdir_p(self.archive_path)
        make_symlink(self.archive_path, self.archive_sym_path)

        # Remove work symlink
        if os.path.islink(self.work_sym_path):
            os.remove(self.work_sym_path)

        mkdir_p(self.restart_path)

        for model in self.models:
            model.archive()

        # Postprocess the model suite
        if len(self.models) > 1:
            self.model.archive()

        # Double-check that the run path does not exist
        if os.path.exists(self.output_path):
            sys.exit('payu: error: Output path already exists.')

        cmd = 'mv {work} {output}'.format(
            work=self.work_path,
            output=self.output_path
        )
        sp.check_call(shlex.split(cmd))

        # Remove old restart files
        # TODO: Move to subroutine
        restart_freq = self.config.get('restart_freq', default_restart_freq)
        restart_history = self.config.get('restart_history',
                                          default_restart_history)

        # Remove any outdated restart files
        prior_restart_dirs = [d for d in os.listdir(self.archive_path)
                              if d.startswith('restart')]

        for res_dir in prior_restart_dirs:

            res_idx = int(res_dir.lstrip('restart'))
            if (self.repeat_run or
                    (not res_idx % restart_freq == 0 and
                     res_idx <= (self.counter - restart_history))):

                res_path = os.path.join(self.archive_path, res_dir)

                # Only delete real directories; ignore symbolic restart links
                if os.path.isdir(res_path):
                    shutil.rmtree(res_path)

        collate_config = self.config.get('collate', {})
        if collate_config.get('enable', True):
            cmd = '{python} {payu} collate -i {expt}'.format(
                python=sys.executable,
                payu=self.payu_path,
                expt=self.counter
            )
            sp.check_call(shlex.split(cmd))

        if self.config.get('hpctoolkit', False):
            cmd = '{python} {payu} profile -i {expt}'.format(
                python=sys.executable,
                payu=self.payu_path,
                expt=self.counter
            )
            sp.check_call(shlex.split(cmd))

        archive_script = self.userscripts.get('archive')
        if archive_script:
            self.run_userscript(archive_script)

    def collate(self):
        for model in self.models:
            model.collate()

    def profile(self):
        for model in self.models:
            model.profile()

    def postprocess(self):
        """Submit a postprocessing script after collation"""
        assert self.postscript

        cmd = 'qsub {script}'.format(script=self.postscript)

        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0, 'Postprocessing script submission failed.'

    def remote_archive(self, config_name, archive_url=None,
                       max_rsync_attempts=1, rsync_protocol=None):

        if not archive_url:
            archive_url = default_archive_url

        archive_address = '{usr}@{url}'.format(usr=getpass.getuser(),
                                               url=archive_url)

        ssh_key_path = os.path.join(os.getenv('HOME'), '.ssh',
                                    'id_rsa_file_transfer')

        # Top-level path is implicitly set by the SSH key
        # (Usually /projects/[group])

        # Remote mkdir is currently not possible, so any new subdirectories
        # must be created before auto-archival

        remote_path = os.path.join(self.model_name, config_name, self.name)
        remote_url = '{addr}:{path}'.format(addr=archive_address,
                                            path=remote_path)

        # Rsync output and restart files
        rsync_cmd = ('rsync -a --safe-links -e "ssh -i {key}" '
                     ''.format(key=ssh_key_path))

        if rsync_protocol:
            rsync_cmd += '--protocol={0} '.format(rsync_protocol)

        run_cmd = rsync_cmd + '{src} {dst}'.format(src=self.output_path,
                                                   dst=remote_url)
        rsync_calls = [run_cmd]

        restart_tar_path = None
        if (self.counter % 5) == 0 and os.path.isdir(self.restart_path):
            # Tar restart files before rsyncing
            restart_tar_path = self.restart_path + '.tar.gz'

            cmd = ('tar -C {0} -czf {1} {2}'
                   ''.format(self.archive_path, restart_tar_path,
                             os.path.basename(self.restart_path)))
            sp.check_call(shlex.split(cmd))

            restart_cmd = ('{0} {1} {2}'
                           ''.format(rsync_cmd, restart_tar_path, remote_url))
            rsync_calls.append(restart_cmd)

        for model in self.models:
            for input_path in model.input_paths:
                # Trailing path separators make rsync copy the directory
                # contents into the remote 'input' directory
                input_cmd = rsync_cmd + '{0} {1}'.format(
                    input_path + os.path.sep,
                    os.path.join(remote_url, 'input') + os.path.sep)
                rsync_calls.append(input_cmd)

        for cmd in rsync_calls:
            cmd = shlex.split(cmd)

            for rsync_attempt in range(max_rsync_attempts):
                rc = sp.call(cmd)
                if rc == 0:
                    break
                else:
                    print('rsync failed, reattempting')
            assert rc == 0

        # TODO: Temporary; this should be integrated with the rsync call
        if restart_tar_path and os.path.exists(restart_tar_path):
            os.remove(restart_tar_path)
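
        # A minimal invocation sketch (argument values are hypothetical):
        #
        #     expt.remote_archive('my_config',
        #                         archive_url='dm.example.org',
        #                         max_rsync_attempts=3)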

    def resubmit(self):
        next_run = self.counter + 1
        cmd = '{python} {payu} run -i {start} -n {n}'.format(
            python=sys.executable,
            payu=self.payu_path,
            start=next_run,
            n=self.n_runs
        )
        cmd = shlex.split(cmd)
        sp.call(cmd)
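        # The expanded command resembles (paths hypothetical):
        #   /usr/bin/python /opt/payu/bin/payu run -i 13 -n 10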

    def run_userscript(self, script_cmd):

        # First try to interpret the argument as a full command:
        try:
            sp.check_call(shlex.split(script_cmd))
        except (OSError, sp.CalledProcessError) as exc:
            # Now try to run the script explicitly
            # NOTE: isinstance is used because Python 3 raises OSError
            # subclasses here (e.g. FileNotFoundError, PermissionError)
            if isinstance(exc, OSError) and exc.errno == errno.ENOENT:
                cmd = os.path.join(self.control_path, script_cmd)
                # Simplistic recursion check
                assert os.path.isfile(cmd)
                self.run_userscript(cmd)

            # If we get a "non-executable" error, then guess the type
            elif isinstance(exc, OSError) and exc.errno == errno.EACCES:

                # TODO: Move outside
                ext_cmd = {'.py': 'python',
                           '.sh': '/bin/bash',
                           '.csh': '/bin/tcsh'}

                _, f_ext = os.path.splitext(script_cmd)
                shell_name = ext_cmd.get(f_ext)
                if shell_name:
                    print('payu: warning: Assuming that {0} is a {1} script '
                          'based on the filename extension.'
                          ''.format(os.path.basename(script_cmd),
                                    os.path.basename(shell_name)))
                    cmd = ' '.join([shell_name, script_cmd])
                    self.run_userscript(cmd)
                else:
                    # If we can't guess the shell, then abort
                    raise

            # If the script runs but the output is bad, then warn the user
            elif isinstance(exc, sp.CalledProcessError):
                print("payu: warning: user script '{0}' failed (error {1})."
                      ''.format(script_cmd, exc.returncode))

            # If all else fails, raise an error
            else:
                raise
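
        # Userscripts come from the 'userscripts' section of the
        # configuration; a hypothetical config.yaml block might be:
        #
        #     userscripts:
        #         init: setup_inputs.py
        #         archive: ./compress_output.sh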

    def sweep(self, hard_sweep=False):
        # TODO: Fix the IO race conditions!

        if hard_sweep:
            if os.path.isdir(self.archive_path):
                print('Removing archive path {0}'.format(self.archive_path))
                cmd = 'rm -rf {0}'.format(self.archive_path)
                cmd = shlex.split(cmd)
                rc = sp.call(cmd)
                assert rc == 0

            if os.path.islink(self.archive_sym_path):
                print('Removing symlink {0}'.format(self.archive_sym_path))
                os.remove(self.archive_sym_path)

        if os.path.isdir(self.work_path):
            print('Removing work path {0}'.format(self.work_path))
            cmd = 'rm -rf {0}'.format(self.work_path)
            cmd = shlex.split(cmd)
            rc = sp.call(cmd)
            assert rc == 0

        if os.path.islink(self.work_sym_path):
            print('Removing symlink {0}'.format(self.work_sym_path))
            os.remove(self.work_sym_path)

        # TODO: model outstreams and pbs logs need to be handled separately
        default_job_name = os.path.basename(os.getcwd())
        short_job_name = str(self.config.get('jobname', default_job_name))[:15]

        logs = [
            f for f in os.listdir(os.curdir) if os.path.isfile(f) and (
                f.startswith(short_job_name + '.o') or
                f.startswith(short_job_name + '.e') or
                f.startswith(short_job_name[:13] + '_c.o') or
                f.startswith(short_job_name[:13] + '_c.e') or
                f.startswith(short_job_name[:13] + '_p.o') or
                f.startswith(short_job_name[:13] + '_p.e')
            )
        ]

        pbs_log_path = os.path.join(self.archive_path, 'pbs_logs')
        legacy_pbs_log_path = os.path.join(self.control_path, 'pbs_logs')

        if os.path.isdir(legacy_pbs_log_path):
            # TODO: New path may still exist!
            assert not os.path.isdir(pbs_log_path)
            print('payu: Moving pbs_logs to {0}'.format(pbs_log_path))
            shutil.move(legacy_pbs_log_path, pbs_log_path)
        else:
            mkdir_p(pbs_log_path)

        for f in logs:
            print('Moving log {0}'.format(f))
            shutil.move(f, os.path.join(pbs_log_path, f))

        # Remove stdout/err
        for f in (self.stdout_fname, self.stderr_fname):
            if os.path.isfile(f):
                os.remove(f)
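
A minimal command-line hook for the sweep method above might look like the following sketch (the Experiment constructor follows Example no. 7 below; the lab argument and the hard flag are assumptions):

def sweepcmd(lab, hard=False):

    expt = Experiment(lab)
    expt.sweep(hard_sweep=hard)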
Example no. 7
    def __init__(self, lab, reproduce=False, force=False):
        self.lab = lab

        if not force:
            # Check the environment for a force flag set under PBS
            # NOTE: environment values are strings, so any non-empty value
            # (even '0') is treated as truthy
            self.force = os.environ.get('PAYU_FORCE', False)
        else:
            self.force = force

        self.start_time = datetime.datetime.now()

        # TODO: replace with dict, check versions via key-value pairs
        self.modules = set()

        # TODO: __init__ should not be a config dumping ground!
        self.config = read_config()

        # Payu experiment type
        self.debug = self.config.get('debug', False)
        self.postscript = self.config.get('postscript')
        self.repeat_run = self.config.get('repeat', False)

        # Configuration
        self.expand_shell_vars = True   # TODO: configurable

        # Model run time
        self.runtime = None
        if ('calendar' in self.config and
                'runtime' in self.config['calendar']):
            self.runtime = self.config['calendar']['runtime']

        # Stacksize
        # NOTE: Possible PBS issue in setting non-unlimited stacksizes
        stacksize = self.config.get('stacksize', 'unlimited')
        self.set_stacksize(stacksize)

        # Initialize the submodels
        self.init_models()

        # TODO: Move to run/collate/sweep?
        self.set_expt_pathnames()
        self.set_counters()

        for model in self.models:
            model.set_input_paths()

        self.set_output_paths()

        if not reproduce:
            # Check the environment for a reproduce flag set under PBS
            # (as with PAYU_FORCE, any non-empty string is truthy)
            reproduce = os.environ.get('PAYU_REPRODUCE', False)

        # Initialize manifest
        self.manifest = Manifest(self.config.get('manifest', {}),
                                 reproduce=reproduce)

        # Miscellaneous configurations
        # TODO: Move this stuff somewhere else
        self.userscripts = self.config.get('userscripts', {})

        self.profilers = []

        init_script = self.userscripts.get('init')
        if init_script:
            self.run_userscript(init_script)

        self.runlog = Runlog(self)

        # XXX: Temporary spot for the payu path
        #      This is horrible; payu/cli.py does this much more safely!
        #      But also does not even store it in os.environ!
        default_payu_bin = os.path.dirname(sys.argv[0])
        payu_bin = os.environ.get('PAYU_PATH', default_payu_bin)

        self.payu_path = os.path.join(payu_bin, 'payu')

        self.run_id = None
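
        # For example, when payu is invoked as /opt/payu/bin/payu (path
        # hypothetical), payu_bin is /opt/payu/bin and self.payu_path is
        # /opt/payu/bin/payu, which the run/collate resubmission commands
        # above then execute.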