Code example #1
    def postprocess(self):
        """Submit a postprocessing script after collation"""
        assert self.postscript
        envmod.setup()
        envmod.module('load', 'pbs')

        cmd = 'qsub {script}'.format(script=self.postscript)

        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0, 'Postprocessing script submission failed.'
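All of the examples on this page depend on payu's envmod helper to bring the environment-modules system into the Python process before shelling out to qsub or mpirun. As a rough mental model only (a hypothetical sketch assuming the standard `modulecmd python` protocol, not payu's actual implementation), the interface they rely on behaves roughly like this:

# Hypothetical sketch of the envmod interface assumed by these examples;
# the real implementation lives in payu's envmod module.
import os
import subprocess


def setup():
    """Locate the environment-modules installation (MODULESHOME)."""
    if not os.environ.get('MODULESHOME'):
        print('envmod: warning: environment modules are unavailable.')


def module(command, *args):
    """Run `modulecmd python <command> <args>` and apply its environment changes."""
    modulecmd = os.path.join(os.environ['MODULESHOME'], 'bin', 'modulecmd')
    output = subprocess.check_output([modulecmd, 'python', command] + list(args))
    exec(output.decode())  # modulecmd emits Python statements that update os.environ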
Code example #2
File: experiment.py Project: marshallward/payu
    def postprocess(self):
        """Submit a postprocessing script after collation"""
        assert self.postscript
        envmod.setup()
        envmod.module('load', 'pbs')

        cmd = 'qsub {script}'.format(script=self.postscript)

        cmd = shlex.split(cmd)
        rc = sp.call(cmd)
        assert rc == 0, 'Postprocessing script submission failed.'
Code example #3
File: reversion.py Project: dkhutch/payu
def repython(version, script_path):
    """Update the Python environment modules to the specified ``version`` and
    replace the current process with an updated Python execution running the
    script specified by ``script_path``.
    """

    # Establish the environment modules
    envmod.setup()

    if not os.environ['MODULESHOME']:
        print('payu: warning: Environment modules unavailable; aborting '
              'reversion.')
        return

    # Ensure that payu is loaded
    try:
        envmod.module('use', os.environ['PAYU_MODULEPATH'])
        envmod.module('load', os.environ['PAYU_MODULENAME'])
    except KeyError:
        pass

    # NOTE: Older versions (<2.7) require the version as a tuple
    version_tuple = tuple(int(i) for i in version.split('.'))
    module_name = os.path.join('python', version)

    python_modules = [
        m for m in os.environ['LOADEDMODULES'].split(':')
        if m.startswith('python')
    ]

    if sys.version_info < version_tuple or module_name not in python_modules:

        # First unload all python (and supporting) modules
        python_modules = [
            m for m in os.environ['LOADEDMODULES'].split(':')
            if m.startswith('python')
        ]

        for mod in python_modules:
            envmod.module('unload', mod)

        # Replace with specified version
        envmod.module('load', module_name)

        # Replace the current python process with the updated version
        os.execl(script_path, *sys.argv)
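A typical call site for repython would be at the very top of a payu launcher, before any version-sensitive imports run. The sketch below is illustrative only; the module path, version string, and use of sys.argv[0] as the script path are assumptions, not taken from the repository:

# Hypothetical usage of repython(): re-exec this script under the python/2.7.6
# module if the interpreter currently running is older.
import sys

from payu import reversion

reversion.repython('2.7.6', sys.argv[0])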
Code example #4
File: reversion.py Project: dkhutch/payu
def repython(version, script_path):
    """Update the Python environment modules to the specified ``version`` and
    replace the current process with an updated Python execution running the
    script specified by ``script_path``.
    """

    # Establish the environment modules
    envmod.setup()

    if not os.environ['MODULESHOME']:
        print('payu: warning: Environment modules unavailable; aborting '
              'reversion.')
        return

    # Ensure that payu is loaded
    try:
        envmod.module('use', os.environ['PAYU_MODULEPATH'])
        envmod.module('load', os.environ['PAYU_MODULENAME'])
    except KeyError:
        pass

    # NOTE: Older versions (<2.7) require the version as a tuple
    version_tuple = tuple(int(i) for i in version.split('.'))
    module_name = os.path.join('python', version)

    python_modules = [m for m in os.environ['LOADEDMODULES'].split(':')
                      if m.startswith('python')]

    if sys.version_info < version_tuple or module_name not in python_modules:

        # First unload all python (and supporting) modules
        python_modules = [m for m in os.environ['LOADEDMODULES'].split(':')
                          if m.startswith('python')]

        for mod in python_modules:
            envmod.module('unload', mod)

        # Replace with specified version
        envmod.module('load', module_name)

        # Replace the current python process with the updated version
        os.execl(script_path, *sys.argv)
Code example #5
File: experiment.py Project: nicjhan/payu
    def run(self, *user_flags):

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path, 'libmpi.so')

            model_prog = []

            # Our MVAPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode / 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)
            model_prog.append(model.exec_path)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(
            runcmd=mpi_runcmd,
            flags=' '.join(mpi_flags),
            exes=' : '.join(mpi_progs)
        )

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        if self.runlog:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # NOTE: This assumes standard .out/.err extensions
                dest = os.path.join(error_log_dir,
                                    fname[:-4] + '.' + job_id + fname[-4:])
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
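For reference, the command string assembled above follows the MPMD form of mpirun: global flags first, then one per-model group (working directory, rank count, executable) joined with ' : '. A toy illustration with made-up paths and CPU counts:

# Toy illustration of the MPMD command built by run(); all values are made up.
mpi_runcmd = 'mpirun'
mpi_flags = ['-report-bindings']
mpi_progs = [
    '-wdir /work/expt/atmosphere -np 192 /lab/bin/atmos.exe',
    '-wdir /work/expt/ocean -np 64 /lab/bin/ocean.exe',
]

cmd = '{runcmd} {flags} {exes}'.format(
    runcmd=mpi_runcmd,
    flags=' '.join(mpi_flags),
    exes=' : '.join(mpi_progs)
)

print(cmd)
# One line of output:
# mpirun -report-bindings -wdir /work/expt/atmosphere -np 192 /lab/bin/atmos.exe : -wdir /work/expt/ocean -np 64 /lab/bin/ocean.exe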
Code example #6
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript the scheduler."""

    # Initialisation
    if pbs_vars is None:
        pbs_vars = {}

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {queue}'.format(queue=pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {project}'.format(project=pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{key}={val}'.format(key=res_key, val=res_val))

        if res_flags:
            pbs_flags.append('-l {res}'.format(res=','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {name}'.format(name=pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {priority}'.format(priority=pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {join}'.format(join=pbs_join))

    # Append environment variables to qsub command
    # TODO: Support full export of environment variables: `qsub -V`
    pbs_vstring = ','.join('{0}={1}'.format(k, v) for k, v in pbs_vars.items())
    pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    if not os.path.isabs(pbs_script):
        # NOTE: PAYU_PATH is always set if `set_env_vars` was called beforehand.
        #       This is currently always true, but is not explicitly enforced.
        #       So this conditional check is a bit redundant.
        payu_bin = pbs_vars.get('PAYU_PATH', os.path.dirname(sys.argv[0]))
        pbs_script = os.path.join(payu_bin, pbs_script)
        assert os.path.isfile(pbs_script)

    # Set up environment modules here for PBS.
    envmod.setup()
    envmod.module('load', 'pbs')

    # Construct job submission command
    cmd = 'qsub {flags} -- {python} {script}'.format(flags=' '.join(pbs_flags),
                                                     python=sys.executable,
                                                     script=pbs_script)
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
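A hedged sketch of how submit_job might be driven. The pbs_config dictionary mirrors the keys read above (queue, project, walltime, ncpus, mem, jobname) and every value, including the script name, is invented for illustration:

# Hypothetical caller of submit_job(); values are illustrative only.
pbs_config = {
    'queue': 'normal',
    'project': 'a12',
    'walltime': '02:00:00',
    'ncpus': 256,
    'mem': '512GB',
    'jobname': 'my_experiment',
}
pbs_vars = {'PAYU_CURRENT_RUN': 5}

submit_job('payu-run', pbs_config, pbs_vars)
# Prints and submits something like:
#   qsub -q normal -P a12 -l walltime=02:00:00 -l ncpus=256 -l mem=512GB
#        -N my_experiment -l wd -j n -v PAYU_CURRENT_RUN=5 -- <python> <.../payu-run>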
Code example #7
    def run(self, *user_flags):

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path_local:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(
                    model.exec_path_local,
                    'libmpi.so'
                )

            model_prog = []

            # Our MVAPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode / 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)

            # Use the full path to symlinked exec_name in work as some
            # older MPI libraries complained executable was not in PATH
            model_prog.append(os.path.join(model.work_path, model.exec_name))

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(
            runcmd=mpi_runcmd,
            flags=' '.join(mpi_flags),
            exes=' : '.join(mpi_progs)
        )

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        # TODO: Consider making this default
        if self.config.get('coredump', False):
            enable_core_dump()

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # Dump out environment
        with open(self.env_fname, 'w') as file:
            file.write(yaml.dump(dict(os.environ), default_flow_style=False))

        self.runlog.create_manifest()
        if self.runlog.enabled:
            self.runlog.commit()

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        print(cmd)
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        f_out.close()
        f_err.close()

        self.finish_time = datetime.datetime.now()

        info = get_job_info()

        if info is None:
            # Not being run under PBS, reverse engineer environment
            info = {
                'PAYU_PATH': os.path.dirname(self.payu_path)
            }

        # Add extra information to save to jobinfo
        info.update(
            {
                'PAYU_CONTROL_DIR': self.control_path,
                'PAYU_RUN_ID': self.run_id,
                'PAYU_CURRENT_RUN': self.counter,
                'PAYU_N_RUNS':  self.n_runs,
                'PAYU_JOB_STATUS': rc,
                'PAYU_START_TIME': self.start_time.isoformat(),
                'PAYU_FINISH_TIME': self.finish_time.isoformat(),
                'PAYU_WALLTIME': "{0} s".format(
                    (self.finish_time - self.start_time).total_seconds()
                ),
            }
        )

        # Dump job info
        with open(self.job_fname, 'w') as file:
            file.write(yaml.dump(info, default_flow_style=False))

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = get_job_id(short=False)

            if job_id == '':
                job_id = str(self.run_id)[:6]

            for fname in self.output_fnames:

                src = os.path.join(self.control_path, fname)

                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    ".".join((stem, job_id)) + suffix)

                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            error_script = self.userscripts.get('error')
            if error_script:
                self.run_userscript(error_script)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in self.output_fnames:
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Code example #8
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript the scheduler."""

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {}'.format(pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {}'.format(pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{}={}'.format(res_key, res_val))

        if res_flags:
            pbs_flags.append('-l {}'.format(','.join(res_flags)))

    pbs_jobname = pbs_config.get('jobname')
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {}'.format(pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {}'.format(pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'oe')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {}'.format(pbs_join))

    if pbs_vars:
        pbs_vstring = ','.join('{}={}'.format(k, v)
                               for k, v in pbs_vars.iteritems())
        pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    # Enable PBS, in case it's not available
    envmod.setup()
    envmod.module('load', 'pbs')

    # If script path does not exist, then check the PATH directories
    if not os.path.isabs(pbs_script):
        for path in os.environ['PATH'].split(':'):
            if os.path.isdir(path) and pbs_script in os.listdir(path):
                pbs_script = os.path.join(path, pbs_script)
                break

    # Construct full command
    cmd = 'qsub {} {}'.format(' '.join(pbs_flags), pbs_script)
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
Code example #9
File: fms.py Project: HoWol76/payu
    def collate(self):

        # Set the stacksize to be unlimited
        res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

        collate_config = self.expt.config.get('collate', {})

        # The mpi flag implies using mppnccombine-fast
        mpi = collate_config.get('mpi', False)

        if mpi:
            # Must use envmod to be able to load mpi modules for collation
            envmod.setup()
            self.expt.load_modules()
            default_exe = 'mppnccombine-fast'
        else:
            default_exe = 'mppnccombine'

        # Locate the FMS collation tool
        # Check config for collate executable
        mppnc_path = collate_config.get('exe')
        if mppnc_path is None:
            for f in os.listdir(self.expt.lab.bin_path):
                if f == default_exe:
                    mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                    break
        else:
            if not os.path.isabs(mppnc_path):
                mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

        assert mppnc_path, 'No mppnccombine program found'

        # Check config for collate command line options
        collate_flags = collate_config.get('flags')
        if collate_flags is None:
            if mpi:
                collate_flags = '-r'
            else:
                collate_flags = '-n4 -z -m -r'

        if mpi:
            # The output file is the first argument after the flags
            # and mppnccombine-fast uses an explicit -o flag to specify
            # the output
            collate_flags = " ".join([collate_flags, '-o'])
            envmod.lib_update(mppnc_path, 'libmpi.so')

        # Import list of collated files to ignore
        collate_ignore = collate_config.get('ignore')
        if collate_ignore is None:
            collate_ignore = []
        elif type(collate_ignore) != list:
            collate_ignore = [collate_ignore]

        # Generate collated file list and identify the first tile
        tile_fnames = {}
        fnames = Fms.get_uncollated_files(self.output_path)
        tile_fnames[self.output_path] = fnames

        print(tile_fnames)

        if (collate_config.get('restart', False) and
                self.prior_restart_path is not None):
            # Add uncollated restart files
            fnames = Fms.get_uncollated_files(self.prior_restart_path)
            tile_fnames[self.prior_restart_path] = fnames

        # mnc_tiles = defaultdict(list)
        mnc_tiles = defaultdict(defaultdict(list).copy)
        for t_dir in tile_fnames:
            for t_fname in tile_fnames[t_dir]:
                t_base, t_ext = os.path.splitext(t_fname)
                t_ext = t_ext.lstrip('.')

                # Skip any files listed in the ignore list
                if t_base in collate_ignore:
                    continue

                mnc_tiles[t_dir][t_base].append(t_fname)

        # print(mnc_tiles)

        if mpi and collate_config.get('glob', True):
            for t_base in mnc_tiles:
                globstr = "{}.*".format(t_base)
                # Try an equivalent glob and check the same files are returned
                mnc_glob = fnmatch.filter(os.listdir(self.output_path),
                                          globstr)
                if mnc_tiles[t_base] == sorted(mnc_glob):
                    mnc_tiles[t_base] = [globstr, ]
                    print("Note: using globstr ({}) for collating {}"
                          .format(globstr, t_base))
                else:
                    print("Warning: cannot use globstr {} to collate {}"
                          .format(globstr, t_base))
                    if len(mnc_tiles[t_base]) > MPI_FORK_MAX_FILE_LIMIT:
                        print("Warning: large number of tiles: {} "
                              .format(len(mnc_tiles[t_base])))
                        print("Warning: collation will be slow and may fail")

        cpucount = int(collate_config.get('ncpus',
                       multiprocessing.cpu_count()))

        if mpi:
            # Default to one for mpi
            nprocesses = int(collate_config.get('threads', 1))
        else:
            nprocesses = int(collate_config.get('threads', cpucount))

        ncpusperprocess = int(cpucount/nprocesses)

        if ncpusperprocess == 1 and mpi:
            print("Warning: running collate with mpirun on a single processor")

        pool = multiprocessing.Pool(processes=nprocesses)

        # Collate each tileset into a single file
        results = []
        codes = []
        outputs = []
        for output_path in mnc_tiles:
            for nc_fname in mnc_tiles[output_path]:
                nc_path = os.path.join(output_path, nc_fname)

                # Remove the collated file if it already exists, since it is
                # probably from a failed collation attempt
                # TODO: Validate this somehow
                if os.path.isfile(nc_path):
                    os.remove(nc_path)

                cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                                ' '.join(mnc_tiles[output_path][nc_fname])])
                if mpi:
                    cmd = "mpirun -n {} {}".format(ncpusperprocess, cmd)

                print(cmd)
                results.append(
                    pool.apply_async(cmdthread, args=(cmd, output_path)))

        pool.close()
        pool.join()

        for result in results:
            rc, op = result.get()
            codes.append(rc)
            outputs.append(op)

        # TODO: Categorise the return codes
        if any(rc is not None for rc in codes):
            for p, rc, op in zip(count(), codes, outputs):
                if rc is not None:
                    print('payu: error: Thread {p} crashed with error code '
                          '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                    print(' Error message:', file=sys.stderr)
                    print(op.decode(), file=sys.stderr)
            sys.exit(-1)
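The collate_config dictionary read at the top of this method comes from the experiment configuration. Expressed as a Python dict, a plausible set of values exercising the MPI path might look like the following; the keys are the ones queried via collate_config.get() above, while the values (including the ignored diagnostic name) are invented:

# Illustrative collate configuration; keys match the collate_config.get()
# calls above, values are made up.
collate_config = {
    'mpi': True,                 # use mppnccombine-fast under mpirun
    'exe': 'mppnccombine-fast',  # resolved against the lab bin_path if relative
    'flags': '-r',
    'ignore': ['ocean_scalar'],  # diagnostic basenames to skip
    'glob': True,                # collapse tile lists into a glob where possible
    'restart': False,            # also collate prior restart files
    'ncpus': 16,
    'threads': 1,
}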
Code example #10
File: fms.py Project: aidanheerdegen/payu
    def collate(self):

        # Set the stacksize to be unlimited
        res.setrlimit(res.RLIMIT_STACK, (res.RLIM_INFINITY, res.RLIM_INFINITY))

        collate_config = self.expt.config.get('collate', {})

        # The mpi flag implies using mppnccombine-fast
        mpi = collate_config.get('mpi', False)

        if mpi:
            # Must use envmod to be able to load mpi modules for collation
            envmod.setup()
            self.expt.load_modules()
            default_exe = 'mppnccombine-fast'
        else:
            default_exe = 'mppnccombine'

        # Locate the FMS collation tool
        # Check config for collate executable
        mppnc_path = collate_config.get('exe')
        if mppnc_path is None:
            for f in os.listdir(self.expt.lab.bin_path):
                if f == default_exe:
                    mppnc_path = os.path.join(self.expt.lab.bin_path, f)
                    break
        else:
            if not os.path.isabs(mppnc_path):
                mppnc_path = os.path.join(self.expt.lab.bin_path, mppnc_path)

        assert mppnc_path, 'No mppnccombine program found'

        # Check config for collate command line options
        collate_flags = collate_config.get('flags')
        if collate_flags is None:
            if mpi:
                collate_flags = '-r'
            else:
                collate_flags = '-n4 -z -m -r'

        if mpi:
            # The output file is the first argument after the flags
            # and mppnccombine-fast uses an explicit -o flag to specify
            # the output
            collate_flags = " ".join([collate_flags, '-o'])
            mpi_module = envmod.lib_update(mppnc_path, 'libmpi.so')

        # Import list of collated files to ignore
        collate_ignore = collate_config.get('ignore')
        if collate_ignore is None:
            collate_ignore = []
        elif type(collate_ignore) != list:
            collate_ignore = [collate_ignore]

        # Generate collated file list and identify the first tile
        tile_fnames = [f for f in os.listdir(self.output_path)
                       if f[-4:].isdigit() and f[-8:-4] == '.nc.']

        tile_fnames.sort()

        mnc_tiles = defaultdict(list)
        for t_fname in tile_fnames:
            t_base, t_ext = os.path.splitext(t_fname)
            t_ext = t_ext.lstrip('.')

            # Skip any files listed in the ignore list
            if t_base in collate_ignore:
                continue

            mnc_tiles[t_base].append(t_fname)

        if mpi and collate_config.get('glob', True):
            for t_base in mnc_tiles:
                globstr = "{}.*".format(t_base)
                # Try an equivalent glob and check the same files are returned
                mnc_glob = fnmatch.filter(os.listdir(self.output_path),
                                          globstr)
                if mnc_tiles[t_base] == sorted(mnc_glob):
                    mnc_tiles[t_base] = [globstr, ]
                    print("Note: using globstr ({}) for collating {}"
                          .format(globstr, t_base))
                else:
                    print("Warning: cannot use globstr {} to collate {}"
                          .format(globstr, t_base))
                    if len(mnc_tiles[t_base]) > MPI_FORK_MAX_FILE_LIMIT:
                        print("Warning: large number of tiles: {} "
                              .format(len(mnc_tiles[t_base])))
                        print("Warning: collation will be slow and may fail")

        cpucount = int(collate_config.get('ncpus',
                       multiprocessing.cpu_count()))

        if mpi:
            # Default to one for mpi
            nprocesses = int(collate_config.get('threads', 1))
        else:
            nprocesses = int(collate_config.get('threads', cpucount))

        ncpusperprocess = int(cpucount/nprocesses)

        if ncpusperprocess == 1 and mpi:
            print("Warning: running collate with mpirun on a single processor")

        pool = multiprocessing.Pool(processes=nprocesses)

        # Collate each tileset into a single file
        results = []
        codes = []
        outputs = []
        for nc_fname in mnc_tiles:
            nc_path = os.path.join(self.output_path, nc_fname)

            # Remove the collated file if it already exists, since it is
            # probably from a failed collation attempt
            # TODO: Validate this somehow
            if os.path.isfile(nc_path):
                os.remove(nc_path)

            cmd = ' '.join([mppnc_path, collate_flags, nc_fname,
                            ' '.join(mnc_tiles[nc_fname])])
            if mpi:

                cmd = "mpirun -n {n} {cmd}".format(
                    n=ncpusperprocess,
                    cmd=cmd
                )

            print(cmd)
            results.append(
                pool.apply_async(cmdthread, args=(cmd, self.output_path)))

        pool.close()
        pool.join()

        for result in results:
            rc, op = result.get()
            codes.append(rc)
            outputs.append(op)

        # TODO: Categorise the return codes
        if any(rc is not None for rc in codes):
            for p, rc, op in zip(count(), codes, outputs):
                if rc is not None:
                    print('payu: error: Thread {p} crashed with error code '
                          '{rc}.'.format(p=p, rc=rc), file=sys.stderr)
                    print(' Error message:', file=sys.stderr)
                    print(op.decode(), file=sys.stderr)
            sys.exit(-1)
Code example #11
File: cli.py Project: aidanheerdegen/payu
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript the scheduler."""

    # Initialisation
    if pbs_vars is None:
        pbs_vars = {}

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {queue}'.format(queue=pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {project}'.format(project=pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{key}={val}'.format(key=res_key, val=res_val))

        if res_flags:
            pbs_flags.append('-l {res}'.format(res=','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {name}'.format(name=pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {priority}'.format(priority=pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {join}'.format(join=pbs_join))

    # Append environment variables to qsub command
    # TODO: Support full export of environment variables: `qsub -V`
    pbs_vstring = ','.join('{0}={1}'.format(k, v)
                           for k, v in pbs_vars.items())
    pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    if not os.path.isabs(pbs_script):
        # NOTE: PAYU_PATH is always set if `set_env_vars` was called beforehand.
        #       This is currently always true, but is not explicitly enforced.
        #       So this conditional check is a bit redundant.
        payu_bin = pbs_vars.get('PAYU_PATH', os.path.dirname(sys.argv[0]))
        pbs_script = os.path.join(payu_bin, pbs_script)
        assert os.path.isfile(pbs_script)

    # Set up environment modules here for PBS.
    envmod.setup()
    envmod.module('load', 'pbs')

    # Construct job submission command
    cmd = 'qsub {flags} -- {python} {script}'.format(
        flags=' '.join(pbs_flags),
        python=sys.executable,
        script=pbs_script
    )
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
Code example #12
File: pbs.py Project: marshallward/payu
def generate_command(pbs_script, pbs_config, pbs_vars=None, python_exe=None):
    """Prepare a correct PBS command string"""

    pbs_env_init()

    # Initialisation
    if pbs_vars is None:
        pbs_vars = {}

    # Necessary for testing
    if python_exe is None:
        python_exe = sys.executable

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {queue}'.format(queue=pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {project}'.format(project=pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{key}={val}'.format(key=res_key, val=res_val))

        if res_flags:
            pbs_flags.append('-l {res}'.format(res=','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {name}'.format(name=pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {priority}'.format(priority=pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {join}'.format(join=pbs_join))

    # Append environment variables to qsub command
    # TODO: Support full export of environment variables: `qsub -V`
    pbs_vstring = ','.join('{0}={1}'.format(k, v)
                           for k, v in pbs_vars.items())
    pbs_flags.append('-v ' + pbs_vstring)

    storages = set()
    storage_config = pbs_config.get('storage', {})
    mounts = set(['/scratch', '/g/data'])
    for mount in storage_config:
        mounts.add(mount)
        for project in storage_config[mount]:
            storages.add(make_mount_string(encode_mount(mount), project))

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    payu_path = pbs_vars.get('PAYU_PATH', os.path.dirname(sys.argv[0]))
    pbs_script = check_exe_path(payu_path, pbs_script)

    # Check for storage paths that might need to be mounted in the
    # python and script paths
    extra_search_paths = [python_exe, payu_path, pbs_script]

    laboratory_path = pbs_config.get('laboratory', None)
    if laboratory_path is not None:
        extra_search_paths.append(laboratory_path)
    short_path = pbs_config.get('shortpath', None)
    if short_path is not None:
        extra_search_paths.append(short_path)

    storages.update(find_mounts(extra_search_paths, mounts))
    storages.update(find_mounts(get_manifest_paths(), mounts))

    # Add storage flags. Note that these are sorted to get predictable
    # behaviour for testing
    pbs_flags_extend = '+'.join(sorted(storages))
    if pbs_flags_extend:
        pbs_flags.append("-l storage={}".format(pbs_flags_extend))

    # Set up environment modules here for PBS.
    envmod.setup()
    envmod.module('load', 'pbs')

    # Construct job submission command
    cmd = 'qsub {flags} -- {python} {script}'.format(
        flags=' '.join(pbs_flags),
        python=python_exe,
        script=pbs_script
    )

    return cmd
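Because generate_command only builds and returns the qsub string (submission happens elsewhere), it is easy to inspect in isolation. A hedged usage sketch with invented configuration values and script name:

# Hypothetical use of generate_command(); configuration values are invented.
pbs_config = {
    'queue': 'express',
    'project': 'a12',
    'walltime': '01:00:00',
    'ncpus': 48,
    'mem': '190GB',
}

cmd = generate_command('payu-run', pbs_config, pbs_vars={'PAYU_N_RUNS': 3})
print(cmd)
# e.g. qsub -q express -P a12 -l walltime=01:00:00 ... -- /path/to/python /path/to/payu-run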
Code example #13
File: experiment.py Project: marshallward/payu
    def run(self, *user_flags):

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path_local:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(
                    model.exec_path_local,
                    'libmpi.so'
                )

            model_prog = []

            # Our MVAPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode / 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)

            # Use the full path to symlinked exec_name in work as some
            # older MPI libraries complained executable was not in PATH
            model_prog.append(os.path.join(model.work_path, model.exec_name))

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(
            runcmd=mpi_runcmd,
            flags=' '.join(mpi_flags),
            exes=' : '.join(mpi_progs)
        )

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        # TODO: Consider making this default
        if self.config.get('coredump', False):
            enable_core_dump()

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # Dump out environment
        with open(self.env_fname, 'w') as file:
            file.write(yaml.dump(dict(os.environ), default_flow_style=False))

        self.runlog.create_manifest()
        if self.runlog.enabled:
            self.runlog.commit()

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        print(cmd)
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd), stdout=f_out, stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        f_out.close()
        f_err.close()

        self.finish_time = datetime.datetime.now()

        info = get_job_info()

        if info is None:
            # Not being run under PBS, reverse engineer environment
            info = {
                'PAYU_PATH': os.path.dirname(self.payu_path)
            }

        # Add extra information to save to jobinfo
        info.update(
            {
                'PAYU_CONTROL_DIR': self.control_path,
                'PAYU_RUN_ID': self.run_id,
                'PAYU_CURRENT_RUN': self.counter,
                'PAYU_N_RUNS':  self.n_runs,
                'PAYU_JOB_STATUS': rc,
                'PAYU_START_TIME': self.start_time.isoformat(),
                'PAYU_FINISH_TIME': self.finish_time.isoformat(),
                'PAYU_WALLTIME': "{0} s".format(
                    (self.finish_time - self.start_time).total_seconds()
                ),
            }
        )

        # Dump job info
        with open(self.job_fname, 'w') as file:
            file.write(yaml.dump(info, default_flow_style=False))

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = get_job_id(short=False)

            if job_id == '':
                job_id = self.run_id[:6]

            for fname in self.output_fnames:

                src = os.path.join(self.control_path, fname)

                stem, suffix = os.path.splitext(fname)
                dest = os.path.join(error_log_dir,
                                    ".".join((stem, job_id)) + suffix)

                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in self.output_fnames:
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
Code example #14
File: cli.py Project: Subhash1998/product_sale
def submit_job(pbs_script, pbs_config, pbs_vars=None):
    """Submit a userscript the scheduler."""

    pbs_flags = []

    pbs_queue = pbs_config.get('queue', 'normal')
    pbs_flags.append('-q {}'.format(pbs_queue))

    pbs_project = pbs_config.get('project', os.environ['PROJECT'])
    pbs_flags.append('-P {}'.format(pbs_project))

    pbs_resources = ['walltime', 'ncpus', 'mem', 'jobfs']

    for res_key in pbs_resources:
        res_flags = []
        res_val = pbs_config.get(res_key)
        if res_val:
            res_flags.append('{}={}'.format(res_key, res_val))

        if res_flags:
            pbs_flags.append('-l {}'.format(','.join(res_flags)))

    # TODO: Need to pass lab.config_path somehow...
    pbs_jobname = pbs_config.get('jobname', os.path.basename(os.getcwd()))
    if pbs_jobname:
        # PBSPro has a 15-character jobname limit
        pbs_flags.append('-N {}'.format(pbs_jobname[:15]))

    pbs_priority = pbs_config.get('priority')
    if pbs_priority:
        pbs_flags.append('-p {}'.format(pbs_priority))

    pbs_flags.append('-l wd')

    pbs_join = pbs_config.get('join', 'n')
    if pbs_join not in ('oe', 'eo', 'n'):
        print('payu: error: unknown qsub IO stream join setting.')
        sys.exit(-1)
    else:
        pbs_flags.append('-j {}'.format(pbs_join))

    if pbs_vars:
        pbs_vstring = ','.join('{}={}'.format(k, v)
                               for k, v in pbs_vars.iteritems())
        pbs_flags.append('-v ' + pbs_vstring)

    # Append any additional qsub flags here
    pbs_flags_extend = pbs_config.get('qsub_flags')
    if pbs_flags_extend:
        pbs_flags.append(pbs_flags_extend)

    # Enable PBS, in case it's not available
    envmod.setup()
    envmod.module('load', 'pbs')

    # If script path does not exist, then check the PATH directories
    if not os.path.isabs(pbs_script):
        for path in os.environ['PATH'].split(':'):
            if os.path.isdir(path) and pbs_script in os.listdir(path):
                pbs_script = os.path.join(path, pbs_script)
                break

    # Construct full command
    cmd = 'qsub {} {}'.format(' '.join(pbs_flags), pbs_script)
    print(cmd)

    subprocess.check_call(shlex.split(cmd))
Code example #15
File: experiment.py Project: strat123123/payu
    def run(self, *user_flags):

        # XXX: This was previously done in reversion
        envmod.setup()

        self.load_modules()

        f_out = open(self.stdout_fname, 'w')
        f_err = open(self.stderr_fname, 'w')

        # Set MPI environment variables
        env = self.config.get('env')

        # Explicitly check for `None`, in case of an empty `env:` entry
        if env is None:
            env = {}

        for var in env:

            if env[var] is None:
                env_value = ''
            else:
                env_value = str(env[var])

            os.environ[var] = env_value

        mpi_config = self.config.get('mpi', {})
        mpi_runcmd = mpi_config.get('runcmd', 'mpirun')

        if self.config.get('scalasca', False):
            mpi_runcmd = ' '.join(['scalasca -analyze', mpi_runcmd])

        # MPI runtime flags
        mpi_flags = mpi_config.get('flags', [])
        if not mpi_flags:
            mpi_flags = self.config.get('mpirun', [])
            # TODO: Legacy config removal warning

        if type(mpi_flags) != list:
            mpi_flags = [mpi_flags]

        # TODO: More uniform support needed here
        if self.config.get('scalasca', False):
            mpi_flags = ['\"{0}\"'.format(f) for f in mpi_flags]

        # XXX: I think this may be broken
        if user_flags:
            mpi_flags.extend(list(user_flags))

        if self.debug:
            mpi_flags.append('--debug')

        mpi_progs = []
        for model in self.models:

            # Skip models without executables (e.g. couplers)
            if not model.exec_path_local:
                continue

            mpi_config = self.config.get('mpi', {})
            mpi_module = mpi_config.get('module', None)

            # Update MPI library module (if not explicitly set)
            # TODO: Check for MPI library mismatch across multiple binaries
            if mpi_module is None:
                mpi_module = envmod.lib_update(model.exec_path_local,
                                               'libmpi.so')

            model_prog = []

            # Our MVAPICH wrapper does not support a working directory flag
            if not mpi_module.startswith('mvapich'):
                model_prog.append('-wdir {0}'.format(model.work_path))

            # Append any model-specific MPI flags
            model_flags = model.config.get('mpiflags', [])
            if not isinstance(model_flags, list):
                model_prog.append(model_flags)
            else:
                model_prog.extend(model_flags)

            model_ncpus = model.config.get('ncpus')
            if model_ncpus:
                model_prog.append('-np {0}'.format(model_ncpus))

            model_npernode = model.config.get('npernode')
            # TODO: New Open MPI format?
            if model_npernode:
                if model_npernode % 2 == 0:
                    npernode_flag = ('-map-by ppr:{0}:socket'
                                     ''.format(model_npernode / 2))
                else:
                    npernode_flag = ('-map-by ppr:{0}:node'
                                     ''.format(model_npernode))

                if self.config.get('scalasca', False):
                    npernode_flag = '\"{0}\"'.format(npernode_flag)
                model_prog.append(npernode_flag)

            if self.config.get('hpctoolkit', False):
                os.environ['HPCRUN_EVENT_LIST'] = 'WALLCLOCK@5000'
                model_prog.append('hpcrun')

            for prof in self.profilers:
                if prof.runscript:
                    model_prog.append(prof.runscript)

            model_prog.append(model.exec_prefix)

            # Use the exec_name (without path) as this is now linked in work
            model_prog.append(model.exec_name)

            mpi_progs.append(' '.join(model_prog))

        cmd = '{runcmd} {flags} {exes}'.format(runcmd=mpi_runcmd,
                                               flags=' '.join(mpi_flags),
                                               exes=' : '.join(mpi_progs))

        for prof in self.profilers:
            cmd = prof.wrapper(cmd)

        # Expand shell variables inside flags
        if self.expand_shell_vars:
            cmd = os.path.expandvars(cmd)

        print(cmd)

        # Our MVAPICH wrapper does not support working directories
        if mpi_module.startswith('mvapich'):
            curdir = os.getcwd()
            os.chdir(self.work_path)
        else:
            curdir = None

        # NOTE: This may not be necessary, since env seems to be getting
        # correctly updated.  Need to look into this.
        if env:
            # TODO: Replace with mpirun -x flag inputs
            proc = sp.Popen(shlex.split(cmd),
                            stdout=f_out,
                            stderr=f_err,
                            env=os.environ.copy())
            proc.wait()
            rc = proc.returncode
        else:
            rc = sp.call(shlex.split(cmd), stdout=f_out, stderr=f_err)

        # Return to control directory
        if curdir:
            os.chdir(curdir)

        self.runlog.create_manifest()
        if self.runlog.enabled:
            self.runlog.commit()

        f_out.close()
        f_err.close()

        # Remove any empty output files (e.g. logs)
        for fname in os.listdir(self.work_path):
            fpath = os.path.join(self.work_path, fname)
            if os.path.getsize(fpath) == 0:
                os.remove(fpath)

        # Clean up any profiling output
        # TODO: Move after `rc` code check?
        for prof in self.profilers:
            prof.postprocess()

        # TODO: Need a model-specific cleanup method call here
        # NOTE: This does not appear to catch hanging jobs killed by PBS
        if rc != 0:
            # Backup logs for failed runs
            error_log_dir = os.path.join(self.archive_path, 'error_logs')
            mkdir_p(error_log_dir)

            # NOTE: This is PBS-specific
            job_id = os.environ.get('PBS_JOBID', '')

            for fname in (self.stdout_fname, self.stderr_fname):
                src = os.path.join(self.control_path, fname)

                # NOTE: This assumes standard .out/.err extensions
                dest = os.path.join(error_log_dir,
                                    fname[:-4] + '.' + job_id + fname[-4:])
                print(src, dest)

                shutil.copyfile(src, dest)

            # Create the symlink to the logs if it does not exist
            make_symlink(self.archive_path, self.archive_sym_path)

            # Terminate payu
            sys.exit('payu: Model exited with error code {0}; aborting.'
                     ''.format(rc))

        # Decrement run counter on successful run
        stop_file_path = os.path.join(self.control_path, 'stop_run')
        if os.path.isfile(stop_file_path):
            assert os.stat(stop_file_path).st_size == 0
            os.remove(stop_file_path)
            print('payu: Stop file detected; terminating resubmission.')
            self.n_runs = 0
        else:
            self.n_runs -= 1

        # Move logs to archive (or delete if empty)
        for f in (self.stdout_fname, self.stderr_fname):
            f_path = os.path.join(self.control_path, f)
            if os.path.getsize(f_path) == 0:
                os.remove(f_path)
            else:
                shutil.move(f_path, self.work_path)

        run_script = self.userscripts.get('run')
        if run_script:
            self.run_userscript(run_script)
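As the stop-file check in these run() methods shows, resubmission can be halted externally by creating an empty stop_run file in the control directory; the file must be empty (it is asserted above), and payu removes it and zeroes n_runs on the next successful run. A minimal sketch, assuming the current working directory is the experiment's control directory:

# Create an empty stop_run file to halt resubmission after the current run.
# Assumes the working directory is the experiment's control directory.
import os

stop_file_path = os.path.join(os.getcwd(), 'stop_run')
open(stop_file_path, 'w').close()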