Example #1
0
    def __init__(self, event_list, event, config, filemanager):
        """
        Set up run-manager state and determine the maximum number of
        concurrent jobs from the global config or the SLURM node count.

        Parameters:
            event_list: shared event list used for status messages
            event: threading event used to signal shutdown (stored as kill_event)
            config (dict): the global configuration object
            filemanager: the file manager instance for this run
        """
        self.config = config
        self.account = config['global'].get('account', '')
        self.event_list = event_list
        self.filemanager = filemanager
        # only a value equal to True enables these flags (bool() keeps the
        # result a plain bool, same as the original ternary)
        self.dryrun = bool(config['global']['dryrun'] == True)
        self.debug = bool(config['global']['debug'] == True)
        self._resource_path = config['global']['resource_path']
        # A list of cases, dictionaries structured as:
        #     case (str): the full case name
        #     jobs (list): a list of job.Jobs
        #     short_name (str): the short name of the case
        self.cases = list()

        self.running_jobs = list()
        self.kill_event = event
        self._job_total = 0
        self._job_complete = 0

        self.slurm = Slurm()
        # fall back to three jobs per node when max_jobs is unset/falsy
        max_jobs = config['global']['max_jobs']
        self.max_running_jobs = max_jobs if max_jobs else self.slurm.get_node_number() * 3
        # get_node_number() returning 0 means scontrol could not be reached;
        # keep retrying until it responds
        while self.max_running_jobs == 0:
            sleep(1)
            msg = 'Unable to communicate with scontrol, checking again'
            print_line(msg, event_list)
            logging.error(msg)
            self.max_running_jobs = self.slurm.get_node_number() * 3
Example #2
0
class RunManager(object):
    def __init__(self, event_list, event, config, filemanager):
        """
        Set up run-manager state and determine the maximum number of
        concurrent jobs from the global config or the SLURM node count.

        Parameters:
            event_list: shared event list used for status messages
            event: threading event used to signal shutdown (stored as kill_event)
            config (dict): the global configuration object
            filemanager: the file manager instance for this run
        """
        self.config = config
        self.account = config['global'].get('account', '')
        self.event_list = event_list
        self.filemanager = filemanager
        # only a value equal to True enables these flags (bool() keeps the
        # result a plain bool, same as the original ternary)
        self.dryrun = bool(config['global']['dryrun'] == True)
        self.debug = bool(config['global']['debug'] == True)
        self._resource_path = config['global']['resource_path']
        # A list of cases, dictionaries structured as:
        #     case (str): the full case name
        #     jobs (list): a list of job.Jobs
        #     short_name (str): the short name of the case
        self.cases = list()

        self.running_jobs = list()
        self.kill_event = event
        self._job_total = 0
        self._job_complete = 0

        self.slurm = Slurm()
        # fall back to six jobs per node when max_jobs is unset/falsy
        max_jobs = config['global']['max_jobs']
        self.max_running_jobs = max_jobs if max_jobs else self.slurm.get_node_number() * 6
        # get_node_number() returning 0 means scontrol could not be reached;
        # keep retrying until it responds
        while self.max_running_jobs == 0:
            sleep(1)
            msg = 'Unable to communicate with scontrol, checking again'
            print_line(msg, event_list)
            logging.error(msg)
            self.max_running_jobs = self.slurm.get_node_number() * 6

    def check_max_running_jobs(self):
        """
        Check whether the maximum number of jobs is already in the queue.

        Returns True if at least max_running_jobs are running (R) or
        pending (PD), or if the queue cannot be read; False otherwise.
        """
        try:
            job_info = self.slurm.queue()
        except Exception:
            # if the queue cannot be read, report "at capacity" so no
            # additional jobs are submitted (narrowed from a bare except so
            # KeyboardInterrupt/SystemExit still propagate)
            return True
        # count jobs that are running or pending, bailing out early once
        # the threshold is reached
        running_jobs = 0
        for job in job_info:
            if job['STATE'] in ('R', 'PD'):
                running_jobs += 1
            if running_jobs >= self.max_running_jobs:
                return True
        return False

    def add_pp_type_to_cases(self,
                             freqs,
                             job_type,
                             start,
                             end,
                             case,
                             run_type=None):
Example #3
0
 def test_shownode(self):
     print '\n'
     print_message(
         '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok')
     slurm = Slurm()
     node = os.environ['HOSTNAME'].lower().split('.')[0]
     node_info = slurm.shownode(node)
     self.assertTrue(node_info['Arch'] == 'x86_64')
Example #4
0
 def test_shownode(self):
     print '\n'
     print_message(
         '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok')
     slurm = Slurm()
     node = 'acme1'
     node_info = slurm.shownode(node)
     self.assertTrue(node_info['Arch'] == 'x86_64')
     self.assertTrue(node_info['CoresPerSocket'] == '24')
Example #5
0
    def execute(self, dryrun=False):
        """
        Calls ncclimo in a subprocess
        """
        if self.postvalidate():
            self.status = JobStatus.COMPLETED
            message = 'Timeseries already computed, skipping'
            self.event_list.push(message=message)
            return 0

        file_list = self.config['file_list']
        file_list.sort()
        list_string = ' '.join(file_list)
        slurm_command = ' '.join([
            'ncclimo', '-a', self.config['annual_mode'], '-c',
            self.config['caseId'], '-v', ','.join(self.config['var_list']),
            '-s',
            str(self.config['start_year']), '-e',
            str(self.config['end_year']), '-o',
            self.config['output_directory'],
            '--map={}'.format(self.config.get('regrid_map_path')), list_string
        ])

        # Submitting the job to SLURM
        expected_name = '{type}_{start:04d}_{end:04d}'.format(
            start=self.config.get('start_year'),
            end=self.config.get('end_year'),
            type=self.type)
        run_script = os.path.join(self.config.get('run_scripts_path'),
                                  expected_name)
        if os.path.exists(run_script):
            os.remove(run_script)

        self.slurm_args['output_file'] = '-o {output_file}'.format(
            output_file=run_script + '.out')
        slurm_prefix = '\n'.join(
            ['#SBATCH ' + self.slurm_args[s] for s in self.slurm_args]) + '\n'

        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(slurm_prefix)
            batchfile.write(slurm_command)

        slurm = Slurm()
        print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
            type=self.type, start=self.start_year, end=self.end_year)
        self.job_id = slurm.batch(run_script, '--oversubscribe')
        self.status = JobStatus.SUBMITTED
        message = '{type} id: {id} changed state to {state}'.format(
            type=self.type, id=self.job_id, state=self.status)
        logging.info(message)
        self.event_list.push(message=message)

        return self.job_id
    def monitor_running_jobs(self):
        slurm = Slurm()
        for job in self.running_jobs:
            if job.job_id == 0:
                self.handle_completed_job(job)
                self.running_jobs.remove(job)
                continue
            job_info = slurm.showjob(job.job_id)
            status = job_info.get('JobState')
            if not status:
                print 'No status yet for {}'.format(job.type)
                continue
            status = StatusMap[status]
            if status != job.status:
                msg = '{job}-{start:04d}-{end:04d}:{id} changed from {s1} to {s2}'.format(
                    job=job.type,
                    start=job.start_year,
                    end=job.end_year,
                    s1=job.status,
                    s2=status,
                    id=job.job_id)
                print msg
                self.event_list.push(message=msg)
                job.status = status

                if status == JobStatus.RUNNING:
                    job.start_time = datetime.now()
                    for job_set in self.job_sets:
                        if job_set.set_number == job.year_set \
                                and job_set.status != SetStatus.FAILED:
                            job_set.status = SetStatus.RUNNING
                            break
                elif status == JobStatus.COMPLETED:
                    job.end_time = datetime.now()
                    self.handle_completed_job(job)
                    self.running_jobs.remove(job)
                elif status in [JobStatus.FAILED, JobStatus.CANCELLED]:
                    job.end_time = datetime.now()
                    for job_set in self.job_sets:
                        if job_set.set_number == job.year_set:
                            job_set.status = SetStatus.FAILED
                            break
                    self.running_jobs.remove(job)
 def __init__(self, event_list, output_path, caseID, scripts_path,
              thread_list, event):
     """Initialize bookkeeping state and create the scripts directory if needed."""
     # external collaborators
     self.slurm = Slurm()
     self.event_list = event_list
     self.thread_list = thread_list
     self.kill_event = event
     # run configuration
     self.output_path = output_path
     self.caseID = caseID
     self._dryrun = False
     # job tracking state
     self.job_sets = []
     self.running_jobs = []
     self.monitor_thread = None
     # make sure the directory for generated run scripts exists
     self.scripts_path = scripts_path
     if not os.path.exists(self.scripts_path):
         os.makedirs(self.scripts_path)
Example #8
0
    def test_batch(self):
        print '\n'
        print_message(
            '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok')
        slurm = Slurm()
        command = os.path.join('tests', 'test_slurm_batch.sh')
        job_id = slurm.batch(command, '-n 1 -N 1')
        self.assertTrue(job_id)
        self.assertTrue(isinstance(job_id, int))

        info = slurm.showjob(job_id)
        allowed_states = ['PENDING', 'RUNNING', 'COMPLETE', 'COMPLETING']
        self.assertTrue(info['JobState'] in allowed_states)

        info = slurm.queue()
        in_queue = False
        for item in info:
            if int(item['JOBID']) == job_id:
                in_queue = True
                self.assertTrue(item['STATE'] in ['PD', 'R'])
                break
        self.assertTrue(in_queue)
        slurm.cancel(job_id)
Example #9
0
    def _submit_cmd_to_manager(self, config, cmd):
        """
        Takes the jobs main cmd, generates a batch script and submits the script
        to the resource manager controller

        Parameters:
            cmd (str): the command to submit
            config (dict): the global configuration object
        Returns:
            job_id (int): the job_id from the resource manager, or
                True/False when the job is skipped (dryrun, prevalidate
                failure, or output already present)
        """
        # setup for the run script
        scripts_path = os.path.join(config['global']['project_path'], 'output',
                                    'scripts')
        if self._run_type is not None:
            run_name = '{type}_{run_type}_{start:04d}_{end:04d}_{case}'.format(
                type=self.job_type,
                run_type=self._run_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name)
        elif isinstance(self, Diag):
            # _run_type is None in this branch, so it is not part of the name
            run_name = '{type}_{start:04d}_{end:04d}_{case}_vs_{comp}'.format(
                type=self.job_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name,
                comp=self._short_comp_name)
        else:
            run_name = '{type}_{start:04d}_{end:04d}_{case}'.format(
                type=self.job_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name)
        run_script = os.path.join(scripts_path, run_name)
        self._console_output_path = '{}.out'.format(run_script)
        if os.path.exists(run_script):
            os.remove(run_script)

        # prefer SLURM; fall back to PBS when SLURM is unavailable.
        # narrowed from bare except so KeyboardInterrupt/SystemExit propagate
        try:
            manager = Slurm()
            manager_prefix = '#SBATCH'
            self._manager_args['slurm'].append('-o {}'.format(
                self._console_output_path))
        except Exception:
            try:
                manager = PBS()
                manager_prefix = '#PBS'
                self._manager_args['pbs'].append('-o {}'.format(
                    self._console_output_path))
                self._manager_args['pbs'].append('-e {}'.format(
                    self._console_output_path.replace('.out', '.err')))
            except Exception:
                raise Exception("No resource manager found")

        # generate the run script using the manager arguments and command
        command = ' '.join(cmd)
        script_prefix = ''

        if isinstance(manager, Slurm):
            margs = self._manager_args['slurm']
        else:
            margs = self._manager_args['pbs']
        for item in margs:
            script_prefix += '{prefix} {value}\n'.format(prefix=manager_prefix,
                                                         value=item)

        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(script_prefix)

        # render the environment-loader template with the command into the
        # run script. NOTE(review): render targets run_script, which was just
        # written above -- confirm render appends rather than truncates
        template_input_path = os.path.join(config['global']['resource_path'],
                                           'env_loader.bash')
        variables = {
            'user_env_path': os.environ['CONDA_PREFIX'],
            'cmd': command
        }
        render(variables=variables,
               input_path=template_input_path,
               output_path=run_script)

        # if this is a dry run, set the status and exit
        if self._dryrun:
            msg = '{}: dryrun is set, completing without running'.format(
                self.msg_prefix())
            logging.info(msg)
            self.status = JobStatus.COMPLETED
            return False
        else:
            if not self.prevalidate():
                return False
            if self.postvalidate(config):
                self.status = JobStatus.COMPLETED
                return True

        # submit the run script to the resource controller
        self._job_id = manager.batch(run_script)
        self._has_been_executed = True
        return self._job_id
Example #10
0
    def monitor_running_jobs(self):
        """
        Poll the SLURM queue for each entry in self.running_jobs, update the
        corresponding job's status, run completion handling, and drop
        finished entries from the running list.
        """
        slurm = Slurm()
        # entries to drop from self.running_jobs after the loop; removing
        # during iteration would skip elements
        for_removal = list()
        for item in self.running_jobs:
            job = self.get_job_by_id(item['job_id'])
            # slurm_id 0 marks a job that was never submitted to slurm;
            # treat it as complete immediately
            if item['slurm_id'] == 0:
                self._job_complete += 1
                for_removal.append(item)
                job.handle_completion(
                    self.filemanager,
                    self.event_list,
                    self.config)
                self.report_completed_job()
                continue
            try:
                job_info = slurm.showjob(item['slurm_id'])
                if not job_info or job_info.get('JobState') is None:
                    continue
            except Exception as e:
                # if the job is old enough it wont be in the slurm list anymore
                # which will throw an exception
                self._job_complete += 1
                for_removal.append(item)

                # fall back to validating the job's output to decide whether
                # it finished successfully before it aged out of the queue
                valid = job.postvalidate(self.config, event_list=self.event_list)
                if valid:
                    job.status = JobStatus.COMPLETED
                    job.handle_completion(
                        self.filemanager,
                        self.event_list,
                        self.config)
                    self.report_completed_job()
                else:
                    line = "slurm lookup error for {job}: {id}".format(
                        job=job.job_type,
                        id=item['job_id'])
                    print_line(
                        line=line,
                        event_list=self.event_list)
                continue
            status = StatusMap[job_info.get('JobState')]
            # only act when the state reported by slurm differs from ours
            if status != job.status:
                msg = '{prefix}: Job changed from {s1} to {s2}'.format(
                        prefix=job.msg_prefix(),
                        s1=ReverseMap[job.status],
                        s2=ReverseMap[status])
                print_line(msg, self.event_list)
                job.status = status

                if status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED]:
                    self._job_complete += 1
                    # postvalidate can downgrade a COMPLETED job to FAILED
                    # when its output is missing or invalid
                    valid = job.postvalidate(self.config, event_list=self.event_list)
                    if not valid:
                        job.status = JobStatus.FAILED
                    job.handle_completion(
                        self.filemanager,
                        self.event_list,
                        self.config)
                    for_removal.append(item)
                    self.report_completed_job()
                    if status in [JobStatus.FAILED, JobStatus.CANCELLED]:
                        # cascade the failure to jobs that depend on this one
                        for depjob in self.get_jobs_that_depend(job.id):
                            depjob.status = JobStatus.FAILED
        if not for_removal:
            return
        else:
            self.running_jobs = [x for x in self.running_jobs if x not in for_removal]
        return
Example #11
0
    def execute(self, dryrun=False):
        """
        Run the AMWG diagnostic: symlink and rename the regridded
        climatology files into the layout amwg expects, render the csh run
        script, and submit it to SLURM.

        Parameters:
            dryrun (bool): when True, write the run scripts but do not submit

        Returns:
            0 when the job is skipped, fails setup, or dryrun is set;
            otherwise the SLURM job id
        """
        # First check if the job has already been completed
        if self.postvalidate():
            self.status = JobStatus.COMPLETED
            message = 'AMWG job already computed, skipping'
            self.event_list.push(message=message)
            logging.info(message)
            return 0

        # Create directory of regridded climos

        # regrid_path is two directory levels above test_path_diag,
        # plus 'climo_regrid'
        regrid_path = os.path.join(
            os.sep.join(self.config['test_path_diag'].split(os.sep)[:-2]),
            'climo_regrid')
        file_list = get_climo_output_files(input_path=regrid_path,
                                           start_year=self.start_year,
                                           end_year=self.end_year)
        if not file_list or len(file_list) == 0:
            print """
ERROR: AMWG: {start:04d}-{end:04d} could not find input climatologies at {path}\n
did you add ncclimo to this year_set?""".format(start=self.start_year,
                                                end=self.end_year,
                                                path=regrid_path)
            self.status = JobStatus.FAILED
            return 0
        if not os.path.exists(self.config['test_path_climo']):
            print 'creating temp directory for amwg'
            os.makedirs(self.config['test_path_climo'])
        create_symlink_dir(src_dir=regrid_path,
                           src_list=file_list,
                           dst=self.config['test_path_climo'])

        # Rename the files to the format amwg expects:
        # strip the six-digit _YYYYMM_ stamp and append '_climo.nc'
        for item in os.listdir(self.config['test_path_climo']):
            search = re.search(r'\_\d\d\d\d\d\d\_', item)
            if not search:
                continue
            index = search.start()
            os.rename(
                os.path.join(self.config['test_path_climo'], item),
                os.path.join(self.config['test_path_climo'],
                             item[:index] + '_climo.nc'))

        # render the csh script into the output directory
        self.output_path = self.config['output_path']
        template_out = os.path.join(self.output_path, 'amwg.csh')
        render(variables=self.config,
               input_path=self.config.get('template_path'),
               output_path=template_out)

        expected_name = '{type}_{start:04d}-{end:04d}'.format(
            start=self.config.get('start_year'),
            end=self.config.get('end_year'),
            type=self.type)
        # Copy the rendered run script into the scripts directory
        run_script_template_out = os.path.join(
            self.config.get('run_scripts_path'), expected_name)
        copyfile(src=template_out, dst=run_script_template_out)

        # setup sbatch script

        run_script = os.path.join(self.config.get('run_scripts_path'),
                                  expected_name)
        if os.path.exists(run_script):
            os.remove(run_script)

        self.slurm_args['output_file'] = '-o {output_file}'.format(
            output_file=run_script + '.out')
        cmd = '\ncsh {template}'.format(template=template_out)
        slurm_args_str = [
            '#SBATCH {value}'.format(value=v)
            for k, v in self.slurm_args.items()
        ]
        slurm_prefix = '\n'.join(slurm_args_str)
        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(slurm_prefix)
            batchfile.write(cmd)

        # dryrun: scripts are written but nothing is submitted
        if dryrun:
            self.status = JobStatus.COMPLETED
            return 0

        slurm = Slurm()
        print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
            type=self.type, start=self.start_year, end=self.end_year)
        self.job_id = slurm.batch(run_script, '--oversubscribe')

        # record the initial state reported by slurm
        status = slurm.showjob(self.job_id)
        self.status = StatusMap[status.get('JobState')]
        message = '{type} id: {id} changed state to {state}'.format(
            type=self.type, id=self.job_id, state=self.status)
        logging.info(message)
        self.event_list.push(message=message)

        return self.job_id
Example #12
0
    def execute(self, dryrun=False):

        # Check if the output already exists
        if self.postvalidate():
            self.status = JobStatus.COMPLETED
            message = 'ACME diags already computed, skipping'
            self.event_list.push(message=message)
            logging.info(message)
            return 0
        # render the parameters file
        self.output_path = self.config['output_path']
        template_out = os.path.join(self.output_path, 'params.py')
        variables = {
            'sets': self.config['sets'],
            'backend': self.config['backend'],
            'reference_data_path': self.config['reference_data_path'],
            'test_data_path': self.config['regrided_climo_path'],
            'test_name': self.config['test_name'],
            'seasons': self.config['seasons'],
            'results_dir': self.config['results_dir']
        }
        render(variables=variables,
               input_path=self.config.get('template_path'),
               output_path=template_out)

        run_name = '{type}_{start:04d}_{end:04d}'.format(
            start=self.config.get('start_year'),
            end=self.config.get('end_year'),
            type=self.type)
        template_copy = os.path.join(self.config.get('run_scripts_path'),
                                     run_name)
        copyfile(src=template_out, dst=template_copy)

        # Create directory of regridded climos
        file_list = get_climo_output_files(
            input_path=self.config['regrid_base_path'],
            start_year=self.start_year,
            end_year=self.end_year)
        create_symlink_dir(src_dir=self.config['regrid_base_path'],
                           src_list=file_list,
                           dst=self.config['regrided_climo_path'])

        # setup sbatch script
        run_script = os.path.join(self.config.get('run_scripts_path'),
                                  run_name)
        if os.path.exists(run_script):
            os.remove(run_script)

        self.slurm_args['output_file'] = '-o {output_file}'.format(
            output_file=run_script + '.out')

        cmd = 'acme_diags_driver.py -p {template}'.format(
            template=template_out)

        slurm_args_str = [
            '#SBATCH {value}\n'.format(value=v)
            for k, v in self.slurm_args.items()
        ]
        slurm_prefix = ''.join(slurm_args_str)
        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(slurm_prefix)
            batchfile.write(cmd)

        slurm = Slurm()
        print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
            type=self.type, start=self.start_year, end=self.end_year)
        self.job_id = slurm.batch(run_script, '--oversubscribe')
        status = slurm.showjob(self.job_id)
        self.status = StatusMap[status.get('JobState')]
        message = '{type} id: {id} changed state to {state}'.format(
            type=self.type, id=self.job_id, state=self.status)
        logging.info(message)
        self.event_list.push(message=message)

        return self.job_id
Example #13
0
    def _submit_cmd_to_slurm(self, config, cmd):
        """
        Takes the jobs main cmd, generates a batch script and submits the script
        to the slurm controller

        Parameters:
            cmd (str): the command to submit
            config (dict): the global configuration object
        Returns:
            job_id (int): the slurm job_id, or 0 when dryrun is set
        """
        # setup for the run script
        scripts_path = os.path.join(config['global']['project_path'], 'output',
                                    'scripts')
        if self._run_type is not None:
            run_name = '{type}_{run_type}_{start:04d}_{end:04d}_{case}'.format(
                type=self.job_type,
                run_type=self._run_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name)
        elif isinstance(self, Diag):
            # _run_type is None in this branch, so it is not part of the name
            run_name = '{type}_{start:04d}_{end:04d}_{case}_vs_{comp}'.format(
                type=self.job_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name,
                comp=self._short_comp_name)
        else:
            run_name = '{type}_{start:04d}_{end:04d}_{case}'.format(
                type=self.job_type,
                start=self.start_year,
                end=self.end_year,
                case=self.short_name)
        run_script = os.path.join(scripts_path, run_name)
        self._console_output_path = '{}.out'.format(run_script)
        if os.path.exists(run_script):
            os.remove(run_script)

        # generate the run script using the slurm arguments and command
        slurm_command = ' '.join(cmd)
        self._slurm_args['output_file'] = '-o {output_file}'.format(
            output_file=self._console_output_path)
        slurm_prefix = ''
        for val in self._slurm_args.values():
            slurm_prefix += '#SBATCH {}\n'.format(val)

        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(slurm_prefix)
            batchfile.write(slurm_command)

        # if this is a dry run, set the status and exit without submitting
        if self._dryrun:
            self.status = JobStatus.COMPLETED
            return 0

        # submit the run script to the slurm controller
        slurm = Slurm()
        self._job_id = slurm.batch(run_script)
        self._has_been_executed = True
        return self._job_id
Example #14
0
    def execute(self, dryrun=False):
        """
        Run the a-prime (coupled) diagnostics: symlink the input data,
        render the run_aprime.bash template, write an sbatch wrapper and
        submit it to SLURM.

        Parameters:
            dryrun (bool): accepted for interface consistency; not
                referenced in this body

        Returns:
            0 when already computed, -1 or False when input setup fails,
            otherwise the SLURM job id
        """
        # First check if the job has already been completed
        if self.postvalidate():
            self.status = JobStatus.COMPLETED
            message = 'Coupled_diag job already computed, skipping'
            self.event_list.push(message=message)
            return 0

        # create symlinks to the input data
        setup_status = self.setup_input_directory()
        if not setup_status:
            return -1
        elif setup_status == 2:
            return False

        set_string = '{start:04d}_{end:04d}'.format(
            start=self.config.get('start_year'),
            end=self.config.get('end_year'))

        # Setup output directory
        if not os.path.exists(self.config['output_path']):
            os.makedirs(self.config['output_path'])

        # render run template
        template_out = os.path.join(self.output_path, 'run_aprime.bash')
        variables = {
            'output_base_dir': self.output_path,
            'test_casename': self.config['experiment'],
            'test_archive_dir': self.config['input_path'],
            'test_atm_res': self.config['test_atm_res'],
            'test_mpas_mesh_name': self.config['test_mpas_mesh_name'],
            'begin_yr': self.start_year,
            'end_yr': self.end_year
        }
        render(variables=variables,
               input_path=self.config['template_path'],
               output_path=template_out)

        # copy the template into the run_scripts directory
        run_name = '{type}_{start:04d}_{end:04d}'.format(start=self.start_year,
                                                         end=self.end_year,
                                                         type=self.type)
        template_copy = os.path.join(self.config.get('run_scripts_path'),
                                     run_name)
        copyfile(src=template_out, dst=template_copy)

        # create the slurm run script that invokes the rendered template
        cmd = 'sh {run_aprime}'.format(run_aprime=template_out)

        run_script = os.path.join(self.config.get('run_scripts_path'),
                                  run_name)
        if os.path.exists(run_script):
            os.remove(run_script)

        # slurm runs the script from the aprime code directory
        self.slurm_args['out_file'] = '-o {out}'.format(out=run_script +
                                                        '.out')
        self.slurm_args['working_dir'] = '--workdir {dir}'.format(
            dir=self.config.get('aprime_code_path'))
        slurm_args = [
            '#SBATCH {}'.format(self.slurm_args[s]) for s in self.slurm_args
        ]
        slurm_prefix = '\n'.join(slurm_args) + '\n'

        with open(run_script, 'w') as batchfile:
            batchfile.write('#!/bin/bash\n')
            batchfile.write(slurm_prefix)
            batchfile.write('export OMP_NUM_THREADS=2\n')
            batchfile.write(cmd)

        # submit and record the initial state reported by slurm
        slurm = Slurm()
        print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
            type=self.type, start=self.start_year, end=self.end_year)
        self.job_id = slurm.batch(run_script)
        status = slurm.showjob(self.job_id)
        self.status = StatusMap[status.get('JobState')]
        message = "## {job} id: {id} changed status to {status}".format(
            job=self.type, id=self.job_id, status=self.status)
        logging.info(message)

        return self.job_id