def __init__(self, event_list, event, config, filemanager):
    """
    Set up run-manager state and determine the maximum number of
    concurrently running jobs.

    Parameters:
        event_list: shared list used to surface status messages
        event: threading event used to signal shutdown (stored as kill_event)
        config (dict): the global configuration object
        filemanager: the file manager instance for this run
    """
    self.config = config
    self.account = config['global'].get('account', '')
    self.event_list = event_list
    self.filemanager = filemanager
    # `x == True` already yields the bool the original
    # `True if x == True else False` ternary produced
    self.dryrun = config['global']['dryrun'] == True
    self.debug = config['global']['debug'] == True
    self._resource_path = config['global']['resource_path']
    # self.cases is a list of dicts structured as:
    #   case (str): the full case name
    #   jobs (list): a list of job.Jobs
    #   short_name (str): the short name of the case
    # (was a bare triple-quoted string, which is a no-op statement,
    # not documentation attached to anything)
    self.cases = list()
    self.running_jobs = list()
    self.kill_event = event
    self._job_total = 0
    self._job_complete = 0
    self.slurm = Slurm()
    max_jobs = config['global']['max_jobs']
    # default to three concurrent jobs per compute node when no explicit cap is set
    self.max_running_jobs = max_jobs if max_jobs else self.slurm.get_node_number() * 3
    # get_node_number() returns 0 when scontrol cannot be reached; poll
    # until it responds so we never run with a zero job cap
    while self.max_running_jobs == 0:
        sleep(1)
        msg = 'Unable to communication with scontrol, checking again'
        print_line(msg, event_list)
        logging.error(msg)
        self.max_running_jobs = self.slurm.get_node_number() * 3
class RunManager(object): def __init__(self, event_list, event, config, filemanager): self.config = config self.account = config['global'].get('account', '') self.event_list = event_list self.filemanager = filemanager self.dryrun = True if config['global']['dryrun'] == True else False self.debug = True if config['global']['debug'] == True else False self._resource_path = config['global']['resource_path'] """ A list of cases, dictionaries structured as: case (str): the full case name jobs (list): a list of job.Jobs short_name (str): the short name of the case """ self.cases = list() self.running_jobs = list() self.kill_event = event self._job_total = 0 self._job_complete = 0 self.slurm = Slurm() max_jobs = config['global']['max_jobs'] self.max_running_jobs = max_jobs if max_jobs else self.slurm.get_node_number( ) * 6 while self.max_running_jobs == 0: sleep(1) msg = 'Unable to communication with scontrol, checking again' print_line(msg, event_list) logging.error(msg) self.max_running_jobs = self.slurm.get_node_number() * 6 def check_max_running_jobs(self): """ Checks if the maximum number of jobs are running Returns True if the max or more are running, false otherwise """ try: job_info = self.slurm.queue() except: return True else: running_jobs = 0 for job in job_info: if job['STATE'] in ['R', 'PD']: running_jobs += 1 if running_jobs >= self.max_running_jobs: return True return False def add_pp_type_to_cases(self, freqs, job_type, start, end, case, run_type=None):
def test_shownode(self): print '\n' print_message( '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok') slurm = Slurm() node = os.environ['HOSTNAME'].lower().split('.')[0] node_info = slurm.shownode(node) self.assertTrue(node_info['Arch'] == 'x86_64')
def test_shownode(self): print '\n' print_message( '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok') slurm = Slurm() node = 'acme1' node_info = slurm.shownode(node) self.assertTrue(node_info['Arch'] == 'x86_64') self.assertTrue(node_info['CoresPerSocket'] == '24')
def execute(self, dryrun=False):
    """
    Build an ncclimo timeseries batch script and submit it to slurm.

    Parameters:
        dryrun (bool): NOTE(review) — unused in this body; confirm whether
            callers expect a dryrun short-circuit here like sibling jobs have
    Returns:
        0 if the output already exists, otherwise the slurm job id
    """
    # skip entirely if the timeseries output is already present
    if self.postvalidate():
        self.status = JobStatus.COMPLETED
        message = 'Timeseries already computed, skipping'
        self.event_list.push(message=message)
        return 0
    # assemble the ncclimo invocation from the job config
    file_list = self.config['file_list']
    file_list.sort()
    list_string = ' '.join(file_list)
    slurm_command = ' '.join([
        'ncclimo',
        '-a', self.config['annual_mode'],
        '-c', self.config['caseId'],
        '-v', ','.join(self.config['var_list']),
        '-s', str(self.config['start_year']),
        '-e', str(self.config['end_year']),
        '-o', self.config['output_directory'],
        '--map={}'.format(self.config.get('regrid_map_path')),
        list_string
    ])
    # Submitting the job to SLURM: the script name encodes job type and year span
    expected_name = '{type}_{start:04d}_{end:04d}'.format(
        start=self.config.get('start_year'),
        end=self.config.get('end_year'),
        type=self.type)
    run_script = os.path.join(self.config.get('run_scripts_path'),
                              expected_name)
    # remove any stale script from a previous run before rewriting it
    if os.path.exists(run_script):
        os.remove(run_script)
    self.slurm_args['output_file'] = '-o {output_file}'.format(
        output_file=run_script + '.out')
    # one '#SBATCH <arg>' line per configured slurm argument
    slurm_prefix = '\n'.join(
        ['#SBATCH ' + self.slurm_args[s] for s in self.slurm_args]) + '\n'
    with open(run_script, 'w') as batchfile:
        batchfile.write('#!/bin/bash\n')
        batchfile.write(slurm_prefix)
        batchfile.write(slurm_command)
    slurm = Slurm()
    print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
        type=self.type, start=self.start_year, end=self.end_year)
    # --oversubscribe lets the job share nodes with others
    self.job_id = slurm.batch(run_script, '--oversubscribe')
    # status is set optimistically to SUBMITTED; the monitor loop updates it later
    self.status = JobStatus.SUBMITTED
    message = '{type} id: {id} changed state to {state}'.format(
        type=self.type,
        id=self.job_id,
        state=self.status)
    logging.info(message)
    self.event_list.push(message=message)
    return self.job_id
def monitor_running_jobs(self): slurm = Slurm() for job in self.running_jobs: if job.job_id == 0: self.handle_completed_job(job) self.running_jobs.remove(job) continue job_info = slurm.showjob(job.job_id) status = job_info.get('JobState') if not status: print 'No status yet for {}'.format(job.type) continue status = StatusMap[status] if status != job.status: msg = '{job}-{start:04d}-{end:04d}:{id} changed from {s1} to {s2}'.format( job=job.type, start=job.start_year, end=job.end_year, s1=job.status, s2=status, id=job.job_id) print msg self.event_list.push(message=msg) job.status = status if status == JobStatus.RUNNING: job.start_time = datetime.now() for job_set in self.job_sets: if job_set.set_number == job.year_set \ and job_set.status != SetStatus.FAILED: job_set.status = SetStatus.RUNNING break elif status == JobStatus.COMPLETED: job.end_time = datetime.now() self.handle_completed_job(job) self.running_jobs.remove(job) elif status in [JobStatus.FAILED, JobStatus.CANCELLED]: job.end_time = datetime.now() for job_set in self.job_sets: if job_set.set_number == job.year_set: job_set.status = SetStatus.FAILED break self.running_jobs.remove(job)
def __init__(self, event_list, output_path, caseID, scripts_path,
             thread_list, event):
    """Record run configuration and make sure the scripts directory exists."""
    # state handed in by the caller
    self.event_list = event_list
    self.output_path = output_path
    self.caseID = caseID
    self.thread_list = thread_list
    self.kill_event = event
    # internal bookkeeping for job tracking
    self.slurm = Slurm()
    self.job_sets = []
    self.running_jobs = []
    self.monitor_thread = None
    self._dryrun = False
    # ensure there is somewhere to write generated run scripts
    self.scripts_path = scripts_path
    if not os.path.exists(scripts_path):
        os.makedirs(scripts_path)
def test_batch(self): print '\n' print_message( '---- Starting Test: {} ----'.format(inspect.stack()[0][3]), 'ok') slurm = Slurm() command = os.path.join('tests', 'test_slurm_batch.sh') job_id = slurm.batch(command, '-n 1 -N 1') self.assertTrue(job_id) self.assertTrue(isinstance(job_id, int)) info = slurm.showjob(job_id) allowed_states = ['PENDING', 'RUNNING', 'COMPLETE', 'COMPLETING'] self.assertTrue(info['JobState'] in allowed_states) info = slurm.queue() in_queue = False for item in info: if int(item['JOBID']) == job_id: in_queue = True self.assertTrue(item['STATE'] in ['PD', 'R']) break self.assertTrue(in_queue) slurm.cancel(job_id)
def _submit_cmd_to_manager(self, config, cmd):
    """
    Takes the jobs main cmd, generates a batch script and submits the
    script to the resource manager controller

    Parameters:
        cmd (str): the command to submit
        config (dict): the global configuration object
    Returns:
        job_id (int): the job_id from the resource manager, or
            False/True for the dryrun / prevalidate / postvalidate
            short-circuit paths
    """
    # setup for the run script
    scripts_path = os.path.join(
        config['global']['project_path'], 'output', 'scripts')
    # the script name encodes job type, optional run type, year span and case
    if self._run_type is not None:
        run_name = '{type}_{run_type}_{start:04d}_{end:04d}_{case}'.format(
            type=self.job_type,
            run_type=self._run_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name)
    elif isinstance(self, Diag):
        run_name = '{type}_{start:04d}_{end:04d}_{case}_vs_{comp}'.format(
            type=self.job_type,
            run_type=self._run_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name,
            comp=self._short_comp_name)
    else:
        run_name = '{type}_{start:04d}_{end:04d}_{case}'.format(
            type=self.job_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name)
    run_script = os.path.join(scripts_path, run_name)
    self._console_output_path = '{}.out'.format(run_script)
    # remove any stale script from a previous run
    if os.path.exists(run_script):
        os.remove(run_script)

    # prefer slurm, fall back to PBS; `except Exception` replaces the
    # original bare `except:` so SystemExit/KeyboardInterrupt propagate
    try:
        manager = Slurm()
        manager_prefix = '#SBATCH'
        self._manager_args['slurm'].append('-o {}'.format(
            self._console_output_path))
    except Exception:
        try:
            manager = PBS()
            manager_prefix = '#PBS'
            self._manager_args['pbs'].append('-o {}'.format(
                self._console_output_path))
            self._manager_args['pbs'].append('-e {}'.format(
                self._console_output_path.replace('.out', '.err')))
        except Exception:
            raise Exception("No resource manager found")

    # generate the run script using the manager arguments and command
    command = ' '.join(cmd)
    script_prefix = ''
    if isinstance(manager, Slurm):
        margs = self._manager_args['slurm']
    else:
        margs = self._manager_args['pbs']
    for item in margs:
        script_prefix += '{prefix} {value}\n'.format(prefix=manager_prefix,
                                                     value=item)
    with open(run_script, 'w') as batchfile:
        batchfile.write('#!/bin/bash\n')
        batchfile.write(script_prefix)

    # render the environment-loader template around the command so the
    # batch job runs inside the submitting user's conda environment.
    # NOTE(review): render() targets the same run_script written above —
    # confirm render appends/merges rather than truncating the file
    template_input_path = os.path.join(config['global']['resource_path'],
                                       'env_loader.bash')
    variables = {
        'user_env_path': os.environ['CONDA_PREFIX'],
        'cmd': command
    }
    render(
        variables=variables,
        input_path=template_input_path,
        output_path=run_script)

    # if this is a dry run, set the status and exit
    if self._dryrun:
        msg = '{}: dryrun is set, completing without running'.format(
            self.msg_prefix())
        logging.info(msg)
        self.status = JobStatus.COMPLETED
        return False
    else:
        if not self.prevalidate():
            return False
        if self.postvalidate(config):
            self.status = JobStatus.COMPLETED
            return True

    # submit the run script to the resource controller
    self._job_id = manager.batch(run_script)
    self._has_been_executed = True
    return self._job_id
def monitor_running_jobs(self):
    """
    Poll slurm for every entry in self.running_jobs and advance each job's
    state: handle jobs that completed without slurm, validate output for
    jobs slurm no longer knows about, propagate terminal states, and fail
    any jobs that depend on a failed/cancelled one.
    """
    slurm = Slurm()
    # entries are collected here and removed after the loop, so the list
    # is never mutated while it is being iterated
    for_removal = list()
    for item in self.running_jobs:
        job = self.get_job_by_id(item['job_id'])
        if item['slurm_id'] == 0:
            # slurm_id 0 marks a job that finished without a slurm submission
            self._job_complete += 1
            for_removal.append(item)
            job.handle_completion(
                self.filemanager,
                self.event_list,
                self.config)
            self.report_completed_job()
            continue
        try:
            job_info = slurm.showjob(item['slurm_id'])
            if not job_info or job_info.get('JobState') is None:
                # no state reported yet; try again on the next pass
                continue
        except Exception as e:
            # if the job is old enough it wont be in the slurm list anymore
            # which will throw an exception
            self._job_complete += 1
            for_removal.append(item)
            # slurm can no longer tell us anything, so decide success by
            # validating the job's output instead
            valid = job.postvalidate(self.config, event_list=self.event_list)
            if valid:
                job.status = JobStatus.COMPLETED
                job.handle_completion(
                    self.filemanager,
                    self.event_list,
                    self.config)
                self.report_completed_job()
            else:
                line = "slurm lookup error for {job}: {id}".format(
                    job=job.job_type,
                    id=item['job_id'])
                print_line(
                    line=line,
                    event_list=self.event_list)
            continue
        status = StatusMap[job_info.get('JobState')]
        if status != job.status:
            msg = '{prefix}: Job changed from {s1} to {s2}'.format(
                prefix=job.msg_prefix(),
                s1=ReverseMap[job.status],
                s2=ReverseMap[status])
            print_line(msg, self.event_list)
            job.status = status
            if status in [JobStatus.COMPLETED, JobStatus.FAILED,
                          JobStatus.CANCELLED]:
                # terminal state: validate the output before trusting
                # slurm's verdict; a "COMPLETED" job with bad output is
                # downgraded to FAILED
                self._job_complete += 1
                valid = job.postvalidate(self.config,
                                         event_list=self.event_list)
                if not valid:
                    job.status = JobStatus.FAILED
                job.handle_completion(
                    self.filemanager,
                    self.event_list,
                    self.config)
                for_removal.append(item)
                self.report_completed_job()
                if status in [JobStatus.FAILED, JobStatus.CANCELLED]:
                    # anything waiting on this job can never run now
                    for depjob in self.get_jobs_that_depend(job.id):
                        depjob.status = JobStatus.FAILED
    if not for_removal:
        return
    else:
        # drop all finished/failed entries in one pass
        self.running_jobs = [x for x in self.running_jobs
                             if x not in for_removal]
    return
def execute(self, dryrun=False): """ Perform the actual work """ # First check if the job has already been completed if self.postvalidate(): self.status = JobStatus.COMPLETED message = 'AMWG job already computed, skipping' self.event_list.push(message=message) logging.info(message) return 0 # Create directory of regridded climos regrid_path = os.path.join( os.sep.join(self.config['test_path_diag'].split(os.sep)[:-2]), 'climo_regrid') file_list = get_climo_output_files(input_path=regrid_path, start_year=self.start_year, end_year=self.end_year) if not file_list or len(file_list) == 0: print """ ERROR: AMWG: {start:04d}-{end:04d} could not find input climatologies at {path}\n did you add ncclimo to this year_set?""".format(start=self.start_year, end=self.end_year, path=regrid_path) self.status = JobStatus.FAILED return 0 if not os.path.exists(self.config['test_path_climo']): print 'creating temp directory for amwg' os.makedirs(self.config['test_path_climo']) create_symlink_dir(src_dir=regrid_path, src_list=file_list, dst=self.config['test_path_climo']) # Rename the files to the format amwg expects for item in os.listdir(self.config['test_path_climo']): search = re.search(r'\_\d\d\d\d\d\d\_', item) if not search: continue index = search.start() os.rename( os.path.join(self.config['test_path_climo'], item), os.path.join(self.config['test_path_climo'], item[:index] + '_climo.nc')) # render the csh script into the output directory self.output_path = self.config['output_path'] template_out = os.path.join(self.output_path, 'amwg.csh') render(variables=self.config, input_path=self.config.get('template_path'), output_path=template_out) expected_name = '{type}_{start:04d}-{end:04d}'.format( start=self.config.get('start_year'), end=self.config.get('end_year'), type=self.type) # Copy the rendered run script into the scripts directory run_script_template_out = os.path.join( self.config.get('run_scripts_path'), expected_name) copyfile(src=template_out, dst=run_script_template_out) 
# setup sbatch script run_script = os.path.join(self.config.get('run_scripts_path'), expected_name) if os.path.exists(run_script): os.remove(run_script) self.slurm_args['output_file'] = '-o {output_file}'.format( output_file=run_script + '.out') cmd = '\ncsh {template}'.format(template=template_out) slurm_args_str = [ '#SBATCH {value}'.format(value=v) for k, v in self.slurm_args.items() ] slurm_prefix = '\n'.join(slurm_args_str) with open(run_script, 'w') as batchfile: batchfile.write('#!/bin/bash\n') batchfile.write(slurm_prefix) batchfile.write(cmd) if dryrun: self.status = JobStatus.COMPLETED return 0 slurm = Slurm() print 'submitting to queue {type}: {start:04d}-{end:04d}'.format( type=self.type, start=self.start_year, end=self.end_year) self.job_id = slurm.batch(run_script, '--oversubscribe') status = slurm.showjob(self.job_id) self.status = StatusMap[status.get('JobState')] message = '{type} id: {id} changed state to {state}'.format( type=self.type, id=self.job_id, state=self.status) logging.info(message) self.event_list.push(message=message) return self.job_id
def execute(self, dryrun=False):
    """
    Render the acme_diags parameter file, stage regridded climatologies,
    and submit the acme_diags driver to slurm.

    Parameters:
        dryrun (bool): NOTE(review) — unused in this body; confirm whether
            a dryrun short-circuit was intended as in sibling jobs
    Returns:
        0 if the output already exists, otherwise the slurm job id
    """
    # Check if the output already exists
    if self.postvalidate():
        self.status = JobStatus.COMPLETED
        message = 'ACME diags already computed, skipping'
        self.event_list.push(message=message)
        logging.info(message)
        return 0
    # render the parameters file
    self.output_path = self.config['output_path']
    template_out = os.path.join(self.output_path, 'params.py')
    variables = {
        'sets': self.config['sets'],
        'backend': self.config['backend'],
        'reference_data_path': self.config['reference_data_path'],
        'test_data_path': self.config['regrided_climo_path'],
        'test_name': self.config['test_name'],
        'seasons': self.config['seasons'],
        'results_dir': self.config['results_dir']
    }
    render(variables=variables,
           input_path=self.config.get('template_path'),
           output_path=template_out)
    run_name = '{type}_{start:04d}_{end:04d}'.format(
        start=self.config.get('start_year'),
        end=self.config.get('end_year'),
        type=self.type)
    # keep a copy of the rendered parameter file with the run scripts
    template_copy = os.path.join(self.config.get('run_scripts_path'),
                                 run_name)
    copyfile(src=template_out, dst=template_copy)
    # Create directory of regridded climos
    file_list = get_climo_output_files(
        input_path=self.config['regrid_base_path'],
        start_year=self.start_year,
        end_year=self.end_year)
    create_symlink_dir(src_dir=self.config['regrid_base_path'],
                       src_list=file_list,
                       dst=self.config['regrided_climo_path'])
    # setup sbatch script
    # NOTE(review): run_script is the same path as template_copy, so the
    # copy above is deleted and overwritten below — confirm intended
    run_script = os.path.join(self.config.get('run_scripts_path'),
                              run_name)
    if os.path.exists(run_script):
        os.remove(run_script)
    self.slurm_args['output_file'] = '-o {output_file}'.format(
        output_file=run_script + '.out')
    cmd = 'acme_diags_driver.py -p {template}'.format(
        template=template_out)
    # one '#SBATCH <arg>' line per configured slurm argument
    slurm_args_str = [
        '#SBATCH {value}\n'.format(value=v)
        for k, v in self.slurm_args.items()
    ]
    slurm_prefix = ''.join(slurm_args_str)
    with open(run_script, 'w') as batchfile:
        batchfile.write('#!/bin/bash\n')
        batchfile.write(slurm_prefix)
        batchfile.write(cmd)
    slurm = Slurm()
    print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
        type=self.type, start=self.start_year, end=self.end_year)
    self.job_id = slurm.batch(run_script, '--oversubscribe')
    # read the initial queue state straight back from slurm
    # NOTE(review): StatusMap[...] raises KeyError if 'JobState' is
    # missing from the showjob response — confirm showjob always sets it
    status = slurm.showjob(self.job_id)
    self.status = StatusMap[status.get('JobState')]
    message = '{type} id: {id} changed state to {state}'.format(
        type=self.type,
        id=self.job_id,
        state=self.status)
    logging.info(message)
    self.event_list.push(message=message)
    return self.job_id
def _submit_cmd_to_slurm(self, config, cmd):
    """
    Write a batch script wrapping the job's main command and hand it to
    the slurm controller.

    Parameters:
        cmd (str): the command to submit
        config (dict): the global configuration object
    Returns:
        job_id (int): the slurm job_id, or 0 when dryrun is set
    """
    # where generated batch scripts live
    scripts_path = os.path.join(
        config['global']['project_path'], 'output', 'scripts')

    # pick the script-naming scheme for this kind of job
    if self._run_type is not None:
        run_name = '{type}_{run_type}_{start:04d}_{end:04d}_{case}'.format(
            type=self.job_type,
            run_type=self._run_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name)
    elif isinstance(self, Diag):
        run_name = '{type}_{start:04d}_{end:04d}_{case}_vs_{comp}'.format(
            type=self.job_type,
            run_type=self._run_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name,
            comp=self._short_comp_name)
    else:
        run_name = '{type}_{start:04d}_{end:04d}_{case}'.format(
            type=self.job_type,
            start=self.start_year,
            end=self.end_year,
            case=self.short_name)

    run_script = os.path.join(scripts_path, run_name)
    self._console_output_path = '{}.out'.format(run_script)
    # clear out any stale script from an earlier run
    if os.path.exists(run_script):
        os.remove(run_script)

    # assemble the script: shebang, one #SBATCH line per argument, then the command
    self._slurm_args['output_file'] = '-o {output_file}'.format(
        output_file=self._console_output_path)
    header_lines = ['#SBATCH {}\n'.format(val)
                    for key, val in self._slurm_args.items()]
    with open(run_script, 'w') as batchfile:
        batchfile.write('#!/bin/bash\n')
        batchfile.write(''.join(header_lines))
        batchfile.write(' '.join(cmd))

    # dryrun: mark the job complete without touching the queue
    if self._dryrun:
        self.status = JobStatus.COMPLETED
        return 0

    # hand the script to slurm and remember the assigned id
    self._job_id = Slurm().batch(run_script)
    self._has_been_executed = True
    return self._job_id
def execute(self, dryrun=False):
    """
    Render the aprime run_aprime.bash driver, wrap it in a batch script,
    and submit it to slurm.

    Parameters:
        dryrun (bool): NOTE(review) — unused in this body; confirm whether
            a dryrun short-circuit was intended as in sibling jobs
    Returns:
        0 if already computed, -1 / False on input-setup failure,
        otherwise the slurm job id
    """
    # First check if the job has already been completed
    if self.postvalidate():
        self.status = JobStatus.COMPLETED
        message = 'Coupled_diag job already computed, skipping'
        self.event_list.push(message=message)
        return 0
    # create symlinks to the input data
    setup_status = self.setup_input_directory()
    if not setup_status:
        return -1
    elif setup_status == 2:
        return False
    # NOTE(review): set_string is never used below — confirm dead code
    set_string = '{start:04d}_{end:04d}'.format(
        start=self.config.get('start_year'),
        end=self.config.get('end_year'))
    # Setup output directory
    if not os.path.exists(self.config['output_path']):
        os.makedirs(self.config['output_path'])
    # render run template with this job's case/resolution settings
    template_out = os.path.join(self.output_path, 'run_aprime.bash')
    variables = {
        'output_base_dir': self.output_path,
        'test_casename': self.config['experiment'],
        'test_archive_dir': self.config['input_path'],
        'test_atm_res': self.config['test_atm_res'],
        'test_mpas_mesh_name': self.config['test_mpas_mesh_name'],
        'begin_yr': self.start_year,
        'end_yr': self.end_year
    }
    render(variables=variables,
           input_path=self.config['template_path'],
           output_path=template_out)
    # copy the rendered template into the run_scripts directory
    run_name = '{type}_{start:04d}_{end:04d}'.format(start=self.start_year,
                                                     end=self.end_year,
                                                     type=self.type)
    template_copy = os.path.join(self.config.get('run_scripts_path'),
                                 run_name)
    copyfile(src=template_out, dst=template_copy)
    # create the slurm run script
    # NOTE(review): run_script is the same path as template_copy, so the
    # copy above is deleted and overwritten below — confirm intended
    cmd = 'sh {run_aprime}'.format(run_aprime=template_out)
    run_script = os.path.join(self.config.get('run_scripts_path'),
                              run_name)
    if os.path.exists(run_script):
        os.remove(run_script)
    self.slurm_args['out_file'] = '-o {out}'.format(out=run_script + '.out')
    # aprime must be run from its own code directory
    self.slurm_args['working_dir'] = '--workdir {dir}'.format(
        dir=self.config.get('aprime_code_path'))
    # one '#SBATCH <arg>' line per configured slurm argument
    slurm_args = [
        '#SBATCH {}'.format(self.slurm_args[s]) for s in self.slurm_args
    ]
    slurm_prefix = '\n'.join(slurm_args) + '\n'
    with open(run_script, 'w') as batchfile:
        batchfile.write('#!/bin/bash\n')
        batchfile.write(slurm_prefix)
        batchfile.write('export OMP_NUM_THREADS=2\n')
        batchfile.write(cmd)
    slurm = Slurm()
    print 'submitting to queue {type}: {start:04d}-{end:04d}'.format(
        type=self.type, start=self.start_year, end=self.end_year)
    self.job_id = slurm.batch(run_script)
    # read the initial queue state straight back from slurm
    # NOTE(review): StatusMap[...] raises KeyError if 'JobState' is
    # missing from the showjob response — confirm showjob always sets it
    status = slurm.showjob(self.job_id)
    self.status = StatusMap[status.get('JobState')]
    message = "## {job} id: {id} changed status to {status}".format(
        job=self.type, id=self.job_id, status=self.status)
    logging.info(message)
    return self.job_id