def _delete_job(job_state):
    """Remove all traces of the given job.

    Cancels the job with the batch scheduler if it may still be active,
    then deletes the job's persisted state file and run directory and
    empties the state object.

    Args:
        job_state (JobState): State object for the job to remove.
    """
    cancelable = ('Scheduled', 'Queued', 'Running')
    if job_state['state'] in cancelable:
        # Job may still be live on the cluster; ask the scheduler to cancel
        # it before removing any on-disk artifacts.
        config_path = os.path.join(pce_root, 'bin', 'onramp_pce_config.cfg')
        spec_path = os.path.join(pce_root, 'src', 'configspecs',
                                 'onramp_pce_config.cfgspec')
        config = ConfigObj(config_path, configspec=spec_path)
        config.validate(Validator())
        batch_sched = Scheduler(config['cluster']['batch_scheduler'])
        cancel_result = batch_sched.cancel_job(job_state['scheduler_job_num'])
        _logger.debug('Cancel job output: %s' % cancel_result[1])

    # Drop the persisted state file for this job.
    os.remove(os.path.join(_job_state_dir, str(job_state['job_id'])))

    # Drop the job's run directory, if any.
    ident = (job_state['username'], job_state['mod_name'],
             job_state['mod_id'], job_state['run_name'])
    shutil.rmtree(os.path.join(pce_root, 'users/%s/%s_%d/%s' % ident),
                  ignore_errors=True)

    job_state.clear()
def job_run(job_id, job_state_file=None):
    """Write the job's batch script and submit it to the batch scheduler.

    Args:
        job_id (int): Id of the job to run.
        job_state_file (str): Optional explicit path to the job's state
            file; passed through to JobState.

    Returns:
        Tuple with 0th position being error code and 1st position being
        string indication of status.
    """
    # Determine batch scheduler to user from config.
    cfg = ConfigObj(os.path.join(pce_root, 'bin', 'onramp_pce_config.cfg'),
                    configspec=os.path.join(pce_root, 'src', 'configspecs',
                                            'onramp_pce_config.cfgspec'))
    cfg.validate(Validator())
    scheduler = Scheduler(cfg['cluster']['batch_scheduler'])

    # Remember where we were so we can chdir back before returning.
    ret_dir = os.getcwd()
    with JobState(job_id, job_state_file) as job_state:
        run_dir = job_state['run_dir']
        run_name = job_state['run_name']
    os.chdir(run_dir)

    # Write batch script.
    with open('script.sh', 'w') as f:
        f.write(scheduler.get_batch_script(run_name))

    # Schedule job.
    result = scheduler.schedule(run_dir)
    if result['status_code'] != 0:
        _logger.error(result['msg'])
        with JobState(job_id, job_state_file) as job_state:
            job_state['state'] = 'Schedule failed'
            job_state['error'] = result['msg']
            os.chdir(ret_dir)
            # Honor a deletion request that arrived while we were working.
            if job_state['_marked_for_del']:
                _delete_job(job_state)
                return (-2, 'Job %d deleted' % job_id)
        return (result['returncode'], result['msg'])

    with JobState(job_id, job_state_file) as job_state:
        job_state['state'] = 'Scheduled'
        job_state['error'] = None
        job_state['scheduler_job_num'] = result['job_num']
        os.chdir(ret_dir)
        # Honor a deletion request that arrived while we were working.
        if job_state['_marked_for_del']:
            _delete_job(job_state)
            return (-2, 'Job %d deleted' % job_id)
    return (0, 'Job scheduled')
def _build_job(job_id, job_state_file=None):
    """Launch actions required to maintain job state and/or curate job
    results and return the state.

    When current job state (as a function of both PCE state tracking and
    scheduler output) warrants, initiate job postprocessing and/or status
    checking prior to building and returning state.

    Args:
        job_id (int): Id of the job to get state for.
        job_state_file (str): Optional explicit path to the job's state
            file; passed through to JobState.

    Returns:
        OnRamp formatted dictionary containing job attrs; {} when the job
        has no recorded state.
    """
    # States for which the batch scheduler must be polled for fresh status.
    status_check_states = ['Scheduled', 'Queued', 'Running']
    with JobState(job_id, job_state_file) as job_state:
        _logger.debug('Building at %s' % time.time())
        if 'state' not in job_state.keys():
            # Nothing recorded for this job yet.
            _logger.debug('No state at %s' % time.time())
            _logger.debug('job_state keys: %s' % job_state.keys())
            return {}
        if job_state['state'] in status_check_states:
            # Ask the configured batch scheduler for this job's status.
            specfile = os.path.join(pce_root, 'src', 'configspecs',
                                    'onramp_pce_config.cfgspec')
            cfg = ConfigObj(os.path.join(pce_root, 'bin',
                                         'onramp_pce_config.cfg'),
                            configspec=specfile)
            cfg.validate(Validator())
            scheduler = Scheduler(cfg['cluster']['batch_scheduler'])
            sched_job_num = job_state['scheduler_job_num']
            job_status = scheduler.check_status(sched_job_num)
            # Bad.
            if job_status[0] != 0:
                _logger.debug('Bad job status: %s' % job_status[1])
                job_state['state'] = 'Run failed'
                job_state['error'] = job_status[1]
                if job_status[0] != -2:
                    # Non-(-2) error codes carry a state string in
                    # job_status[1] — presumably scheduler-specific; verify
                    # against Scheduler.check_status.
                    job_state['state'] = job_status[1]
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                return copy.deepcopy(job_state)
            # Good.
            if job_status[1] in ['Done', 'No info']:
                job_state['state'] = 'Postprocessing'
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                job_state['error'] = None
                job_state['mod_status_output'] = None
                # Run postprocessing asynchronously in its own process.
                p = Process(target=job_postprocess,
                            args=(job_id, job_state_file))
                p.start()
            elif job_status[1] == 'Running':
                job_state['state'] = 'Running'
                job_state['error'] = None
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                run_dir = job_state['run_dir']
                mod_status_output = _get_module_status_output(run_dir)
                job_state['mod_status_output'] = mod_status_output
            elif job_status[1] == 'Queued':
                job_state['state'] = 'Queued'
                job_state['error'] = None
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
        # Snapshot state before releasing the JobState context.
        job = copy.deepcopy(job_state)

    if job['state'] in ['Launch failed', 'Setting up launch']:
        return job

    # Build visible files.
    _logger.debug('job state: %s' % str(job))
    dir_args = (job['username'], job['mod_name'], job['mod_id'],
                job['run_name'])
    run_dir = os.path.join(pce_root, 'users/%s/%s_%d/%s' % dir_args)
    cfg_file = os.path.join(run_dir, 'config/onramp_metadata.cfg')
    try:
        conf = ConfigObj(cfg_file, file_error=True)
    except (IOError, SyntaxError):
        # Badly formed or non-existant config/onramp_metadata.cfg.
        _logger.debug('Bad metadata')
        _logger.debug(cfg_file)
        return job

    if 'onramp' in conf.keys() and 'visible' in conf['onramp'].keys():
        globs = conf['onramp']['visible']
        # NOTE: basestring means this module targets Python 2.
        if isinstance(globs, basestring):
            # Globs is only a single string. Convert to list.
            globs = [globs]
    else:
        globs = []

    ret_dir = os.getcwd()
    os.chdir(run_dir)
    # Expand the visibility globs relative to the run dir.
    filenames = [
        name for name in chain.from_iterable(
            map(glob.glob, globs)
        )
    ]
    prefix = os.path.join(pce_root, 'users') + '/'
    url_prefix = run_dir.split(prefix)[1]
    job['visible_files'] = [{
        'name': filename,
        'size': os.path.getsize(os.path.join(run_dir, filename)),
        'url': os.path.join('files', os.path.join(url_prefix, filename))
    } for filename in filenames]
    os.chdir(ret_dir)
    return job
def launch_job(job_id, mod_id, username, run_name, run_params):
    """Schedule job launch using system batch scheduler as configured in
    onramp_pce_config.cfg.

    Args:
        job_id (int): Unique identifier for job.
        mod_id (int): Id for OnRamp educational module to run in this job.
        username (str): Username of user running the job.
        run_name (str): Human-readable label for this job run.
        run_params: Run parameters to validate against the module's
            onramp_uioptions.cfgspec; falsy to skip.

    Returns:
        Tuple with 0th position being error code and 1st position being
        string indication of status.
    """
    # States from which a re-launch is permitted.
    accepted_states = ['Schedule failed', 'Launch failed',
                       'Preprocess failed']
    _logger.debug('PCE.tools.launch_job() called')

    # Initialize job state.
    with JobState(job_id) as job_state:
        if ('state' in job_state.keys()
                and job_state['state'] not in accepted_states):
            msg = 'Job launch already initiated'
            _logger.warn(msg)
            return (-1, msg)
        job_state['job_id'] = job_id
        job_state['mod_id'] = mod_id
        job_state['username'] = username
        job_state['run_name'] = run_name
        job_state['scheduler_job_num'] = None
        job_state['state'] = 'Setting up launch'
        job_state['error'] = None
        job_state['mod_status_output'] = None
        job_state['output'] = None
        job_state['visible_files'] = None
        job_state['mod_name'] = None
        job_state['_marked_for_del'] = False

        _logger.debug('Waiting on ModState at: %s' % time.time())
        with ModState(mod_id) as mod_state:
            _logger.debug('Done waiting on ModState at: %s' % time.time())
            if ('state' not in mod_state.keys()
                    or mod_state['state'] != 'Module ready'):
                msg = 'Module not ready'
                job_state['state'] = 'Launch failed'
                job_state['error'] = msg
                _logger.warn(msg)
                _logger.warn('mod_state: %s' % str(mod_state))
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    return (-2, 'Job %d deleted' % job_id)
                return (-1, 'Module not ready')
            job_state['mod_name'] = mod_state['mod_name']
            proj_loc = mod_state['installed_path']
            mod_name = mod_state['mod_name']

    _logger.debug('Testing project location')
    if not os.path.isdir(proj_loc):
        msg = 'Project location does not exist'
        _logger.error(msg)
        return (-1, msg)
    _logger.debug('Project location good')

    # Initialize dir structure.
    user_dir = os.path.join(os.path.join(pce_root, 'users'), username)
    user_mod_dir = os.path.join(user_dir, '%s_%d' % (mod_name, mod_id))
    run_dir = os.path.join(user_mod_dir, run_name)
    try:
        os.mkdir(user_dir)
    except OSError:
        # Thrown if dir already exists.
        pass
    try:
        os.mkdir(user_mod_dir)
    except OSError:
        # Thrown if dir already exists.
        pass

    # The way the following is setup, if a run_dir has already been setup
    # with this run_name, it will be used (that is, not overwritten) for
    # this launch.
    try:
        shutil.copytree(proj_loc, run_dir)
    except shutil.Error as e:
        pass

    if run_params:
        _logger.debug('Handling run_params')
        # Validate user-supplied params against the module's option spec.
        spec = os.path.join(run_dir, 'config/onramp_uioptions.cfgspec')
        params = ConfigObj(run_params, configspec=spec)
        result = params.validate(Validator())
        if result:
            with open(os.path.join(run_dir, 'onramp_runparams.cfg'),
                      'w') as f:
                params.write(f)
        else:
            msg = 'Runparams failed validation'
            _logger.warn(msg)
            return (-1, msg)

    ret_dir = os.getcwd()
    os.chdir(run_dir)

    # Preprocess.
    _logger.info('Calling bin/onramp_preprocess.py')
    with JobState(job_id) as job_state:
        job_state['state'] = 'Preprocessing'
        job_state['error'] = None
    try:
        # Run the module's preprocess hook under the PCE virtualenv python.
        result = check_output([os.path.join(pce_root, 'src/env/bin/python'),
                               'bin/onramp_preprocess.py'], stderr=STDOUT)
    except CalledProcessError as e:
        code = e.returncode
        # Normalize return codes above 127 to negative signal-style values.
        if code > 127:
            code -= 256
        result = e.output
        msg = ('Preprocess exited with return status %d and output: %s'
               % (code, result))
        with JobState(job_id) as job_state:
            job_state['state'] = 'Preprocess failed'
            job_state['error'] = msg
            _logger.error(msg)
            os.chdir(ret_dir)
            if job_state['_marked_for_del']:
                _delete_job(job_state)
                return (-2, 'Job %d deleted' % job_id)
        return (-1, msg)
    finally:
        # Log preprocess output whether it succeeded or failed.
        module_log(run_dir, 'preprocess', result)

    # Determine batch scheduler to user from config.
    cfg = ConfigObj(os.path.join(pce_root, 'bin', 'onramp_pce_config.cfg'),
                    configspec=os.path.join(pce_root, 'src', 'configspecs',
                                            'onramp_pce_config.cfgspec'))
    cfg.validate(Validator())
    scheduler = Scheduler(cfg['cluster']['batch_scheduler'])

    # Write batch script.
    with open('script.sh', 'w') as f:
        f.write(scheduler.get_batch_script(run_name))

    # Schedule job.
    result = scheduler.schedule(run_dir)
    if result['status_code'] != 0:
        _logger.error(result['msg'])
        with JobState(job_id) as job_state:
            job_state['state'] = 'Schedule failed'
            job_state['error'] = result['msg']
            os.chdir(ret_dir)
            if job_state['_marked_for_del']:
                _delete_job(job_state)
                return (-2, 'Job %d deleted' % job_id)
        return (result['returncode'], result['msg'])

    with JobState(job_id) as job_state:
        job_state['state'] = 'Scheduled'
        job_state['error'] = None
        job_state['scheduler_job_num'] = result['job_num']
        os.chdir(ret_dir)
        if job_state['_marked_for_del']:
            _delete_job(job_state)
            return (-2, 'Job %d deleted' % job_id)
    return (0, 'Job scheduled')
def _build_job(job_id, job_state_file=None):
    """Launch actions required to maintain job state and/or curate job
    results and return the state.

    NOTE(review): this appears to be a duplicate definition of _build_job
    (an earlier copy exists above in this file); at import time this later
    definition wins. Consider removing one of the two.

    When current job state (as a function of both PCE state tracking and
    scheduler output) warrants, initiate job postprocessing and/or status
    checking prior to building and returning state.

    Args:
        job_id (int): Id of the job to get state for.
        job_state_file (str): Optional explicit path to the job's state
            file; passed through to JobState.

    Returns:
        OnRamp formatted dictionary containing job attrs; {} when the job
        has no recorded state.
    """
    # States for which the batch scheduler must be polled for fresh status.
    status_check_states = ['Scheduled', 'Queued', 'Running']
    with JobState(job_id, job_state_file) as job_state:
        _logger.debug('Building at %s' % time.time())
        if 'state' not in job_state.keys():
            # Nothing recorded for this job yet.
            _logger.debug('No state at %s' % time.time())
            _logger.debug('job_state keys: %s' % job_state.keys())
            return {}
        if job_state['state'] in status_check_states:
            # Ask the configured batch scheduler for this job's status.
            specfile = os.path.join(pce_root, 'src', 'configspecs',
                                    'onramp_pce_config.cfgspec')
            cfg = ConfigObj(os.path.join(pce_root, 'bin',
                                         'onramp_pce_config.cfg'),
                            configspec=specfile)
            cfg.validate(Validator())
            scheduler = Scheduler(cfg['cluster']['batch_scheduler'])
            sched_job_num = job_state['scheduler_job_num']
            job_status = scheduler.check_status(sched_job_num)
            # Bad.
            if job_status[0] != 0:
                _logger.debug('Bad job status: %s' % job_status[1])
                job_state['state'] = 'Run failed'
                job_state['error'] = job_status[1]
                if job_status[0] != -2:
                    # Non-(-2) error codes carry a state string in
                    # job_status[1] — presumably scheduler-specific; verify
                    # against Scheduler.check_status.
                    job_state['state'] = job_status[1]
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                return copy.deepcopy(job_state)
            # Good.
            if job_status[1] in ['Done', 'No info']:
                job_state['state'] = 'Postprocessing'
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                job_state['error'] = None
                job_state['mod_status_output'] = None
                # Run postprocessing asynchronously in its own process.
                p = Process(target=job_postprocess,
                            args=(job_id, job_state_file))
                p.start()
            elif job_status[1] == 'Running':
                job_state['state'] = 'Running'
                job_state['error'] = None
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
                run_dir = job_state['run_dir']
                mod_status_output = _get_module_status_output(run_dir)
                job_state['mod_status_output'] = mod_status_output
            elif job_status[1] == 'Queued':
                job_state['state'] = 'Queued'
                job_state['error'] = None
                if job_state['_marked_for_del']:
                    _delete_job(job_state)
                    # FIXME: This might cause trouble. About to return {}.
                    return copy.deepcopy(job_state)
        # Snapshot state before releasing the JobState context.
        job = copy.deepcopy(job_state)

    if job['state'] in ['Launch failed', 'Setting up launch']:
        return job

    # Build visible files.
    _logger.debug('job state: %s' % str(job))
    dir_args = (job['username'], job['mod_name'], job['mod_id'],
                job['run_name'])
    run_dir = os.path.join(pce_root, 'users/%s/%s_%d/%s' % dir_args)
    cfg_file = os.path.join(run_dir, 'config/onramp_metadata.cfg')
    try:
        conf = ConfigObj(cfg_file, file_error=True)
    except (IOError, SyntaxError):
        # Badly formed or non-existant config/onramp_metadata.cfg.
        _logger.debug('Bad metadata')
        _logger.debug(cfg_file)
        return job

    if 'onramp' in conf.keys() and 'visible' in conf['onramp'].keys():
        globs = conf['onramp']['visible']
        # NOTE: basestring means this module targets Python 2.
        if isinstance(globs, basestring):
            # Globs is only a single string. Convert to list.
            globs = [globs]
    else:
        globs = []

    ret_dir = os.getcwd()
    os.chdir(run_dir)
    # Expand the visibility globs relative to the run dir.
    filenames = [name for name in chain.from_iterable(map(glob.glob, globs))]
    prefix = os.path.join(pce_root, 'users') + '/'
    url_prefix = run_dir.split(prefix)[1]
    job['visible_files'] = [{
        'name': filename,
        'size': os.path.getsize(os.path.join(run_dir, filename)),
        'url': os.path.join('files', os.path.join(url_prefix, filename))
    } for filename in filenames]
    os.chdir(ret_dir)
    return job
def job_run(job_id, job_state_file=None):
    """Submit the job's batch script to the configured batch scheduler.

    Loads optional 'np'/'nodes' run parameters from onramp_runparams.cfg in
    the job's run dir and forwards them to the batch-script generator.

    Args:
        job_id (int): Id of the job to run.
        job_state_file (str): Optional explicit path to the job's state
            file; passed through to JobState.

    Returns:
        Tuple with 0th position being error code and 1st position being
        string indication of status.
    """
    # Determine batch scheduler to user from config.
    cfg = ConfigObj(os.path.join(pce_root, 'bin', 'onramp_pce_config.cfg'),
                    configspec=os.path.join(pce_root, 'src', 'configspecs',
                                            'onramp_pce_config.cfgspec'))
    cfg.validate(Validator())
    scheduler = Scheduler(cfg['cluster']['batch_scheduler'])
    _logger.debug("in job_run: trying to launch using scheduler %s",
                  cfg['cluster']['batch_scheduler'])

    # BUGFIX: this assignment had been commented out, but os.chdir(ret_dir)
    # is executed on both the schedule-failure and success paths below —
    # without it either path raised NameError.
    ret_dir = os.getcwd()

    with JobState(job_id, job_state_file) as job_state:
        run_dir = job_state['run_dir']
        run_name = job_state['run_name']
    os.chdir(run_dir)

    # Load run params: optional np/nodes overrides for the batch script.
    run_np = None
    run_nodes = None
    run_cfg = ConfigObj('onramp_runparams.cfg')
    if 'onramp' in run_cfg.keys():
        if 'np' in run_cfg['onramp']:
            run_np = run_cfg['onramp']['np']
        if 'nodes' in run_cfg['onramp']:
            run_nodes = run_cfg['onramp']['nodes']
    # BUGFIX: was "%d"/"%d", which fails to format when the values are None
    # (param absent) or the strings ConfigObj returns; %s handles both.
    _logger.debug("in job_run: loaded params np: %s and nodes: %s",
                  run_np, run_nodes)

    # Write batch script, passing only the params that were supplied.
    with open('script.sh', 'w') as f:
        if run_np and run_nodes:
            f.write(scheduler.get_batch_script(run_name, numtasks=run_np,
                                               num_nodes=run_nodes))
        elif run_np:
            f.write(scheduler.get_batch_script(run_name, numtasks=run_np))
        elif run_nodes:
            f.write(scheduler.get_batch_script(run_name,
                                               num_nodes=run_nodes))
        else:
            f.write(scheduler.get_batch_script(run_name))

    # Schedule job.
    result = scheduler.schedule(run_dir)
    if result['status_code'] != 0:
        _logger.error(result['msg'])
        with JobState(job_id, job_state_file) as job_state:
            job_state['state'] = 'Schedule failed'
            job_state['error'] = result['msg']
            os.chdir(ret_dir)
            # Honor a deletion request that arrived while we were working.
            if job_state['_marked_for_del']:
                _delete_job(job_state)
                return (-2, 'Job %d deleted' % job_id)
        return (result['returncode'], result['msg'])

    with JobState(job_id, job_state_file) as job_state:
        job_state['state'] = 'Scheduled'
        job_state['error'] = None
        job_state['scheduler_job_num'] = result['job_num']
        os.chdir(ret_dir)
        # Honor a deletion request that arrived while we were working.
        if job_state['_marked_for_del']:
            _delete_job(job_state)
            return (-2, 'Job %d deleted' % job_id)
    return (0, 'Job scheduled')