def retrieve_jobs(self, gc_id_jobnum_list):  # Process output sandboxes returned by getJobsOutput
	jobnum_list_retrieved = []

	for jobnum_input, output_dn in self._get_jobs_output(gc_id_jobnum_list):
		# jobnum_input != None, output_dn == None => Job could not be retrieved
		if output_dn is None:
			if jobnum_input not in jobnum_list_retrieved:
				yield (jobnum_input, -1, {}, None)
			continue

		# jobnum_input == None, output_dn != None => Found leftovers of job retrieval
		if jobnum_input is None:
			continue

		# jobnum_input != None, output_dn != None => Job retrieval from WMS was ok
		job_fn = os.path.join(output_dn, 'job.info')
		retrieve_result = self._parse_job_info_file(jobnum_input,
			job_fn, output_dn, jobnum_list_retrieved)
		if retrieve_result is not None:
			yield retrieve_result
			continue

		# Clean empty output_dns
		for sub_dn in imap(lambda x: x[0], os.walk(output_dn, topdown=False)):
			ignore_exception(Exception, None, os.rmdir, sub_dn)

		if os.path.exists(output_dn):
			# Preserve failed job
			ensure_dir_exists(self._path_fail, 'failed output directory')
			_force_move(self._log, output_dn,
				os.path.join(self._path_fail, os.path.basename(output_dn)))

		yield (jobnum_input, -1, {}, None)
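# Hedged sketch: the snippet above calls a module-level _force_move(log, source_dn, target_dn)
# helper that is not shown in this section. Modeled on the nested _force_move variant in the
# last snippet, it is assumed to replace an existing target directory, log failures via the
# given logger, and return a boolean instead of raising. Names and body are illustrative.
import os
import shutil


def _force_move(log, source_dn, target_dn):
	# Remove a stale target directory, then move the job output into place;
	# return False after logging so callers can mark the job as failed.
	try:
		if os.path.exists(target_dn):
			shutil.rmtree(target_dn)
	except OSError:
		log.exception('%r cannot be removed', target_dn)
		return False
	try:
		shutil.move(source_dn, target_dn)
	except OSError:
		log.exception('Error moving job output directory from %r to %r', source_dn, target_dn)
		return False
	return True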
def _read_jobs(self, job_limit):
	ensure_dir_exists(self._path_db, 'job database directory', JobError)

	candidates = []
	for job_fn in fnmatch.filter(os.listdir(self._path_db), 'job_*.txt'):
		try:  # 2xsplit is faster than regex
			jobnum = int(job_fn.split(".")[0].split("_")[1])
		except Exception:
			clear_current_exception()
			continue
		candidates.append((jobnum, job_fn))

	(job_map, max_job_len) = ({}, len(candidates))
	activity = Activity('Reading job infos')
	idx = 0
	for (jobnum, job_fn) in sorted(candidates):
		idx += 1
		if jobnum >= job_limit >= 0:
			self._log.info('Stopped reading job infos at job #%d out of %d available job files, ' +
				'since the limit of %d jobs is reached', jobnum, len(candidates), job_limit)
			break
		try:
			job_fn_full = os.path.join(self._path_db, job_fn)
			data = self._fmt.parse(SafeFile(job_fn_full).iter_close())
			job_obj = self._create_job_obj(job_fn_full, data)
		except Exception:
			raise JobError('Unable to process job file %r' % job_fn_full)
		job_map[jobnum] = job_obj
		activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
	activity.finish()
	return job_map
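# Illustration of the '2xsplit is faster than regex' comment above: for file names like
# 'job_42.txt' the job number is recovered with two str.split calls. The regex alternative
# shown for comparison is an assumption about what the comment refers to.
import re

job_fn = 'job_42.txt'
jobnum_via_split = int(job_fn.split('.')[0].split('_')[1])
jobnum_via_regex = int(re.match(r'job_(\d+)\.txt$', job_fn).group(1))
assert jobnum_via_split == jobnum_via_regex == 42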
def __init__(self, config, datasource_name, repository, keep_old=True):
	BaseDataParameterSource.__init__(self, config, datasource_name, repository)

	# hide provider property set by __new__
	self._provider = self.provider
	del self.provider

	if self._provider.need_init_query():
		self._provider.get_block_list_cached(show_stats=False)

	data_src_text = 'Dataset source %r' % datasource_name
	# Select dataset refresh rate
	data_refresh = config.get_time('%s refresh' % datasource_name, -1, on_change=None)
	if data_refresh >= 0:
		data_refresh = max(data_refresh, self._provider.get_query_interval())
		self._log.info('%s will be queried every %s', data_src_text, str_time_long(data_refresh))
	self.setup_resync(interval=data_refresh, force=config.get_state('resync', detail='datasets'))

	splitter_name = config.get('%s splitter' % datasource_name, 'FileBoundarySplitter')
	splitter_cls = self._provider.check_splitter(DataSplitter.get_class(splitter_name))
	self._splitter = splitter_cls(config, datasource_name)

	# Settings:
	(self._dn, self._keep_old) = (config.get_work_path(), keep_old)
	ensure_dir_exists(self._dn, 'partition map directory', DatasetError)
	self._set_reader(self._init_reader())

	if not self.get_parameter_len():
		if data_refresh < 0:
			raise UserError('%s does not provide jobs to process' % data_src_text)
		self._log.warning('%s does not provide jobs to process', data_src_text)
def deploy_task(self, task, transfer_se, transfer_sb):
	# HACK
	self._output_fn_list = lmap(lambda d_s_t: d_s_t[2], self._get_out_transfer_info_list(task))
	task.validate_variables()
	# add task SE files to SM
	self._sm_se_in.add_file_list(lmap(lambda d_s_t: d_s_t[2], task.get_se_in_fn_list()))
	# Transfer common SE files
	if transfer_se:
		self._sm_se_in.do_transfer(task.get_se_in_fn_list())

	def _convert(fn_list):
		for fn in fn_list:
			if isinstance(fn, str):
				yield (fn, os.path.basename(fn))
			else:
				yield (fn, os.path.basename(fn.name))

	# Package sandbox tar file
	self._log.log(logging.INFO1, 'Packing sandbox')
	sandbox = self._get_sandbox_name(task)
	ensure_dir_exists(os.path.dirname(sandbox), 'sandbox directory')
	if not os.path.exists(sandbox) or transfer_sb:
		sandbox_file_list = self._get_sandbox_file_list(task, [self._sm_se_in, self._sm_se_out])
		create_tarball(_convert(sandbox_file_list), name=sandbox)
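# Illustration of the nested _convert helper above: a sandbox entry can be a plain path
# string or a file-like object exposing a .name attribute; either way the tarball entry is
# the original object paired with its basename. _FakeFile is only a stand-in for this demo.
import os


class _FakeFile(object):
	name = '/tmp/task/config.dat'  # hypothetical path carried by a file-like sandbox entry


for fn in ['/tmp/task/run.sh', _FakeFile()]:
	base = os.path.basename(fn if isinstance(fn, str) else fn.name)
	print(base)  # 'run.sh', then 'config.dat'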
def __init__(self, config, source):
	self._psrc_raw = source
	BasicParameterAdapter.__init__(self, config, source)
	self._map_jobnum2pnum = {}
	ensure_dir_exists(config.get_work_path(), 'parameter storage directory', ParameterError)
	self._path_jobnum2pnum = config.get_work_path('params.map.gz')
	self._path_params = config.get_work_path('params.dat.gz')

	# Find out if init should be performed - overrides resync_requested!
	init_requested = config.get_state('init', detail='parameters')
	init_needed = False
	if not (os.path.exists(self._path_params) and os.path.exists(self._path_jobnum2pnum)):
		init_needed = True  # Init needed if no parameter log exists
	if init_requested and not init_needed and (source.get_parameter_len() is not None):
		self._log.warning('Re-Initialization will overwrite the current mapping ' +
			'between jobs and parameter/dataset content! This can lead to invalid results!')
		user_msg = ('Do you want to perform a synchronization between ' +
			'the current mapping and the new one to avoid this?')
		if UserInputInterface().prompt_bool(user_msg, True):
			init_requested = False
	do_init = init_requested or init_needed

	# Find out if resync should be performed
	resync_by_user = config.get_state('resync', detail='parameters')
	config.set_state(False, 'resync', detail='parameters')
	psrc_hash = self._psrc_raw.get_psrc_hash()
	self._psrc_hash_stored = config.get('parameter hash', psrc_hash, persistent=True)
	psrc_hash_changed = self._psrc_hash_stored != psrc_hash  # Resync if parameters have changed
	resync_by_psrc = self._psrc_raw.get_resync_request()

	if do_init:  # Write current state
		self._write_jobnum2pnum(self._path_jobnum2pnum)
		ParameterSource.get_class('GCDumpParameterSource').write(self._path_params,
			self.get_job_len(), self.get_job_metadata(), self.iter_jobs())
	elif resync_by_user or resync_by_psrc or psrc_hash_changed:  # Perform sync
		if psrc_hash_changed:
			self._log.info('Parameter hash has changed')
			self._log.debug('\told hash: %s', self._psrc_hash_stored)
			self._log.debug('\tnew hash: %s', psrc_hash)
			self._log.log(logging.DEBUG1, '\tnew src: %s', self._psrc_raw)
			config.set_state(True, 'init', detail='config')
		elif resync_by_psrc:
			self._log.info('Parameter source requested resync')
			self._log.debug('\t%r', str.join(', ', imap(repr, resync_by_psrc)))
		elif resync_by_user:
			self._log.info('User requested resync')
		self._psrc_hash_stored = None
		self._resync_state = self.resync(force=True)
	else:  # Reuse old mapping
		activity = Activity('Loading cached parameter information')
		self._read_jobnum2pnum()
		activity.finish()
		return  # do not set parameter hash in config
	config.set('parameter hash', self._psrc_raw.get_psrc_hash())
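# Minimal sketch of the decision logic above, with illustrative names only (the real code
# also lets the user cancel a requested re-initialization interactively): re-initialize when
# no stored state exists or init was requested, otherwise resync when the stored parameter
# hash differs from the current one or a resync was requested, otherwise reuse the mapping.
def decide_action(state_exists, init_requested, resync_requested, stored_hash, current_hash):
	if init_requested or not state_exists:
		return 'init'
	if resync_requested or stored_hash != current_hash:
		return 'resync'
	return 'reuse'


assert decide_action(True, False, False, 'abc', 'abc') == 'reuse'
assert decide_action(True, False, False, 'abc', 'def') == 'resync'
assert decide_action(False, False, False, None, 'abc') == 'init'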
def _setup_work_path(config):
	# Check work dir validity (default work directory is the config file name)
	if not os.path.exists(config.get_work_path()):
		if not config.get_state('init'):
			log = logging.getLogger('workflow')
			log.warning('Starting initialization of %s!', config.get_work_path())
			config.set_state(True, 'init')
		work_dn_create_msg = 'Do you want to create the working directory %s?'
		if config.get_choice_yes_no('workdir create', True,
				interactive_msg=work_dn_create_msg % config.get_work_path()):
			ensure_dir_exists(config.get_work_path(), 'work directory')
def _get_jobs_output(self, gc_id_jobnum_list):
	# Get output of jobs and yield output dirs
	if len(gc_id_jobnum_list) == 0:
		return  # plain return instead of 'raise StopIteration' (PEP 479)

	root_dn = os.path.join(self._path_output, 'tmp')
	try:
		if len(gc_id_jobnum_list) == 1:
			# For single jobs create single subdir
			tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
		else:
			tmp_dn = root_dn
		ensure_dir_exists(tmp_dn)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

	map_gc_id2jobnum = dict(gc_id_jobnum_list)
	jobs = self._write_wms_id_list(gc_id_jobnum_list)

	activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
	proc = LocalProcess(self._output_exec, '--noint',
		'--logfile', '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

	# yield output dirs
	todo = list(map_gc_id2jobnum.values())  # explicit copy - dict views cannot be modified
	current_jobnum = None
	for line in imap(str.strip, proc.stdout.iter(timeout=60)):
		if line.startswith(tmp_dn):
			todo.remove(current_jobnum)
			output_dn = line.strip()
			unpack_wildcard_tar(self._log, output_dn)
			yield (current_jobnum, output_dn)
			current_jobnum = None
		else:
			current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
	exit_code = proc.status(timeout=0, terminate=True)
	activity.finish()

	if exit_code != 0:
		if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
			remove_files([jobs, root_dn])
			return  # stop the generator without raising StopIteration
		else:
			self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
		self._log.error('Trying to recover from error ...')
		for dn in os.listdir(root_dn):
			yield (None, os.path.join(root_dn, dn))

	# return unretrievable jobs
	for jobnum in todo:
		yield (jobnum, None)
	remove_files([jobs, tmp_dn])
def freeze(self, write_config=True, show_unused=True, raise_on_change=True):
	# Inform the user about unused options
	def _match_unused_entries(entry):
		return ('!' not in entry.section) and not entry.accessed
	self._container_cur.protect(raise_on_change)
	if show_unused:
		unused = lfilter(_match_unused_entries, self._view.iter_entries())
		log = logging.getLogger('config.freeze')
		if unused:
			log.log(logging.INFO1, 'There are %s unused config options!', len(unused))
		for entry in unused:
			log.log(logging.INFO1, '\t%s', entry.format(print_section=True))
	if write_config or not os.path.exists(self._config_path_old):
		ensure_dir_exists(os.path.dirname(self._config_path_old),
			'config storage directory', ConfigError)
		# Write user friendly, flat config file and config file with saved settings
		self._write_file(self._config_path_min, print_minimal=True,
			print_default=False, print_workdir=True, print_unused=False)
		self._write_file(self._config_path_old, print_minimal=True, print_default=True,
			print_source=True, print_unused=True,
			msg='; ==> DO NOT EDIT THIS FILE! <==\n; This file is used to find config changes!\n')
def __init__(self, config, name, check_executor, cancel_executor):
	WMS.__init__(self, config, name)
	for executor in [check_executor, cancel_executor]:
		executor.setup(self._log)
	(self._check_executor, self._cancel_executor) = (check_executor, cancel_executor)

	if self._name != self.__class__.__name__.upper():
		self._log.info('Using batch system: %s (%s)', self.__class__.__name__, self._name)
	else:
		self._log.info('Using batch system: %s', self._name)

	self._runlib = config.get_work_path('gc-run.lib')
	fp = SafeFile(self._runlib, 'w')
	content = SafeFile(get_path_share('gc-run.lib')).read()
	fp.write(content.replace('__GC_VERSION__', __import__('grid_control').__version__))
	fp.close()
	self._path_output = config.get_work_path('output')
	self._path_file_cache = config.get_work_path('files')
	ensure_dir_exists(self._path_output, 'output directory')
	self._path_fail = config.get_work_path('fail')

	# Initialise access token and storage managers
	# UI -> SE -> WN
	self._sm_se_in = config.get_plugin('se input manager', 'SEStorageManager',
		cls=StorageManager, bind_kwargs={'tags': [self]}, pargs=('se', 'se input', 'SE_INPUT'))
	self._sm_sb_in = config.get_plugin('sb input manager', 'LocalSBStorageManager',
		cls=StorageManager, bind_kwargs={'tags': [self]}, pargs=('sandbox', 'sandbox', 'SB_INPUT'))
	# UI <- SE <- WN
	self._sm_se_out = config.get_plugin('se output manager', 'SEStorageManager',
		cls=StorageManager, bind_kwargs={'tags': [self]}, pargs=('se', 'se output', 'SE_OUTPUT'))
	self._sm_sb_out = None

	self._token = config.get_composited_plugin(['proxy', 'access token'], 'TrivialAccessToken',
		'MultiAccessToken', cls=AccessToken, bind_kwargs={'inherit': True, 'tags': [self]})
	self._output_fn_list = None
def _get_jobs_output(self, gc_id_jobnum_list):
	# Get output of jobs and yield output dirs
	if len(gc_id_jobnum_list) == 0:
		return  # plain return instead of 'raise StopIteration' (PEP 479)

	tmp_dn = os.path.join(self._path_output, 'tmp')
	try:
		if len(gc_id_jobnum_list) == 1:
			# For single jobs create single subdir
			tmp_dn = os.path.join(tmp_dn, md5_hex(gc_id_jobnum_list[0][0]))
		ensure_dir_exists(tmp_dn)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

	map_gc_id2jobnum = dict(gc_id_jobnum_list)
	jobnum_list_todo = list(map_gc_id2jobnum.values())
	wms_id_list_done = []
	activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
	chunk_pos_iter = irange(0, len(gc_id_jobnum_list), self._chunk_size)
	for ids in imap(lambda x: gc_id_jobnum_list[x:x + self._chunk_size], chunk_pos_iter):
		for (current_jobnum, output_dn) in self.get_jobs_output_chunk(tmp_dn, ids, wms_id_list_done):
			unpack_wildcard_tar(self._log, output_dn)
			jobnum_list_todo.remove(current_jobnum)
			yield (current_jobnum, output_dn)
	activity.finish()

	# return unretrievable jobs
	for jobnum in jobnum_list_todo:
		yield (jobnum, None)
	self._purge_done_jobs(wms_id_list_done)
	remove_files([tmp_dn])
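# Minimal sketch of the chunking pattern above (irange/imap are the python_compat spellings
# of range/map): the gc_id/jobnum pairs are processed in slices of at most self._chunk_size
# elements so each external retrieval call only has to handle a bounded batch.
def iter_chunks(items, chunk_size):
	# Yield consecutive slices of at most chunk_size elements.
	for pos in range(0, len(items), chunk_size):
		yield items[pos:pos + chunk_size]


print(list(iter_chunks(list('abcdefg'), 3)))  # [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]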
def _ssh_link_secure(ssh_link_fn, init_dn):
	ssh_link_dn = ensure_dir_exists(os.path.dirname(ssh_link_fn), 'SSH link directory', BackendError)
	if ssh_link_dn != os.path.dirname(os.path.expanduser('~/.ssh/')):
		try:
			os.chmod(ssh_link_dn, stat.S_IRWXU)
		except Exception:
			raise BackendError('Could not secure directory for SSHLink %s' % ssh_link_dn)
	if init_dn:
		return
	if os.path.exists(ssh_link_fn):
		if not stat.S_ISSOCK(os.stat(ssh_link_fn).st_mode):
			raise BackendError('Non-socket object already exists for SSHLink %s' % ssh_link_fn)
		try:
			os.chmod(ssh_link_fn, stat.S_IRWXU)
		except Exception:
			raise BackendError('Could not secure SSHLink %s' % ssh_link_fn)
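# Small usage note for the permission handling above: stat.S_IRWXU is the owner-only rwx
# mask (0o700), so both the directory holding the SSH control socket and the socket file
# itself end up private to the current user; S_ISSOCK distinguishes a unix socket from
# other filesystem objects. The example path below is only for demonstration.
import os
import stat

assert stat.S_IRWXU == 0o700  # owner read/write/execute only
print(stat.S_ISSOCK(os.stat(os.getcwd()).st_mode))  # False - the working directory is not a socket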
def _get_sandbox_dn(self, jobnum=''):
	# return path to sandbox for a specific job or basepath
	sandpath = os.path.join(self._sandbox_dn, str(jobnum), '')
	return ensure_dir_exists(sandpath, 'sandbox directory', BackendError)
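# Hedged sketch of the ensure_dir_exists helper that every snippet in this section relies on.
# Its real implementation is not shown here; from the call sites it is assumed to create the
# directory if needed, raise the supplied exception class with the given description on
# failure, and return the path so callers can chain it. Defaults below are assumptions.
import os


def ensure_dir_exists(dn, name='directory', exception_cls=OSError):
	# Create dn (including parents) if it does not exist yet and hand the path back.
	if not os.path.isdir(dn):
		try:
			os.makedirs(dn)
		except OSError:
			raise exception_cls('Problem creating %s "%s"' % (name, dn))
	return dn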
def __init__(self, config):
	self._cache = []
	self._path = config.get_path('sandbox path', config.get_work_path('sandbox'), must_exist=False)
	ensure_dir_exists(self._path, 'sandbox base', BackendError)
def getSandboxPath(self, subdirToken=""):
	sandpath = os.path.join(self._sandboxDir, str(subdirToken), '')
	return ensure_dir_exists(sandpath, 'sandbox directory', BackendError)
def retrieve_jobs(self, gc_id_jobnum_list):  # Process output sandboxes returned by getJobsOutput
	# Function to force moving a directory
	def _force_move(source, target):
		try:
			if os.path.exists(target):
				shutil.rmtree(target)
		except IOError:
			self._log.exception('%r cannot be removed', target)
			clear_current_exception()
			return False
		try:
			shutil.move(source, target)
		except IOError:
			self._log.exception('Error moving job output directory from %r to %r', source, target)
			clear_current_exception()
			return False
		return True

	jobnum_list_retrieved = []

	for jobnum_input, output_dn in self._get_jobs_output(gc_id_jobnum_list):
		# jobnum_input != None, output_dn == None => Job could not be retrieved
		if output_dn is None:
			if jobnum_input not in jobnum_list_retrieved:
				yield (jobnum_input, -1, {}, None)
			continue

		# jobnum_input == None, output_dn != None => Found leftovers of job retrieval
		if jobnum_input is None:
			continue

		# jobnum_input != None, output_dn != None => Job retrieval from WMS was ok
		job_fn = os.path.join(output_dn, 'job.info')
		job_info = ignore_exception(Exception, None, self._job_parser.process, output_dn)
		if job_info is None:
			self._log.exception('Unable to parse job.info')
		if job_info:
			jobnum = job_info[JobResult.JOBNUM]
			if jobnum != jobnum_input:
				raise BackendError('Invalid job id in job file %s' % job_fn)
			if _force_move(output_dn, os.path.join(self._path_output, 'job_%d' % jobnum)):
				jobnum_list_retrieved.append(jobnum_input)
				yield (jobnum, job_info[JobResult.EXITCODE], job_info[JobResult.RAW], output_dn)
			else:
				yield (jobnum, -1, {}, None)
			continue

		# Clean empty output_dns
		for sub_dn in imap(lambda x: x[0], os.walk(output_dn, topdown=False)):
			ignore_exception(Exception, None, os.rmdir, sub_dn)

		if os.path.exists(output_dn):
			# Preserve failed job
			ensure_dir_exists(self._path_fail, 'failed output directory')
			_force_move(output_dn, os.path.join(self._path_fail, os.path.basename(output_dn)))

		yield (jobnum_input, -1, {}, None)