def _getBlocksInternal(self):
	# Split files into blocks/datasets via key functions and determine metadata intersection
	(protoBlocks, commonDS, commonB) = ({}, {}, {})

	def getActiveKeys(kUser, kGuard, gIdx):
		return kUser + (kGuard or lchain(imap(lambda x: x.getGuards()[gIdx], self._scanner)))
	keysDS = getActiveKeys(self._ds_keys_user, self._ds_keys_guard, 0)
	keysB = getActiveKeys(self._b_keys_user, self._b_keys_guard, 1)
	for fileInfo in ifilter(itemgetter(0), self._collectFiles()):
		hashDS = self._generateKey(keysDS,
			md5_hex(repr(self._datasetExpr)) + md5_hex(repr(self._datasetNick)), *fileInfo)
		hashB = self._generateKey(keysB, hashDS + md5_hex(repr(fileInfo[3])), *fileInfo)  # [3] == SE list
		if not self._ds_select or (hashDS in self._ds_select):
			if not self._b_select or (hashB in self._b_select):
				fileInfo[1].update({'DS_KEY': hashDS, 'BLOCK_KEY': hashB})
				protoBlocks.setdefault(hashDS, {}).setdefault(hashB, []).append(fileInfo)
				utils.intersectDict(commonDS.setdefault(hashDS, dict(fileInfo[1])), fileInfo[1])
				utils.intersectDict(commonB.setdefault(hashDS, {}).setdefault(hashB, dict(fileInfo[1])), fileInfo[1])

	# Generate names for blocks/datasets using common metadata
	(hashNameDictDS, hashNameDictB) = ({}, {})
	for hashDS in protoBlocks:
		hashNameDictDS[hashDS] = self._generateDatasetName(hashDS, commonDS[hashDS])
		for hashB in protoBlocks[hashDS]:
			hashNameDictB[hashB] = (hashDS, self._generateBlockName(hashB, commonB[hashDS][hashB]))

	self._findCollision('dataset', hashNameDictDS, commonDS, keysDS, lambda name, key: [key])
	self._findCollision('block', hashNameDictB, commonB, keysDS + keysB,
		lambda name, key: [name[0], key], lambda name: name[1])

	for block in self._buildBlocks(protoBlocks, hashNameDictDS, hashNameDictB):
		yield block
def makeEnum(members = None, cls = None, useHash = True):
	members = members or []
	if cls:
		enumID = md5_hex(str(members) + '!' + cls.__name__)[:4]
	else:
		enumID = md5_hex(str(members))[:4]
		cls = type('Enum_%s_%s' % (enumID, str.join('_', members)), (), {})

	def getValue(idx, name):
		if useHash:
			return idx + int(enumID, 16)
		else:
			return idx
	values = lsmap(getValue, enumerate(members))

	cls.enumNames = members
	cls.enumValues = values
	enumMapNV = dict(izip(imap(str.lower, cls.enumNames), cls.enumValues))
	enumMapVN = dict(izip(cls.enumValues, cls.enumNames))
	if len(enumMapNV) != len(enumMapVN):
		raise APIError('Invalid enum definition!')

	def str2enum(cls, value, *args):
		return enumMapNV.get(value.lower(), *args)
	cls.enum2str = enumMapVN.get
	cls.str2enum = classmethod(str2enum)
	for name, value in izip(cls.enumNames, cls.enumValues):
		setattr(cls, name, value)
	return cls
def make_enum(enum_name_list=None, cls=None, use_hash=True, register=True):
	enum_name_list = enum_name_list or []
	if cls:
		enum_id = md5_hex(str(enum_name_list) + '!' + cls.__name__)[:4]
	else:
		enum_id = md5_hex(str(enum_name_list))[:4]
		cls = type('Enum_%s_%s' % (enum_id, str.join('_', enum_name_list)), (), {})

	def _intstr2enum(cls, value, default=unspecified):
		enum = ignore_exception(Exception, default, int, value)
		if enum not in cls.enum_value_list:
			allowed_str = str.join(', ', imap(lambda nv: '%s=%s' % nv, _map_name2value.items()))
			raise Exception('Invalid enum value %s (allowed are %r)' % (repr(value), allowed_str))
		return enum

	def _register_enum(cls, name):
		value = len(cls.enum_name_list)
		if use_hash:
			value += int(enum_id, 16)
		for enum_cls in make_enum.enum_list:
			if use_hash and (value in enum_cls.enum_value_list) and (enum_cls.enum_id != enum_id):
				raise APIError('enum value collision detected!')
		cls.enum_name_list.append(name)
		cls.enum_value_list.append(value)
		setattr(cls, name, value)
		_map_name2value[name.lower()] = value
		_map_value2name[value] = name
		if len(_map_name2value) != len(_map_value2name):
			raise APIError('Invalid enum definition! (%s:%s)' % (_map_name2value, _map_value2name))

	def _str2enum(cls, value, *args):
		lookup_fun = _map_name2value.__getitem__
		if args:
			lookup_fun = _map_name2value.get
		try:
			return lookup_fun(value.lower(), *args)
		except Exception:
			allowed_str = str.join(', ', cls.enum_name_list)
			raise Exception('Invalid enum string %s (allowed are %r)' % (repr(value), allowed_str))

	_map_value2name = {}
	_map_name2value = {}
	cls.enum_id = enum_id
	cls.enum_name_list = []
	cls.enum_value_list = []
	cls.enum2str = _map_value2name.get
	cls.str2enum = classmethod(_str2enum)
	cls.intstr2enum = classmethod(_intstr2enum)
	cls.register_enum = classmethod(_register_enum)
	for enum_name in enum_name_list:
		cls.register_enum(enum_name)
	if register:
		make_enum.enum_list.append(cls)
	return cls
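# Hypothetical usage sketch for make_enum. The shims below are minimal
# stand-ins (assumptions) for the grid-control imports the function relies on
# (md5_hex, imap, APIError, unspecified, ignore_exception); the real module
# also initializes the make_enum.enum_list registry once at import time.
import hashlib
md5_hex = lambda value: hashlib.md5(value.encode('utf-8')).hexdigest()
imap = map
APIError = type('APIError', (Exception,), {})
unspecified = object()

def ignore_exception(exception_cls, default, fun, *args):
	try:
		return fun(*args)
	except exception_cls:
		return default

make_enum.enum_list = []
JobState = make_enum(['INIT', 'RUNNING', 'DONE'])  # made-up enum for illustration
assert JobState.str2enum('running') == JobState.RUNNING
assert JobState.enum2str(JobState.DONE) == 'DONE'
assert JobState.intstr2enum(str(JobState.INIT)) == JobState.INIT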
def __init__(self, config, name):
	self._sandbox_helper = SandboxHelper(config)
	self._error_log_fn = config.get_work_path('error.tar')
	cancel_executor = CancelAndPurgeJobs(config, CondorCancelJobs(config),
		LocalPurgeJobs(config, self._sandbox_helper))
	BasicWMS.__init__(self, config, name,
		check_executor=CheckJobsMissingState(config, CondorCheckJobs(config)),
		cancel_executor=cancel_executor)
	self._task_id = config.get('task id', md5_hex(str(time.time())), persistent=True)  # FIXME
	# finalize config state by reading values or setting to defaults
	# load keys for condor pool ClassAds
	self._jdl_writer = CondorJDLWriter(config)
	self._universe = config.get('universe', 'vanilla', on_change=None)
	self._pool_req_dict = config.get_dict('poolArgs req', {})[0]
	self._pool_work_dn = None
	self._proc_factory = None
	(self._submit_exec, self._transfer_exec) = (None, None)
	# prepare interfaces for local/remote/ssh pool access
	self._remote_type = config.get_enum('remote Type', PoolType, PoolType.LOCAL)
	self._init_pool_interface(config)
	# Sandbox base path where individual job data is stored, staged and returned to
	self._sandbox_dn = config.get_path('sandbox path', config.get_work_path('sandbox'), must_exist=False)
	# broker for selecting sites - FIXME: this looks wrong... pool != site
	self._pool_host_list = config.get_list(['poolhostlist', 'pool host list'], [])
	self._broker_site = config.get_plugin('site broker', 'UserBroker', cls=Broker,
		bind_kwargs={'tags': [self]}, pargs=('sites', 'sites', lambda: self._pool_host_list))
	self._wall_time_mode = config.get_enum('wall time mode', WallTimeMode, WallTimeMode.ignore,
		subset=[WallTimeMode.hard, WallTimeMode.ignore])
	self._blacklist_nodes = config.get_list(['blacklist nodes'], [], on_change=None)
	self._user_requirements = config.get('user requirements', '', on_change=None)
def getHash(self):
	buffer = StringBuffer()
	for _ in DataProvider.saveToStream(buffer, self._datasetProcessor.process(self.getBlocksNormed())):
		pass
	return md5_hex(buffer.getvalue())
def _get_jobs_output(self, gc_id_jobnum_list):
	# Get output of jobs and yield output dirs
	if len(gc_id_jobnum_list) == 0:
		raise StopIteration
	root_dn = os.path.join(self._path_output, 'tmp')
	try:
		if len(gc_id_jobnum_list) == 1:
			# For single jobs create single subdir
			tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
		else:
			tmp_dn = root_dn
		ensure_dir_exists(tmp_dn)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

	map_gc_id2jobnum = dict(gc_id_jobnum_list)
	jobs = self._write_wms_id_list(gc_id_jobnum_list)

	activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
	proc = LocalProcess(self._output_exec, '--noint', '--logfile', '/dev/stderr',
		'-i', jobs, '--dir', tmp_dn)

	# yield output dirs
	todo = list(map_gc_id2jobnum.values())  # materialize so remove() also works on py3 dict views
	current_jobnum = None
	for line in imap(str.strip, proc.stdout.iter(timeout=60)):
		if line.startswith(tmp_dn):
			todo.remove(current_jobnum)
			output_dn = line.strip()
			unpack_wildcard_tar(self._log, output_dn)
			yield (current_jobnum, output_dn)
			current_jobnum = None
		else:
			current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
	exit_code = proc.status(timeout=0, terminate=True)
	activity.finish()

	if exit_code != 0:
		if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
			remove_files([jobs, root_dn])
			raise StopIteration
		else:
			self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
		self._log.error('Trying to recover from error ...')
		for dn in os.listdir(root_dn):
			yield (None, os.path.join(root_dn, dn))

	# return unretrievable jobs
	for jobnum in todo:
		yield (jobnum, None)

	remove_files([jobs, tmp_dn])
def _assign_dataset_block(self, map_key2fm_list, map_key2metadata_dict, file_metadata_iter):
	# Split files into blocks/datasets via key functions and determine metadata intersection
	for (url, metadata_dict, entries, location_list, obj_dict) in file_metadata_iter:
		# Dataset hash always includes dataset expr and nickname override
		hash_dataset = self._get_hash(self._hash_input_set_dataset, metadata_dict,
			md5_hex(repr(self._dataset_expr)) + md5_hex(repr(self._dataset_nick_override)))
		# Block hash always includes the dataset hash and location list
		hash_block = self._get_hash(self._hash_input_set_block, metadata_dict,
			hash_dataset + md5_hex(repr(location_list)))
		if not self._selected_hash_list_dataset or (hash_dataset in self._selected_hash_list_dataset):
			if not self._selected_hash_list_block or (hash_block in self._selected_hash_list_block):
				metadata_dict.update({'DS_KEY': hash_dataset, 'BLOCK_KEY': hash_block})
				self._assign_dataset_block_selected(map_key2fm_list, map_key2metadata_dict,
					(url, metadata_dict, entries, location_list, obj_dict),
					hash_dataset, hash_block, metadata_dict)
def process_block(self, block):
	# Check uniqueness of URLs
	url_hash_list = []
	if self._check_url != DatasetUniqueMode.ignore:
		block[DataProvider.FileList] = list(self._process_fi_list(url_hash_list,
			block[DataProvider.FileList]))
		url_hash_list.sort()
	# Check uniqueness of blocks
	if self._check_block != DatasetUniqueMode.ignore:
		block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
			url_hash_list, block[DataProvider.NEntries], block[DataProvider.Locations],
			block.get(DataProvider.Metadata))))
		if block_hash in self._recorded_block:
			msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(block)
			msg += ' (This check can be configured with %r)' % 'dataset check unique block'
			if self._check_block == DatasetUniqueMode.warn:
				self._log.warning(msg)
			elif self._check_block == DatasetUniqueMode.abort:
				raise DatasetError(msg)
			elif self._check_block == DatasetUniqueMode.skip:
				return None
		self._recorded_block.add(block_hash)
	return block
def collapse_psp_list(psp_list, tracked_list, opts):
	psp_dict = {}
	psp_dict_nicks = {}
	header_list = [('COLLATE_JOBS', '# of jobs')]
	if 'DATASETSPLIT' in tracked_list:
		tracked_list.remove('DATASETSPLIT')
		if opts.collapse == 1:
			tracked_list.append('DATASETNICK')
			header_list.append(('DATASETNICK', 'DATASETNICK'))
		elif opts.collapse == 2:
			header_list.append(('COLLATE_NICK', '# of nicks'))
	for pset in psp_list:
		if ('DATASETSPLIT' in pset) and (opts.collapse == 1):
			pset.pop('DATASETSPLIT')
		nickname = None
		if ('DATASETNICK' in pset) and (opts.collapse == 2):
			nickname = pset.pop('DATASETNICK')
		hash_str = md5_hex(repr(lmap(lambda key: pset.get(str(key)), tracked_list)))
		psp_dict.setdefault(hash_str, []).append(pset)
		psp_dict_nicks.setdefault(hash_str, set()).add(nickname)

	def _do_collate(hash_str):
		psp = psp_dict[hash_str][0]
		psp['COLLATE_JOBS'] = len(psp_dict[hash_str])
		psp['COLLATE_NICK'] = len(psp_dict_nicks[hash_str])
		return psp
	psp_list = sorted(imap(_do_collate, psp_dict), key=lambda x: tuple(imap(str, x.values())))
	return (header_list, psp_list)
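# Hypothetical usage sketch for collapse_psp_list. The shims below stand in for
# grid-control's python_compat helpers (md5_hex, lmap, imap); the parameter
# space points and the Namespace-based opts object are made up for illustration.
import hashlib
from argparse import Namespace
md5_hex = lambda value: hashlib.md5(value.encode('utf-8')).hexdigest()
lmap = lambda fun, iterable: list(map(fun, iterable))
imap = map

psp_list = [
	{'DATASETNICK': 'mc_a', 'SEED': 1},
	{'DATASETNICK': 'mc_b', 'SEED': 1},
	{'DATASETNICK': 'mc_b', 'SEED': 2},
]
(header_list, collapsed) = collapse_psp_list(psp_list, ['SEED', 'DATASETSPLIT'], Namespace(collapse=2))
# Points that agree on all remaining tracked keys collapse into one row:
# here SEED=1 yields COLLATE_JOBS == 2 and COLLATE_NICK == 2 (mc_a, mc_b).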
def __init__(self, config, datasetExpr, datasetNick=None):
	ds_config = config.changeView(viewClass='TaggedConfigView', addNames=[md5_hex(datasetExpr)])
	if os.path.isdir(datasetExpr):
		scan_pipeline = ['OutputDirsFromWork']
		ds_config.set('source directory', datasetExpr)
		datasetExpr = os.path.join(datasetExpr, 'work.conf')
	else:
		scan_pipeline = ['OutputDirsFromConfig', 'MetadataFromTask']
		datasetExpr, selector = utils.optSplit(datasetExpr, '%')
		ds_config.set('source config', datasetExpr)
		ds_config.set('source job selector', selector)
	ext_config = create_config(datasetExpr)
	ext_task_name = ext_config.changeView(setSections=['global']).get(['module', 'task'])
	if 'ParaMod' in ext_task_name:  # handle old config files
		ext_task_name = ext_config.changeView(setSections=['ParaMod']).get('module')
	ext_task_cls = Plugin.getClass(ext_task_name)
	for ext_task_cls in Plugin.getClass(ext_task_name).iterClassBases():
		try:
			scan_holder = GCProviderSetup.getClass('GCProviderSetup_' + ext_task_cls.__name__)
		except PluginError:
			continue
		scan_pipeline += scan_holder.scan_pipeline
		break
	ScanProviderBase.__init__(self, ds_config, datasetExpr, datasetNick, scan_pipeline)
def _get_dataset_hash(self):
	buffer = StringBuffer()
	for _ in DataProvider.save_to_stream(buffer, self.iter_blocks_normed()):
		pass
	value = buffer.getvalue()
	buffer.close()
	return md5_hex(value)
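# Minimal standalone sketch of the same stream-then-hash pattern, assuming only
# the standard library; StringBuffer/DataProvider from the snippet above are
# grid-control classes and are replaced here by io.StringIO and a plain writer.
import hashlib
from io import StringIO

def dataset_fingerprint(block_iter):
	buffer = StringIO()
	for block in block_iter:  # serialize every normalized block into the buffer
		buffer.write(repr(sorted(block.items())) + '\n')
	value = buffer.getvalue()
	buffer.close()
	return hashlib.md5(value.encode('utf-8')).hexdigest()

# Equal block lists yield equal fingerprints - the basis for resync detection:
assert dataset_fingerprint([{'name': '/a/b'}]) == dataset_fingerprint([{'name': '/a/b'}])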
def _prepareSubmit(self, task, jobnum_list, queryArguments):
	jdlFilePath = os.path.join(self.parentPool.getSandboxPath(),
		'htc-%s.schedd-%s.jdl' % (self.parentPool.wms_name, md5_hex(self.getURI())))
	safe_write(open(jdlFilePath, 'w'),
		lmap(lambda line: line + '\n', self._getJDLData(task, jobnum_list, queryArguments)))
	return jdlFilePath
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	ds_config = config.changeView(viewClass = 'TaggedConfigView', addNames = [md5_hex(datasetExpr)])
	if '*' in os.path.basename(datasetExpr):
		ds_config.set('source directory', os.path.dirname(datasetExpr))
		ds_config.set('filename filter', datasetExpr)
	else:
		ds_config.set('source directory', datasetExpr)
	defScanner = ['FilesFromLS', 'MatchOnFilename', 'MatchDelimeter', 'DetermineEvents', 'AddFilePrefix']
	ScanProviderBase.__init__(self, ds_config, defScanner, datasetNick, datasetID)
def _translate_pa2pspi_list(padapter):
	# Reduces parameter adapter output to essential information for diff - faster than keying
	meta_iter = ifilter(lambda k: not k.untracked, padapter.get_job_metadata())
	meta_list = sorted(meta_iter, key=lambda k: k.value)
	for psp in padapter.iter_jobs():  # Translates parameter space point into hash
		psp_item_iter = imap(lambda meta: (meta.value, psp.get(meta.value)), meta_list)
		hash_str = md5_hex(repr(lfilter(itemgetter(1), psp_item_iter)))
		yield (psp[ParameterInfo.ACTIVE], hash_str, psp['GC_PARAM'])
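# Standalone sketch of the hashing idea used above: tracked keys are sorted and
# empty values dropped before hashing, so two parameter space points compare
# equal iff their non-empty tracked parameters match. All names are made up;
# md5_hex is replaced by a hashlib-based stand-in.
import hashlib
from operator import itemgetter

def psp_hash(psp, tracked_keys):
	item_list = [(key, psp.get(key)) for key in sorted(tracked_keys)]
	return hashlib.md5(repr(list(filter(itemgetter(1), item_list))).encode('utf-8')).hexdigest()

assert psp_hash({'SEED': 7, 'NICK': ''}, ['NICK', 'SEED']) == psp_hash({'SEED': 7}, ['SEED', 'NICK'])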
def list_parameters(opts, psource):
	(result, needGCParam) = get_parameters(opts, psource)
	enabledOutput = opts.output.split(',')
	output = lfilter(lambda k: not opts.output or k in enabledOutput, psource.getJobKeys())
	stored = lfilter(lambda k: k.untracked == False, output)
	untracked = lfilter(lambda k: k.untracked == True, output)

	if opts.collapse > 0:
		result_old = result
		result = {}
		result_nicks = {}
		head = [('COLLATE_JOBS', '# of jobs')]
		if 'DATASETSPLIT' in stored:
			stored.remove('DATASETSPLIT')
			if opts.collapse == 1:
				stored.append('DATASETNICK')
				head.append(('DATASETNICK', 'DATASETNICK'))
			elif opts.collapse == 2:
				head.append(('COLLATE_NICK', '# of nicks'))
		for pset in result_old:
			if ('DATASETSPLIT' in pset) and (opts.collapse == 1):
				pset.pop('DATASETSPLIT')
			nickname = None
			if ('DATASETNICK' in pset) and (opts.collapse == 2):
				nickname = pset.pop('DATASETNICK')
			h = md5_hex(repr(lmap(lambda key: pset.get(str(key)), stored)))
			result.setdefault(h, []).append(pset)
			result_nicks.setdefault(h, set()).add(nickname)

		def doCollate(h):
			tmp = result[h][0]
			tmp['COLLATE_JOBS'] = len(result[h])
			tmp['COLLATE_NICK'] = len(result_nicks[h])
			return tmp
		result = lmap(doCollate, result)
	else:
		head = [('GC_JOB_ID', '#')]
		if needGCParam:
			head.append(('GC_PARAM', 'GC_PARAM'))
	if opts.active:
		head.append((ParameterInfo.ACTIVE, 'ACTIVE'))
	if opts.visible:
		stored = opts.visible.split(',')
	head.extend(sorted(izip(stored, stored)))
	if opts.untracked:
		head.extend(sorted(imap(lambda n: (n, '(%s)' % n),
			ifilter(lambda n: n not in ['GC_PARAM', 'GC_JOB_ID'], untracked))))
	utils.vprint('')
	utils.printTabular(head, result)
def _prepareSubmit(self, task, jobnum_list, queryArguments):
	localJdlFilePath = os.path.join(self.parentPool.getSandboxPath(),
		'htc-%s.schedd-%s.jdl' % (self.parentPool.wms_name, md5_hex(self.getURI())))
	readyJobNumList = self._stageSubmitFiles(task, jobnum_list)
	safe_write(open(localJdlFilePath, 'w'),
		lmap(lambda line: line + '\n', self._getJDLData(task, readyJobNumList, queryArguments)))
	raise NotImplementedError('JDL must get moved to remote')
	return localJdlFilePath  # unreachable until the remote staging above is implemented
def create_dbs3_json_files(opts, block_info, block_dump):
	block_size = 0
	dataset_type = set()
	for file_info in block_info[DataProvider.FileList]:
		metadata_info = dict(izip(block_info[DataProvider.Metadata], file_info[DataProvider.Metadata]))
		if metadata_info['CMSSW_DATATYPE']:  # this is not always correctly filled
			dataset_type.add(metadata_info['CMSSW_DATATYPE'])
		file_size = metadata_info['SE_OUTPUT_SIZE']
		lfn = file_info[DataProvider.URL]

		# add file information
		block_dump['files'].append({
			'logical_file_name': lfn, 'file_size': file_size,
			'check_sum': metadata_info['SE_OUTPUT_HASH_CRC32'],
			'md5': metadata_info['SE_OUTPUT_HASH_MD5'],
			'adler32': 'NOTSET',
			'file_lumi_list': lmap(lambda run_lumi:
				{'run_num': run_lumi[0], 'lumi_section_num': run_lumi[1]}, metadata_info['CMSSW_LUMIS']),
			'event_count': metadata_info['CMSSW_EVENTS_WRITE'],
			'file_type': 'EDM',
			'auto_cross_section': 0.0,
		})

		# add file parentage information
		if not opts.no_parents:
			block_dump['file_parent_list'].extend(imap(lambda parent_lfn:
				{'logical_file_name': lfn, 'parent_logical_file_name': parent_lfn},
				metadata_info['CMSSW_PARENT_LFN']))

		# fill file / dataset configurations
		dataset_conf_dict = {
			'release_version': metadata_info['CMSSW_VERSION'],
			'pset_hash': metadata_info['CMSSW_CONFIG_HASH'],
			'app_name': 'cmsRun',
			'output_module_label': 'crab2_mod_label',
			'global_tag': metadata_info.get('CMSSW_GLOBALTAG', opts.globaltag)
		}
		if opts.unique_cfg:
			dataset_conf_dict['pset_hash'] = md5_hex(dataset_conf_dict['pset_hash'] +
				block_info[DataProvider.Dataset])
		if dataset_conf_dict not in block_dump['dataset_conf_list']:
			block_dump['dataset_conf_list'].append(dataset_conf_dict)

		# file configurations also specifies lfn
		file_conf_dict = dict(dataset_conf_dict)
		file_conf_dict['lfn'] = lfn
		block_dump['file_conf_list'].append(file_conf_dict)

		# update block size for block summary information
		block_size += file_size
	return (block_size, dataset_type)
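# Hypothetical call sketch for create_dbs3_json_files: block_dump must provide
# the four lists the function appends to (key names taken from the function
# body). The DataProvider stub, the opts object and the metadata values below
# are assumptions for illustration; izip/lmap/imap stand in for grid-control's
# python_compat helpers, and in grid-control block_info is a DataProvider block.
from argparse import Namespace
izip = zip
imap = map
lmap = lambda fun, iterable: list(map(fun, iterable))

class DataProvider(object):  # stand-in for the enum-like attributes used above
	(FileList, Metadata, URL, Dataset) = ('FileList', 'Metadata', 'URL', 'Dataset')

opts = Namespace(no_parents=True, globaltag='GT::All', unique_cfg=False)
block_info = {
	DataProvider.Dataset: '/prim/proc/TIER',
	DataProvider.Metadata: ['CMSSW_DATATYPE', 'SE_OUTPUT_SIZE', 'SE_OUTPUT_HASH_CRC32',
		'SE_OUTPUT_HASH_MD5', 'CMSSW_LUMIS', 'CMSSW_EVENTS_WRITE',
		'CMSSW_VERSION', 'CMSSW_CONFIG_HASH'],
	DataProvider.FileList: [{
		DataProvider.URL: '/store/file.root',
		DataProvider.Metadata: ['mc', 4242, 'crc32', 'md5', [(1, 1)], 100,
			'CMSSW_10_2_X', 'cfg-hash'],
	}],
}
block_dump = {'files': [], 'file_parent_list': [], 'dataset_conf_list': [], 'file_conf_list': []}
(block_size, dataset_type) = create_dbs3_json_files(opts, block_info, block_dump)
assert (block_size, dataset_type) == (4242, set(['mc']))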
def __init__(self, config, name):
	NamedPlugin.__init__(self, config, name)
	initSandbox = changeInitNeeded('sandbox')
	self._varCheck = validNoVar(config)

	# Task requirements
	jobs_config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['jobs'], addTags = [self])
	# Move this into parameter manager?
	self.wallTime = jobs_config.getTime('wall time', onChange = None)
	self.cpuTime = jobs_config.getTime('cpu time', self.wallTime, onChange = None)
	self.cpus = jobs_config.getInt('cpus', 1, onChange = None)
	self.memory = jobs_config.getInt('memory', -1, onChange = None)
	self.nodeTimeout = jobs_config.getTime('node timeout', -1, onChange = initSandbox)

	# Compute / get task ID
	self.taskID = config.get('task id', 'GC' + md5_hex(str(time()))[:12], persistent = True)
	self.taskDate = config.get('task date', strftime('%Y-%m-%d'), persistent = True, onChange = initSandbox)
	self.taskConfigName = config.getConfigName()
	self._job_name_generator = config.getPlugin('job name generator', 'DefaultJobName',
		cls = JobNamePlugin, pargs = (self,))

	# Storage setup
	storage_config = config.changeView(viewClass = 'TaggedConfigView',
		setClasses = None, setNames = None, addSections = ['storage'], addTags = [self])
	self.taskVariables = {
		# Space limits
		'SCRATCH_UL': storage_config.getInt('scratch space used', 5000, onChange = initSandbox),
		'SCRATCH_LL': storage_config.getInt('scratch space left', 1, onChange = initSandbox),
		'LANDINGZONE_UL': storage_config.getInt('landing zone space used', 100, onChange = initSandbox),
		'LANDINGZONE_LL': storage_config.getInt('landing zone space left', 1, onChange = initSandbox),
	}
	storage_config.set('se output pattern', 'job_@GC_JOB_ID@_@X@')
	self.seMinSize = storage_config.getInt('se min size', -1, onChange = initSandbox)

	self.sbInputFiles = config.getPaths('input files', [], onChange = initSandbox)
	self.sbOutputFiles = config.getList('output files', [], onChange = initSandbox)
	self.gzipOut = config.getBool('gzip output', True, onChange = initSandbox)
	self._subst_files = config.getList('subst files', [], onChange = initSandbox)
	self.dependencies = lmap(str.lower, config.getList('depends', [], onChange = initSandbox))

	# Get error messages from gc-run.lib comments
	self.errorDict = {}
	self.updateErrorDict(utils.pathShare('gc-run.lib'))

	# Init parameter source manager
	psrc_repository = {}
	self._setupJobParameters(config, psrc_repository)
	self._pfactory = config.getPlugin('internal parameter factory', 'BasicParameterFactory',
		cls = ParameterFactory, pargs = (psrc_repository,), tags = [self], inherit = True)
	self.source = config.getPlugin('parameter adapter', 'TrackedParameterAdapter',
		cls = ParameterAdapter, pargs = (self._pfactory.getSource(),))
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	config = config.changeView(viewClass = 'SimpleConfigView', setSections = ['datasource %s' % datasetExpr])
	self._block = self._readBlockFromConfig(config, datasetExpr, datasetNick, datasetID)

	dataset_hash_new = md5_hex(repr(self._block))
	dataset_hash_old = config.get('dataset hash', dataset_hash_new, persistent = True)
	self._request_resync = dataset_hash_new != dataset_hash_old
	if self._request_resync:
		self._log.critical('Dataset %r changed', datasetExpr)
		config.setState(True, 'resync', detail = 'dataset')
		config.setState(True, 'resync', detail = 'parameters')
		config.set('dataset hash', dataset_hash_new)
def __init__(self, config, name):
	NamedPlugin.__init__(self, config, name)
	initSandbox = changeInitNeeded('sandbox')
	self._varCheck = validNoVar(config)

	# Task requirements
	jobs_config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['jobs'], addTags = [self])
	# Move this into parameter manager?
	self.wallTime = jobs_config.getTime('wall time', onChange = None)
	self.cpuTime = jobs_config.getTime('cpu time', self.wallTime, onChange = None)
	self.cpus = jobs_config.getInt('cpus', 1, onChange = None)
	self.memory = jobs_config.getInt('memory', -1, onChange = None)
	self.nodeTimeout = jobs_config.getTime('node timeout', -1, onChange = initSandbox)

	# Compute / get task ID
	self.taskID = config.get('task id', 'GC' + md5_hex(str(time()))[:12], persistent = True)
	self.taskDate = config.get('task date', strftime('%Y-%m-%d'), persistent = True, onChange = initSandbox)
	self.taskConfigName = config.getConfigName()
	self._job_name_generator = config.getPlugin('job name generator', 'DefaultJobName',
		cls = JobNamePlugin, pargs = (self,))

	# Storage setup
	storage_config = config.changeView(viewClass = 'TaggedConfigView',
		setClasses = None, setNames = None, addSections = ['storage'], addTags = [self])
	self.taskVariables = {
		# Space limits
		'SCRATCH_UL': storage_config.getInt('scratch space used', 5000, onChange = initSandbox),
		'SCRATCH_LL': storage_config.getInt('scratch space left', 1, onChange = initSandbox),
		'LANDINGZONE_UL': storage_config.getInt('landing zone space used', 100, onChange = initSandbox),
		'LANDINGZONE_LL': storage_config.getInt('landing zone space left', 1, onChange = initSandbox),
	}
	storage_config.set('se output pattern', 'job_@GC_JOB_ID@_@X@')
	self.seMinSize = storage_config.getInt('se min size', -1, onChange = initSandbox)

	self.sbInputFiles = config.getPaths('input files', [], onChange = initSandbox)
	self.sbOutputFiles = config.getList('output files', [], onChange = initSandbox)
	self.gzipOut = config.getBool('gzip output', True, onChange = initSandbox)
	self.substFiles = config.getList('subst files', [], onChange = initSandbox)
	self.dependencies = lmap(str.lower, config.getList('depends', [], onChange = initSandbox))

	# Get error messages from gc-run.lib comments
	self.errorDict = {}
	self.updateErrorDict(utils.pathShare('gc-run.lib'))

	# Init parameter source manager
	self._setupJobParameters(config)
	self._pfactory = config.getPlugin('internal parameter factory', 'BasicParameterFactory',
		cls = ParameterFactory, tags = [self], inherit = True)
	self.source = config.getPlugin('parameter adapter', 'TrackedParameterAdapter',
		cls = ParameterAdapter, pargs = (self._pfactory.getSource(),))
def processFI(fiList):
	for fi in fiList:
		urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries],
			fi.get(DataProvider.Metadata))))
		if urlHash in self._recordedURL:
			msg = 'Multiple occurrences of URL: %r!' % fi[DataProvider.URL]
			msg += ' (This check can be configured with %r)' % 'dataset check unique url'
			if self._checkURL == DatasetUniqueMode.warn:
				self._log.warning(msg)
			elif self._checkURL == DatasetUniqueMode.abort:
				raise DatasetError(msg)
			elif self._checkURL == DatasetUniqueMode.skip:
				continue
		self._recordedURL.add(urlHash)
		recordedBlockURL.append(urlHash)
		yield fi
def __init__(self, config, datasetExpr, datasetNick = None):
	ds_config = config.changeView(viewClass = 'TaggedConfigView', addNames = [md5_hex(datasetExpr)])
	basename = os.path.basename(datasetExpr)
	firstScanner = 'FilesFromLS'
	if '*' in basename:
		ds_config.set('source directory', datasetExpr.replace(basename, ''))
		ds_config.set('filename filter', basename)
	elif not datasetExpr.endswith('.dbs'):
		ds_config.set('source directory', datasetExpr)
	else:
		ds_config.set('source dataset path', datasetExpr)
		ds_config.set('filename filter', '')
		firstScanner = 'FilesFromDataProvider'
	defScanner = [firstScanner, 'MatchOnFilename', 'MatchDelimeter', 'DetermineEvents', 'AddFilePrefix']
	ScanProviderBase.__init__(self, ds_config, datasetExpr, datasetNick, defScanner)
def processBlock(self, block):
	# Check uniqueness of URLs
	recordedBlockURL = []
	if self._checkURL != DatasetUniqueMode.ignore:
		def processFI(fiList):
			for fi in fiList:
				urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries],
					fi.get(DataProvider.Metadata))))
				if urlHash in self._recordedURL:
					msg = 'Multiple occurrences of URL: %r!' % fi[DataProvider.URL]
					msg += ' (This check can be configured with %r)' % 'dataset check unique url'
					if self._checkURL == DatasetUniqueMode.warn:
						self._log.warning(msg)
					elif self._checkURL == DatasetUniqueMode.abort:
						raise DatasetError(msg)
					elif self._checkURL == DatasetUniqueMode.skip:
						continue
				self._recordedURL.add(urlHash)
				recordedBlockURL.append(urlHash)
				yield fi
		block[DataProvider.FileList] = list(processFI(block[DataProvider.FileList]))
		recordedBlockURL.sort()

	# Check uniqueness of blocks
	if self._checkBlock != DatasetUniqueMode.ignore:
		blockHash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
			recordedBlockURL, block[DataProvider.NEntries], block[DataProvider.Locations],
			block.get(DataProvider.Metadata))))
		if blockHash in self._recordedBlock:
			msg = 'Multiple occurrences of block: "%s"!' % DataProvider.bName(block)
			msg += ' (This check can be configured with %r)' % 'dataset check unique block'
			if self._checkBlock == DatasetUniqueMode.warn:
				self._log.warning(msg)
			elif self._checkBlock == DatasetUniqueMode.abort:
				raise DatasetError(msg)
			elif self._checkBlock == DatasetUniqueMode.skip:
				return None
		self._recordedBlock.add(blockHash)
	return block
def list_parameters(opts, psource):
	(result, needGCParam) = get_parameters(opts, psource)
	enabledOutput = opts.output.split(',')
	output = lfilter(lambda k: not opts.output or k in enabledOutput, psource.getJobKeys())
	stored = lfilter(lambda k: k.untracked == False, output)
	untracked = lfilter(lambda k: k.untracked == True, output)

	if opts.collapse > 0:
		result_old = result
		result = {}
		result_nicks = {}
		head = [('COLLATE_JOBS', '# of jobs')]
		if 'DATASETSPLIT' in stored:
			stored.remove('DATASETSPLIT')
			if opts.collapse == 1:
				stored.append('DATASETNICK')
				head.append(('DATASETNICK', 'DATASETNICK'))
			elif opts.collapse == 2:
				head.append(('COLLATE_NICK', '# of nicks'))
		for pset in result_old:
			if ('DATASETSPLIT' in pset) and (opts.collapse == 1):
				pset.pop('DATASETSPLIT')
			nickname = None
			if ('DATASETNICK' in pset) and (opts.collapse == 2):
				nickname = pset.pop('DATASETNICK')
			h = md5_hex(repr(lmap(pset.get, stored)))
			result.setdefault(h, []).append(pset)
			result_nicks.setdefault(h, set()).add(nickname)

		def doCollate(h):
			tmp = result[h][0]
			tmp['COLLATE_JOBS'] = len(result[h])
			tmp['COLLATE_NICK'] = len(result_nicks[h])
			return tmp
		result = lmap(doCollate, result)
	else:
		head = [('GC_JOB_ID', '#')]
		if needGCParam:
			head.append(('GC_PARAM', 'GC_PARAM'))
	if opts.active:
		head.append((ParameterInfo.ACTIVE, 'ACTIVE'))
	if opts.visible:
		stored = opts.visible.split(',')
	head.extend(sorted(izip(stored, stored)))
	if opts.untracked:
		head.extend(sorted(imap(lambda n: (n, '(%s)' % n),
			ifilter(lambda n: n not in ['GC_PARAM', 'GC_JOB_ID'], untracked))))
	utils.vprint('')
	utils.printTabular(head, result)
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	ds_config = config.change_view(view_class='TaggedConfigView', add_names=[md5_hex(dataset_expr)])
	basename = os.path.basename(dataset_expr)
	scanner_first = 'FilesFromLS'
	if '*' in basename:
		ds_config.set('source directory', dataset_expr.replace(basename, ''))
		ds_config.set('filename filter', basename)
	elif not dataset_expr.endswith('.dbs'):
		ds_config.set('source directory', dataset_expr)
	else:
		ds_config.set('source dataset path', dataset_expr)
		ds_config.set('filename filter', '')
		scanner_first = 'FilesFromDataProvider'
	scanner_list_default = [scanner_first, 'MatchOnFilename', 'MatchDelimeter',
		'DetermineEvents', 'AddFilePrefix']
	ScanProviderBase.__init__(self, ds_config, datasource_name, dataset_expr,
		dataset_nick, dataset_proc, scanner_list_default)
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	ds_config = config.changeView(viewClass = 'TaggedConfigView', addNames = [md5_hex(datasetExpr)])
	if os.path.isdir(datasetExpr):
		GCProvider.stageDir[None] = ['OutputDirsFromWork']
		ds_config.set('source directory', datasetExpr)
		datasetExpr = os.path.join(datasetExpr, 'work.conf')
	else:
		GCProvider.stageDir[None] = ['OutputDirsFromConfig', 'MetadataFromTask']
		datasetExpr, selector = utils.optSplit(datasetExpr, '%')
		ds_config.set('source config', datasetExpr)
		ds_config.set('source job selector', selector)
	ext_config = createConfig(datasetExpr)
	ext_task_name = ext_config.changeView(setSections = ['global']).get(['task', 'module'])
	if 'ParaMod' in ext_task_name:  # handle old config files
		ext_task_name = ext_config.changeView(setSections = ['ParaMod']).get('module')
	sGet = lambda scannerDict: scannerDict.get(None) + scannerDict.get(ext_task_name, [])
	sList = sGet(GCProvider.stageDir) + ['JobInfoFromOutputDir', 'FilesFromJobInfo'] + \
		sGet(GCProvider.stageFile) + ['DetermineEvents', 'AddFilePrefix']
	ScanProviderBase.__init__(self, ds_config, sList, datasetNick, datasetID)
def bulkSubmissionBegin(self):
	self._submitParams.update({ '-d': None })
	if self._discovery_module:
		self._submitParams.update({ '-e': self._discovery_module.getWMS() })
	if self._useDelegate is False:
		self._submitParams.update({ '-a': ' ' })
		return True
	dID = 'GCD' + md5_hex(str(time.time()))[:10]
	activity = utils.ActivityLog('creating delegate proxy for job submission')
	delegateArgs = []
	if self._configVO:
		delegateArgs.extend(['--config', self._configVO])
	proc = LocalProcess(self._delegateExec, '-d', dID,
		'--noint', '--logfile', '/dev/stderr', *delegateArgs)
	output = proc.get_output(timeout = 10, raise_errors = False)
	if ('glite-wms-job-delegate-proxy Success' in output) and (dID in output):
		self._submitParams.update({ '-d': dID })
	del activity
	if proc.status(timeout = 0, terminate = True) != 0:
		self._log.log_process(proc)
	return (self._submitParams.get('-d', None) is not None)
def _begin_bulk_submission(self):
	self._submit_args_dict.update({'-D': None})
	if self._use_delegate is False:
		self._submit_args_dict.update({'-a': ' '})
		return True
	delegate_id = 'GCD' + md5_hex(str(time.time()))[:10]
	activity = Activity('creating delegate proxy for job submission')
	delegate_arg_list = ['-e', self._ce[:self._ce.rfind("/")]]
	if self._config_fn:
		delegate_arg_list.extend(['--config', self._config_fn])
	proc = LocalProcess(self._delegate_exec, '-d', delegate_id,
		'--logfile', '/dev/stderr', *delegate_arg_list)
	output = proc.get_output(timeout=10, raise_errors=False)
	if ('succesfully delegated to endpoint' in output) and (delegate_id in output):
		self._submit_args_dict.update({'-D': delegate_id})
	activity.finish()
	if proc.status(timeout=0, terminate=True) != 0:
		self._log.log_process(proc)
	return self._submit_args_dict.get('-D') is not None
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	ds_config = config.change_view(view_class='TaggedConfigView', add_names=[md5_hex(dataset_expr)])
	if os.path.isdir(dataset_expr):
		scanner_list = ['OutputDirsFromWork']
		ds_config.set('source directory', dataset_expr)
		dataset_expr = os.path.join(dataset_expr, 'work.conf')
	else:
		scanner_list = ['OutputDirsFromConfig', 'MetadataFromTask']
		dataset_expr, selector = split_opt(dataset_expr, '%')
		ds_config.set('source config', dataset_expr)
		ds_config.set('source job selector', selector)
	ext_config = create_config(dataset_expr)
	ext_task_name = ext_config.change_view(set_sections=['global']).get(['module', 'task'])
	ext_task_cls = Plugin.get_class(ext_task_name)
	for ext_task_cls in Plugin.get_class(ext_task_name).iter_class_bases():
		scan_setup_name = 'GCProviderSetup_' + ext_task_cls.__name__
		scan_setup_cls = GCProviderSetup.get_class(scan_setup_name, ignore_missing=True)
		if scan_setup_cls:
			scanner_list += scan_setup_cls.scanner_list
			break
	ScanProviderBase.__init__(self, ds_config, datasource_name, dataset_expr,
		dataset_nick, dataset_proc, scanner_list)
def _begin_bulk_submission(self):
	self._submit_args_dict.update({'-d': None})
	if self._discovery_plugin:
		self._submit_args_dict.update({'-e': self._discovery_plugin.get_endpoint()})
	if self._use_delegate is False:
		self._submit_args_dict.update({'-a': ' '})
		return True
	delegate_id = 'GCD' + md5_hex(str(time.time()))[:10]
	activity = Activity('creating delegate proxy for job submission')
	delegate_arg_list = []
	if self._config_fn:
		delegate_arg_list.extend(['--config', self._config_fn])
	proc = LocalProcess(self._delegate_exec, '-d', delegate_id,
		'--noint', '--logfile', '/dev/stderr', *delegate_arg_list)
	output = proc.get_output(timeout=10, raise_errors=False)
	if ('glite-wms-job-delegate-proxy Success' in output) and (delegate_id in output):
		self._submit_args_dict.update({'-d': delegate_id})
	activity.finish()
	if proc.status(timeout=0, terminate=True) != 0:
		self._log.log_process(proc)
	return self._submit_args_dict.get('-d') is not None
def _get_jobs_output(self, gc_id_jobnum_list):
	# Get output of jobs and yield output dirs
	if len(gc_id_jobnum_list) == 0:
		raise StopIteration
	tmp_dn = os.path.join(self._path_output, 'tmp')
	try:
		if len(gc_id_jobnum_list) == 1:
			# For single jobs create single subdir
			tmp_dn = os.path.join(tmp_dn, md5_hex(gc_id_jobnum_list[0][0]))
		ensure_dir_exists(tmp_dn)
	except Exception:
		raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

	map_gc_id2jobnum = dict(gc_id_jobnum_list)
	jobnum_list_todo = list(map_gc_id2jobnum.values())
	wms_id_list_done = []
	activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
	chunk_pos_iter = irange(0, len(gc_id_jobnum_list), self._chunk_size)
	for ids in imap(lambda x: gc_id_jobnum_list[x:x + self._chunk_size], chunk_pos_iter):
		for (current_jobnum, output_dn) in self.get_jobs_output_chunk(tmp_dn, ids, wms_id_list_done):
			unpack_wildcard_tar(self._log, output_dn)
			jobnum_list_todo.remove(current_jobnum)
			yield (current_jobnum, output_dn)
	activity.finish()

	# return unretrievable jobs
	for jobnum in jobnum_list_todo:
		yield (jobnum, None)
	self._purge_done_jobs(wms_id_list_done)
	remove_files([tmp_dn])
def __init__(self, config, name):
	# Read configuration options and init vars
	NamedPlugin.__init__(self, config, name)
	init_sandbox = TriggerInit('sandbox')
	self._var_checker = NoVarCheck(config)

	# Task requirements
	# Move this into parameter manager?
	jobs_config = config.change_view(view_class='TaggedConfigView', add_sections=['jobs'], add_tags=[self])
	self.wall_time = jobs_config.get_time('wall time', on_change=None)
	self._cpu_time = jobs_config.get_time('cpu time', self.wall_time, on_change=None)
	self._cores = jobs_config.get_int(['cores', 'cpus'], 1, on_change=None)
	self._memory = jobs_config.get_int('memory', -1, on_change=None)
	self._job_timeout = jobs_config.get_time('node timeout', -1, on_change=init_sandbox)

	# Compute / get task ID
	self._task_id = config.get('task id', 'GC' + md5_hex(str(time.time()))[:12], persistent=True)
	self._task_date = config.get('task date', time.strftime('%Y-%m-%d'),
		persistent=True, on_change=init_sandbox)
	self._task_time = config.get('task time', time.strftime('%H%M%S'),
		persistent=True, on_change=init_sandbox)
	task_name_generator = config.get_plugin('task name generator', 'DefaultTaskName', cls=TaskNamePlugin)
	self._task_name = task_name_generator.get_name(self)
	self._task_config_name = config.get_config_name()
	self._job_name_generator = config.get_plugin('job name generator', 'DefaultJobName', cls=JobNamePlugin)

	# Storage setup
	storage_config = config.change_view(view_class='TaggedConfigView',
		set_classes=None, set_names=None, add_sections=['storage'], add_tags=[self])
	scratch_space_used = storage_config.get_int('scratch space used', 5000, on_change=init_sandbox)
	lz_space_used = storage_config.get_int('landing zone space used', 100, on_change=init_sandbox)
	self._task_var_dict = {
		# Space limits
		'SCRATCH_UL': scratch_space_used,
		'SCRATCH_LL': storage_config.get_int('scratch space left', 1, on_change=init_sandbox),
		'LANDINGZONE_UL': lz_space_used,
		'LANDINGZONE_LL': storage_config.get_int('landing zone space left', 1, on_change=init_sandbox),
	}
	storage_config.set('se output pattern', 'job_@GC_JOB_ID@_@X@')
	self._se_min_size = storage_config.get_int('se min size', -1, on_change=init_sandbox)
	self._disk_min = max(scratch_space_used, lz_space_used)

	self._sb_in_fn_list = config.get_path_list('input files', [], on_change=init_sandbox)
	self._sb_out_fn_list = config.get_list('output files', [], on_change=init_sandbox)
	self._do_gzip_std_output = config.get_bool('gzip output', True, on_change=init_sandbox)
	self._subst_files = config.get_list('subst files', [], on_change=init_sandbox)
	self._dependencies = lmap(str.lower, config.get_list('depends', [], on_change=init_sandbox))

	# Get error messages from gc-run.lib comments
	self.map_error_code2msg = {}
	self._update_map_error_code2msg(get_path_share('gc-run.lib'))

	# Init parameter source manager
	psrc_repository = {}
	self._setup_repository(config, psrc_repository)
	pfactory = config.get_plugin('internal parameter factory', 'BasicParameterFactory',
		cls=ParameterFactory, bind_kwargs={'tags': [self], 'inherit': True})
	self._source = config.get_plugin('parameter adapter', 'TrackedParameterAdapter',
		cls=ParameterAdapter, pargs=(pfactory.get_psrc(psrc_repository),))
	self._log.log(logging.DEBUG3, 'Using parameter adapter %s', repr(self._source))

	self._log.info('Current task ID: %s', self._task_id)
	self._log.info('Task started on: %s', self._task_date)
def getNodeName(instance):
	return instance.__class__.__name__ + '_' + md5_hex(repr(hash(instance)))
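# Quick usage sketch for getNodeName; the md5_hex stand-in and the Scanner
# class are assumptions for illustration.
import hashlib
md5_hex = lambda value: hashlib.md5(value.encode('utf-8')).hexdigest()

class Scanner(object):  # made-up plugin class
	pass

instance = Scanner()
print(getNodeName(instance))  # e.g. 'Scanner_5f1a...' - the default hash() is
# id-based, so the suffix is unique per live instance, not stable across runs.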
def _get_hash(self, keys, metadata_dict, hash_seed):
	return md5_hex(repr(hash_seed) + repr(lmap(metadata_dict.get, keys)))
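# Standalone sketch of the seeded hashing used by _get_hash and
# _assign_dataset_block above: a dataset-level seed flows into the block-level
# hash, so files can only share a block if they already share a dataset.
# hashlib stands in for md5_hex; seeds and metadata keys are made up.
import hashlib

def get_hash(keys, metadata_dict, hash_seed):
	value = repr(hash_seed) + repr([metadata_dict.get(key) for key in keys])
	return hashlib.md5(value.encode('utf-8')).hexdigest()

metadata = {'CMSSW_VERSION': 'CMSSW_10_2_X', 'SE_LIST': ['se.example.org']}
hash_dataset = get_hash(['CMSSW_VERSION'], metadata, 'seed-from-dataset-expr')
hash_block = get_hash(['SE_LIST'], metadata, hash_dataset + 'seed-from-locations')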
def getHash(self):
	return md5_hex(self._psource.getHash() + str([self._posStart, self._posEnd]))
def getHash(self):
	return md5_hex(self._psource.getHash() + str(self.times))
def getHash(self):
	return md5_hex(str(lmap(lambda p: str(p.getMaxParameters()) + p.getHash(), self._psourceList)))
def _generateKey(self, keys, base, path, metadata, events, seList, objStore):
	return md5_hex(repr(base) + repr(lmap(metadata.get, keys)))
def getHash(self):
	return md5_hex(str(self._srcName) + str(self._dataSplitter.getMaxJobs()) + str(self.resyncEnabled()))
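# Sketch of the hash-chaining idea shared by the getHash variants above: each
# wrapper mixes its own state into the hash of the wrapped source, so a change
# anywhere in the chain changes the outermost hash. Class and attribute names
# are made up; hashlib stands in for md5_hex.
import hashlib
md5_hex = lambda value: hashlib.md5(value.encode('utf-8')).hexdigest()

class ConstPSrc(object):
	def __init__(self, value):
		self._value = value
	def getHash(self):
		return md5_hex(repr(self._value))

class RangePSrc(object):
	def __init__(self, psource, pos_start, pos_end):
		(self._psource, self._posStart, self._posEnd) = (psource, pos_start, pos_end)
	def getHash(self):
		return md5_hex(self._psource.getHash() + str([self._posStart, self._posEnd]))

assert RangePSrc(ConstPSrc(1), 0, 9).getHash() != RangePSrc(ConstPSrc(2), 0, 9).getHash()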