Example #1
 def getEntries(self, path, metadata, events, seList, objStore):
     if 'JOBINFO' not in objStore:
         raise DatasetError(
             'Job information is not filled! Ensure that "JobInfoFromOutputDir" is scheduled!'
         )
     try:
         jobInfo = objStore['JOBINFO']
         files = ifilter(lambda x: x[0].startswith('file'), jobInfo.items())
         fileInfos = imap(lambda x_y: tuple(x_y[1].strip('"').split('  ')),
                          files)
         for (hashMD5, name_local, name_dest, pathSE) in fileInfos:
             metadata.update({
                 'SE_OUTPUT_HASH_MD5': hashMD5,
                 'SE_OUTPUT_FILE': name_local,
                 'SE_OUTPUT_BASE': os.path.splitext(name_local)[0],
                 'SE_OUTPUT_PATH': pathSE
             })
             yield (os.path.join(pathSE, name_dest), metadata, events,
                    seList, objStore)
     except KeyboardInterrupt:
         sys.exit(os.EX_TEMPFAIL)
     except Exception:
         raise DatasetError('Unable to read file stageout information!')
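The double-space split above assumes that each 'file…' entry of the job info carries four double-space-separated fields wrapped in quotes. A minimal sketch of that assumed format (all values are hypothetical):

    entry = '"d41d8cd98f00b204e9800998ecf8427e  out.root  out_1.root  srm://se.example.org/store/user/test"'
    # strip the surrounding quotes, then split on the double-space delimiter
    hash_md5, name_local, name_dest, path_se = entry.strip('"').split('  ')
    assert name_local == 'out.root' and path_se.startswith('srm://')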
Example #2
	def processBlock(self, block):
		if self._lumi_filter.empty() and ((self._lumi_keep == LumiKeep.RunLumi) or (DataProvider.Metadata not in block)):
			return block
		def getMetadataIdx(key):
			if key in block.get(DataProvider.Metadata, []):
				return block[DataProvider.Metadata].index(key)
		idxRuns = getMetadataIdx('Runs')
		idxLumi = getMetadataIdx('Lumi')
		if not self._lumi_filter.empty():
			lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector = False)
			if lumi_filter and (self._lumi_strict == LumiMode.strict) and ((idxRuns is None) or (idxLumi is None)):
				raise DatasetError('Strict lumi filter active but dataset %s does not provide lumi information!' % DataProvider.bName(block))
			elif lumi_filter and (self._lumi_strict == LumiMode.weak) and (idxRuns is None):
				raise DatasetError('Weak lumi filter active but dataset %s does not provide run information!' % DataProvider.bName(block))

		block[DataProvider.FileList] = list(self._processFI(block, idxRuns, idxLumi))
		if not block[DataProvider.FileList]:
			return
		block[DataProvider.NEntries] = sum(imap(lambda fi: fi[DataProvider.NEntries], block[DataProvider.FileList]))
		# Prune metadata
		if self._lumi_keep == LumiKeep.RunLumi:
			return block
		elif self._lumi_keep == LumiKeep.Run:
			idxRuns = None
		removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
		return block
Example #3
def get_dataset_info(opts, args, query_blocks=True):
    config = get_dataset_config(opts, args)
    if opts.threads is not None:
        config.set_int('dataprovider thread max', int(opts.threads) or 1)
    provider = config.get_composited_plugin(
        'dataset',
        cls=DataProvider,
        bind_kwargs={'provider_name_default': config.get('dataset provider')},
        default_compositor=':ThreadedMultiDatasetProvider:')
    dataset_list = sorted(provider.get_dataset_name_list())
    if len(dataset_list) == 0:
        raise DatasetError('No datasets matched!')

    # Query blocks only if needed
    query_blocks = False
    for option in opts.__dict__:
        if (option.startswith('list_') and (option != 'list_dataset_names')) or (option == 'save'):
            if getattr(opts, option):
                query_blocks = True

    block_list = None
    if query_blocks:
        block_list = provider.get_block_list_cached(show_stats=False)
        if len(block_list) == 0:
            raise DatasetError('No blocks matched!')
        if opts.ordered:
            sort_inplace(block_list,
                         key=itemgetter(DataProvider.Dataset,
                                        DataProvider.BlockName))
            for block in block_list:
                sort_inplace(block[DataProvider.FileList],
                             key=itemgetter(DataProvider.URL))
    return (provider, dataset_list, block_list)
Example #4
 def _iter_datasource_items(self, item, metadata_dict, entries,
                            location_list, obj_dict):
     if 'JOBINFO' not in obj_dict:
         raise DatasetError(
             'Job infos not available! Ensure that "JobInfoFromOutputDir" is selected!'
         )
     try:
         job_info_dict = obj_dict['JOBINFO']
         file_info_str_iter = ifilter(lambda x: x[0].startswith('file'),
                                      job_info_dict.items())
         file_info_tuple_list = imap(
             lambda x_y: tuple(x_y[1].strip('"').split('  ')),
             file_info_str_iter)
         for (file_hash, fn_local, fn_dest, se_path) in file_info_tuple_list:
             metadata_dict.update({
                 'SE_OUTPUT_HASH_MD5': file_hash,
                 'SE_OUTPUT_FILE': fn_local,
                 'SE_OUTPUT_BASE': os.path.splitext(fn_local)[0],
                 'SE_OUTPUT_PATH': se_path
             })
             yield (os.path.join(se_path, fn_dest), metadata_dict, entries,
                    location_list, obj_dict)
     except Exception:
         raise DatasetError('Unable to read file stageout information!')
Example #5
	def _check_lumi_filter(self, block, idx_runs, idx_lumi):
		lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
		if not lumi_filter:
			return
		if (self._lumi_strict == LumiMode.strict) and ((idx_runs is None) or (idx_lumi is None)):
			raise DatasetError('Strict lumi filter active but ' +
				'dataset %s does not provide lumi information!' % DataProvider.get_block_id(block))
		elif (self._lumi_strict == LumiMode.weak) and (idx_runs is None):
			raise DatasetError('Weak lumi filter active but ' +
				'dataset %s does not provide run information!' % DataProvider.get_block_id(block))
Example #6
	def _iter_datasource_items(self, item, metadata_dict, entries, location_list, obj_dict):
		jobnum = metadata_dict['GC_JOBNUM']
		cms_log_fn = os.path.join(item, 'cmssw.dbs.tar.gz')
		if os.path.exists(cms_log_fn):
			tar = tarfile.open(cms_log_fn, 'r')
			# Collect infos about transferred files
			file_summary_map = {}
			try:
				file_info_str_list = tar.extractfile('files').readlines()
				for rawdata in imap(lambda value: bytes2str(value).split(), file_info_str_list):
					file_summary_map[rawdata[2]] = {
						'SE_OUTPUT_HASH_CRC32': rawdata[0],
						'SE_OUTPUT_SIZE': int(rawdata[1])
					}
				obj_dict['CMSSW_FILES'] = file_summary_map
			except Exception:
				raise DatasetError('Could not read CMSSW file infos for job %d!' % jobnum)
			# Collect infos about CMSSW processing steps
			config_summary_map = {}
			self._process_steps(jobnum, tar, config_summary_map, file_summary_map)
			for cfg in config_summary_map:
				job_hash_list = metadata_dict.setdefault('CMSSW_CONFIG_JOBHASH', [])
				job_hash_list.append(config_summary_map[cfg]['CMSSW_CONFIG_HASH'])
			obj_dict.update({'CMSSW_CONFIG': config_summary_map, 'CMSSW_FILES': file_summary_map})
			tar.close()
		yield (item, metadata_dict, entries, location_list, obj_dict)
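For reference, the tar's 'files' member parsed above is assumed to contain one whitespace-separated line per transferred file: CRC32 hash, size in bytes, and the output file name. A hedged sketch of that parsing step with an illustrative line:

    rawdata = 'deadbeef 1048576 out_1.root'.split()
    file_summary = {'SE_OUTPUT_HASH_CRC32': rawdata[0], 'SE_OUTPUT_SIZE': int(rawdata[1])}
    # rawdata[2] ('out_1.root') becomes the key in file_summary_map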
Example #7
def create_dbs3_json_blocks(opts, dataset_blocks):
    dbs3_proto_block_iter = create_dbs3_proto_blocks(opts, dataset_blocks)
    for (block, block_dump, block_size, dataset_type) in dbs3_proto_block_iter:
        dataset = block[DataProvider.Dataset]
        try:
            primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
        except Exception:
            raise DatasetError('Dataset name %s is not a valid DBS name!' %
                               dataset)

        # add primary dataset information
        block_dump['primds'] = {
            'primary_ds_type': dataset_type,
            'primary_ds_name': primary_dataset
        }

        # add dataset information
        block_dump['dataset'] = {
            'dataset': dataset,
            'processed_ds_name': processed_dataset,
            'data_tier_name': data_tier,
            'physics_group_name': None,
            'dataset_access_type': 'VALID',
            'xtcrosssection': None,  # TODO: Add to metadata from FrameWorkJobReport, if possible!
        }

        # add block information
        site_db = CRIC()
        try:
            origin_site_name = site_db.se_to_cms_name(
                block[DataProvider.Locations][0])[0]
        except IndexError:
            clear_current_exception()
            origin_site_name = 'UNKNOWN'

        block_dump['block'] = {
            'block_name': DataProvider.get_block_id(block),
            'block_size': block_size,
            'file_count': len(block[DataProvider.FileList]),
            'origin_site_name': origin_site_name
        }
        if opts.do_close_blocks:
            block_dump['block']['open_for_writing'] = 0
        else:
            block_dump['block']['open_for_writing'] = 1

        # add acquisition_era; the name 'CRAB' is important because of checks within DBS 3
        block_dump['acquisition_era'] = {
            'acquisition_era_name': 'CRAB',
            'start_date': 0
        }
        # add processing_era
        block_dump['processing_era'] = {
            'processing_version': 1,
            'description': 'grid-control'
        }

        yield validate_dbs3_json('blockBulk', block_dump)
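Before validation, block_dump roughly takes the following shape; the values below are illustrative placeholders, and keys filled elsewhere (e.g. the file list from create_dbs3_proto_blocks) are omitted:

    block_dump_sketch = {
        'primds': {'primary_ds_type': 'mc', 'primary_ds_name': 'MyPrimary'},
        'dataset': {'dataset': '/MyPrimary/MyProcessed/USER', 'processed_ds_name': 'MyProcessed',
                    'data_tier_name': 'USER', 'physics_group_name': None,
                    'dataset_access_type': 'VALID', 'xtcrosssection': None},
        'block': {'block_name': '/MyPrimary/MyProcessed/USER#1234', 'block_size': 1024,
                  'file_count': 1, 'origin_site_name': 'UNKNOWN', 'open_for_writing': 1},
        'acquisition_era': {'acquisition_era_name': 'CRAB', 'start_date': 0},
        'processing_era': {'processing_version': 1, 'description': 'grid-control'},
    }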
Example #8
 def getEntries(self, path, metadata, events, seList, objStore):
     jobNum = metadata['GC_JOBNUM']
     tar = tarfile.open(os.path.join(path, 'cmssw.dbs.tar.gz'), 'r')
     # Collect infos about transferred files
     fileSummaryMap = {}
     try:
         for rawdata in imap(str.split,
                             tar.extractfile('files').readlines()):
             fileSummaryMap[rawdata[2]] = {
                 'SE_OUTPUT_HASH_CRC32': rawdata[0],
                 'SE_OUTPUT_SIZE': int(rawdata[1])
             }
         objStore['CMSSW_FILES'] = fileSummaryMap
     except Exception:
         raise DatasetError('Could not read CMSSW file infos for job %d!' %
                            jobNum)
     # Collect infos about CMSSW processing steps
     cfgSummaryMap = {}
     self._processSteps(jobNum, tar, cfgSummaryMap, fileSummaryMap)
     for cfg in cfgSummaryMap:
         metadata.setdefault('CMSSW_CONFIG_JOBHASH', []).append(
             cfgSummaryMap[cfg]['CMSSW_CONFIG_HASH'])
     objStore.update({
         'CMSSW_CONFIG': cfgSummaryMap,
         'CMSSW_FILES': fileSummaryMap
     })
     tar.close()
     yield (path, metadata, events, seList, objStore)
Example #9
 def __init__(self,
              config,
              datasource_name,
              dataset_expr,
              dataset_nick=None):  # pylint:disable=super-init-not-called
     raise DatasetError('CMS deprecated all DBS2 Services in April 2014! ' +
                        'Please use DBS3Provider instead.')
Example #10
def main(opts, args):
	config = get_dataset_config(opts, args)

	provider = config.getPlugin('dataset', cls = DataProvider)
	blocks = provider.getBlocks()
	if len(blocks) == 0:
		raise DatasetError('No blocks!')

	datasets = set(imap(itemgetter(DataProvider.Dataset), blocks))
	if len(datasets) > 1 or opts.info:
		headerbase = [(DataProvider.Dataset, 'Dataset')]
	else:
		print('Dataset: %s' % blocks[0][DataProvider.Dataset])
		headerbase = []

	if opts.list_datasets:
		list_datasets(blocks)
	if opts.list_blocks:
		list_blocks(blocks, headerbase)
	if opts.list_files:
		list_files(datasets, blocks)
	if opts.list_storage:
		list_storage(blocks, headerbase)
	if opts.metadata and not opts.save:
		list_metadata(datasets, blocks)
	if opts.block_metadata and not opts.save:
		list_block_metadata(datasets, blocks)
	if opts.config_entry:
		list_config_entries(opts, blocks, provider)
	if opts.info:
		list_infos(blocks)
	if opts.save:
		save_dataset(opts, provider)
Example #11
    def getGCBlocks(self, usePhedex):
        for datasetPath in self.getDatasets():
            counter = 0
            for (blockPath,
                 listSE) in self.getCMSBlocks(datasetPath,
                                              getSites=not usePhedex):
                result = {}
                result[DataProvider.Dataset] = blockPath.split('#')[0]
                result[DataProvider.BlockName] = blockPath.split('#')[1]

                if usePhedex:  # Start parallel phedex query
                    dictSE = {}
                    tPhedex = start_thread(
                        'Query phedex site info for %s' % blockPath,
                        self._getPhedexSEList, blockPath, dictSE)
                    self.fillCMSFiles(result, blockPath)
                    tPhedex.join()
                    listSE = dictSE.get(blockPath)
                else:
                    self.fillCMSFiles(result, blockPath)
                result[DataProvider.Locations] = listSE

                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result

            if counter == 0:
                raise DatasetError(
                    'Dataset %s does not contain any valid blocks!' %
                    datasetPath)
Example #12
 def _process(self, key, setup, path, metadata):
     if setup is not None:
         (delim, ds, de, mod) = setup
         value = str.join(delim, os.path.basename(path).split(delim)[ds:de])
         try:
             metadata[key] = str(mod(value))
         except Exception:
             raise DatasetError('Unable to modify %s: %r' % (key, value))
Example #13
 def _process(self, item, metadata_dict, key, delim, delim_start, delim_end,
              modifier_fun):
     value = str.join(
         delim,
         os.path.basename(item).split(delim)[delim_start:delim_end])
     try:
         metadata_dict[key] = str(modifier_fun(value))
     except Exception:
         raise DatasetError('Unable to modify %s: %r' % (key, value))
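As a worked illustration of the delimiter-based extraction above (the file name and settings are hypothetical):

    import os
    item = '/store/user/output_data_2017_v3.root'
    delim, delim_start, delim_end = '_', 1, 3
    value = str.join(delim, os.path.basename(item).split(delim)[delim_start:delim_end])
    # value == 'data_2017'; modifier_fun would then post-process this string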
Example #14
 def __init__(self, config, datasource_name):
     InfoScanner.__init__(self, config, datasource_name)
     self._ext_work_dn = config.get_dn('source directory')
     self._ext_output_dir = os.path.join(self._ext_work_dn, 'output')
     if not os.path.isdir(self._ext_output_dir):
         raise DatasetError('Unable to find task output directory %s' %
                            repr(self._ext_output_dir))
     self._selector = JobSelector.create(
         config.get('source job selector', ''))
Example #15
 def __init__(self, config):
     InfoScanner.__init__(self, config)
     self._extWorkDir = config.getPath('source directory',
                                       onChange=triggerDataResync)
     self._extOutputDir = os.path.join(self._extWorkDir, 'output')
     if not os.path.isdir(self._extOutputDir):
         raise DatasetError('Unable to find task output directory %s' %
                            repr(self._extOutputDir))
     self._selector = JobSelector.create(
         config.get('source job selector', '', onChange=triggerDataResync))
Example #16
 def getDatasets(self):
     if self._cache_dataset is None:
         self._cache_dataset = [self._datasetPath]
         if '*' in self._datasetPath:
             self._cache_dataset = list(
                 self.getCMSDatasets(self._datasetPath))
             if not self._cache_dataset:
                 raise DatasetError(
                     'No datasets selected by DBS wildcard %s !' %
                     self._datasetPath)
     return self._cache_dataset
Example #17
 def __init__(self, config):
     InfoScanner.__init__(self, config)
     self._path = config.get('source directory',
                             '.',
                             onChange=triggerDataResync)
     self._recurse = config.getBool('source recurse',
                                    False,
                                    onChange=triggerDataResync)
     if ('://' in self._path) and self._recurse:
         raise DatasetError('Recursion is not supported for URL: %s' %
                            repr(self._path))
     elif '://' not in self._path:
         self._path = utils.cleanPath(self._path)
Example #18
 def get_dataset_name_list(self):
     if self._cache_dataset is None:
         self._cache_dataset = [self._dataset_path]
         if '*' in self._dataset_path:
             activity = Activity('Getting dataset list for %s' %
                                 self._dataset_path)
             self._cache_dataset = list(
                 self._get_cms_dataset_list(self._dataset_path))
             if not self._cache_dataset:
                 raise DatasetError(
                     'No datasets selected by DBS wildcard %s !' %
                     self._dataset_path)
             activity.finish()
     return self._cache_dataset
Example #19
    def _init_reader(self):
        # look for aborted inits / resyncs - and try to restore old state if possible
        if self._exists_data_path('map.tar.resync') and self._exists_data_path(
                'cache.dat.resync'):
            rename_file(self._get_data_path('cache.dat.resync'),
                        self._get_data_path('cache.dat'))
            rename_file(self._get_data_path('map.tar.resync'),
                        self._get_data_path('map.tar'))
        elif self._exists_data_path(
                'map.tar.resync') or self._exists_data_path(
                    'cache.dat.resync'):
            raise DatasetError(
                'Found broken dataset partition resync state in work directory'
            )

        if self._exists_data_path(
                'map.tar') and not self._exists_data_path('cache.dat'):
            raise DatasetError(
                'Found broken dataset partition in work directory')
        elif not self._exists_data_path('map.tar'):
            # create initial partition map file
            if not self._exists_data_path('cache.dat'):
                provider = self._provider
            else:
                provider = DataProvider.load_from_file(
                    self._get_data_path('cache.dat'))
            block_iter = DataProvider.save_to_file_iter(
                self._get_data_path('cache.dat.init'),
                provider.get_block_list_cached(show_stats=True))
            partition_iter = self._splitter.split_partitions(block_iter)
            DataSplitter.save_partitions(self._get_data_path('map.tar.init'),
                                         partition_iter)
            rename_file(self._get_data_path('cache.dat.init'),
                        self._get_data_path('cache.dat'))
            rename_file(self._get_data_path('map.tar.init'),
                        self._get_data_path('map.tar'))
        return DataSplitter.load_partitions(self._get_data_path('map.tar'))
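The '.init' and '.resync' suffixes above implement a write-then-rename scheme, so an interrupted init or resync never leaves a half-written cache or partition map in place. A minimal standalone sketch of the same pattern (function and file names are hypothetical):

    import os

    def write_state_atomically(path, payload):
        tmp_path = path + '.init'
        with open(tmp_path, 'w') as fp:
            fp.write(payload)  # write the new state to a temporary file first
        os.rename(tmp_path, path)  # publish it only once the write has completed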
Example #20
	def _process_steps(self, jobnum, tar, config_summary_map, file_summary_map):
		cmssw_version = bytes2str(tar.extractfile('version').read()).strip()
		for cfg in ifilter(lambda x: ('/' not in x) and (x not in ['version', 'files']), tar.getnames()):
			try:
				(config_summary, config_report, events_read) = self._process_config(tar, cfg)
				config_summary['CMSSW_VERSION'] = cmssw_version
				config_summary_map[cfg] = config_summary
			except Exception:
				raise DatasetError('Could not read config infos about %s in job %d' % (cfg, jobnum))

			for output_file_node in config_report.getElementsByTagName('File'):
				(file_summary, pfn) = self._process_output_file(config_report, output_file_node)
				file_summary['CMSSW_EVENTS_READ'] = events_read
				file_summary['CMSSW_CONFIG_FILE'] = cfg
				file_summary_map.setdefault(pfn, {}).update(file_summary)
Example #21
 def getCMSBlocks(self, datasetPath, getSites):
     iter_blockname_selist = self.getCMSBlocksImpl(datasetPath, getSites)
     n_blocks = 0
     selected_blocks = False
     for (blockname, selist) in iter_blockname_selist:
         n_blocks += 1
         if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
             continue
         selected_blocks = True
         yield (blockname, selist)
     if (n_blocks > 0) and not selected_blocks:
         raise DatasetError(
             'Dataset %r contains %d blocks, but none were selected by %r' %
             (datasetPath, n_blocks, self._datasetBlock))
Example #22
    def _get_gc_block_list(self, use_phedex):
        dataset_name_list = self.get_dataset_name_list()
        progress_ds = ProgressActivity('Getting dataset',
                                       len(dataset_name_list))
        for dataset_idx, dataset_path in enumerate(dataset_name_list):
            progress_ds.update_progress(dataset_idx,
                                        msg='Getting dataset %s' %
                                        dataset_path)
            counter = 0
            blockinfo_list = list(
                self._filter_cms_blockinfo_list(dataset_path, not use_phedex))
            progress_block = ProgressActivity('Getting block information',
                                              len(blockinfo_list))
            for (block_path, replica_infos) in blockinfo_list:
                result = {}
                result[DataProvider.Dataset] = block_path.split('#')[0]
                result[DataProvider.BlockName] = block_path.split('#')[1]
                progress_block.update_progress(
                    counter,
                    msg='Getting block information for ' +
                    result[DataProvider.BlockName])

                if use_phedex and self._allow_phedex:  # Start parallel phedex query
                    replicas_dict = {}
                    phedex_thread = start_thread(
                        'Query phedex site info for %s' % block_path,
                        self._get_phedex_replica_list, block_path,
                        replicas_dict)
                    self._fill_cms_fi_list(result, block_path)
                    phedex_thread.join()
                    replica_infos = replicas_dict.get(block_path)
                else:
                    self._fill_cms_fi_list(result, block_path)
                result[DataProvider.Locations] = self._process_replica_list(
                    block_path, replica_infos)

                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result
            progress_block.finish()

            if counter == 0:
                raise DatasetError(
                    'Dataset %s does not contain any valid blocks!' %
                    dataset_path)
        progress_ds.finish()
Example #23
 def _filter_cms_blockinfo_list(self, dataset_path, do_query_sites):
     iter_dataset_block_name_selist = self._iter_cms_blocks(
         dataset_path, do_query_sites)
     n_blocks = 0
     selected_blocks = False
     for (dataset_block_name, selist) in iter_dataset_block_name_selist:
         n_blocks += 1
         block_name = str.split(dataset_block_name, '#')[1]
         if (self._dataset_block_selector != 'all') and (block_name != self._dataset_block_selector):
             continue
         selected_blocks = True
         yield (dataset_block_name, selist)
     if (n_blocks > 0) and not selected_blocks:
         raise DatasetError(
             'Dataset %r contains %d blocks, but none were selected by %r' %
             (dataset_path, n_blocks, self._dataset_block_selector))
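The '#' split above relies on the convention that a full block name is the dataset path followed by '#' and a block identifier, e.g. (hypothetical name):

    dataset_block_name = '/MyPrimary/MyProcessed/USER#0123abcd'
    block_name = str.split(dataset_block_name, '#')[1]  # -> '0123abcd'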
Example #24
 def _resync_partitions(self, path, block_list_old, block_list_new):
     partition_resync_handler = self._splitter.get_resync_handler()
     progress = ProgressActivity(
         progress_max=self.get_parameter_len(),
         msg='Writing resynchronized dataset partitions (progress is estimated)')
     path_tmp = path + '.tmp'
     try:
         resync_result = partition_resync_handler.resync(
             self._splitter, self._reader, block_list_old, block_list_new)
         DataSplitter.save_partitions(path_tmp,
                                      resync_result.partition_iter,
                                      progress)
     except Exception:
         raise DatasetError('Unable to resync %r' %
                            self.get_datasource_name())
     os.rename(path_tmp, path)
     return (resync_result.pnum_list_redo, resync_result.pnum_list_disable)
Example #25
    def _processSteps(self, jobNum, tar, cfgSummaryMap, fileSummaryMap):
        cmsswVersion = tar.extractfile('version').read().strip()
        for cfg in ifilter(
                lambda x: ('/' not in x) and (x not in ['version', 'files']),
                tar.getnames()):
            try:
                (cfgSummary, cfgReport, evRead) = self._processCfg(tar, cfg)
                cfgSummary['CMSSW_VERSION'] = cmsswVersion
                cfgSummaryMap[cfg] = cfgSummary
            except Exception:
                raise DatasetError(
                    'Could not read config infos about %s in job %d' %
                    (cfg, jobNum))

            for outputFile in cfgReport.getElementsByTagName('File'):
                (fileSummary,
                 pfn) = self._processOutputFile(cfgReport, outputFile)
                fileSummary['CMSSW_EVENTS_READ'] = evRead
                fileSummary['CMSSW_CONFIG_FILE'] = cfg
                fileSummaryMap.setdefault(pfn, {}).update(fileSummary)
Example #26
    def __init__(self,
                 dataDir,
                 srcName,
                 dataProvider,
                 dataSplitter,
                 dataProc,
                 repository,
                 keepOld=True):
        LimitedResyncParameterSource.__init__(self)
        (self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
         (dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
        repository['dataset:%s' % srcName] = self
        self.resyncSetup(interval=-1)

        if not dataProvider:  # debug mode - used by scripts - disables resync
            self._maxN = self._data_splitter.getMaxJobs()
            return

        # look for aborted resyncs - and try to restore old state if possible
        if self._existsDataPath('cache.dat.resync') and self._existsDataPath(
                'map.tar.resync'):
            utils.renameFile(self._getDataPath('cache.dat.resync'),
                             self._getDataPath('cache.dat'))
            utils.renameFile(self._getDataPath('map.tar.resync'),
                             self._getDataPath('map.tar'))
        elif self._existsDataPath('cache.dat.resync') or self._existsDataPath(
                'map.tar.resync'):
            raise DatasetError('Found broken resync state')

        if self._existsDataPath('cache.dat') and self._existsDataPath(
                'map.tar'):
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))
        else:
            DataProvider.saveToFile(
                self._getDataPath('cache.dat'),
                self._data_provider.getBlocks(show_stats=False))
            self._data_splitter.splitDataset(
                self._getDataPath('map.tar'),
                self._data_provider.getBlocks(show_stats=False))

        self._maxN = self._data_splitter.getMaxJobs()
Example #27
    def processBlock(self, block):
        if self._lumi_filter.empty() and (
            (self._lumi_keep == LumiKeep.RunLumi) or
            (DataProvider.Metadata not in block)):
            return block

        def getMetadataIdx(key):
            if key in block[DataProvider.Metadata]:
                return block[DataProvider.Metadata].index(key)

        idxRuns = getMetadataIdx('Runs')
        idxLumi = getMetadataIdx('Lumi')
        if not self._lumi_filter.empty():
            lumi_filter = self._lumi_filter.lookup(
                block[DataProvider.Nickname], is_selector=False)
            if lumi_filter and ((idxRuns is None) or
                                (idxLumi is None)) and self._lumi_strict:
                fqName = block[DataProvider.Dataset]
                if block[DataProvider.BlockName] != '0':
                    fqName += '#' + block[DataProvider.BlockName]
                raise DatasetError(
                    'Strict lumi filter active but dataset %s does not provide lumi information!'
                    % fqName)

        block[DataProvider.FileList] = list(
            self._processFI(block, idxRuns, idxLumi))
        if not block[DataProvider.FileList]:
            return
        block[DataProvider.NEntries] = sum(
            imap(lambda fi: fi[DataProvider.NEntries],
                 block[DataProvider.FileList]))
        if self._lumi_keep == LumiKeep.RunLumi:
            return block
        elif self._lumi_keep == LumiKeep.Run:
            if idxLumi is not None:
                block[DataProvider.Metadata].pop(idxLumi)
            return block
        removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
        return block
Example #28
 def __init__(self, config, datasetExpr, datasetNick, datasetID=0):
     raise DatasetError(
         'CMS deprecated all DBS2 Services in April 2014! Please use DBS3Provider instead.'
     )