def tmdbInjectBlock(dbsUrl, datasetPath, blockName, phedexConfig,
                    workingDir="/tmp", nodes=None, storageElements=None):
    """
    _tmdbInjectBlock_

    Util Method for injecting a fileblock into TMDB

    The PhEDEx drop XML built by makePhEDExDrop for the block is written
    to "<workingDir>/<block name>-PhEDExDrop.xml" and that file is handed
    to tmdbInject.

    - *dbsUrl* : DBS URL given to makePhEDExDrop and DBSReader
    - *datasetPath* : dataset path given to makePhEDExDrop
    - *blockName* : name of the file block to inject
    - *phedexConfig* : passed through to tmdbInject
    - *workingDir* : directory the drop file is written to
    - *nodes* : passed through to tmdbInject
    - *storageElements* : locations of the block; when empty or None
      they are looked up with DBSReader.listFileBlockLocation

    """
    # "/" and "#" from the block name are kept out of the file name
    fileName = blockName.replace("/", "_").replace("#", "")
    dropXML = "%s/%s-PhEDExDrop.xml" % (workingDir, fileName)

    xmlContent = makePhEDExDrop(dbsUrl, datasetPath, blockName)
    handle = open(dropXML, 'w')
    try:
        handle.write(xmlContent)
    finally:
        # close the file even when the write fails
        handle.close()

    if not storageElements:
        # the reader is only needed when the caller gave no locations
        reader = DBSReader(dbsUrl)
        storageElements = reader.listFileBlockLocation(blockName)

    tmdbInject(phedexConfig, dropXML, nodes, *storageElements)
    return
def __call__(self):
    """
    _operator()_

    Load PU dataset information from DBS

    For every file block of self.dataset this records the block
    locations in self.blockSites, adds the block to the per-location
    sets in self.sites and stores the block's LFNs under self[block].

    """
    reader = DBSReader(self.dbsUrl)
    blocks = reader.listFileBlocks(self.dataset, False)

    for block in blocks:
        #  //
        # // Populate locations
        #//
        locations = reader.listFileBlockLocation(block)
        if locations:
            self.blockSites[block] = locations
            for location in locations:
                # "in" replaces the deprecated dict.has_key
                if location not in self.sites:
                    self.sites[location] = set()
                self.sites[location].add(block)
        #  //
        # // Populate File list for block
        #//
        # NOTE(review): done for every block, located or not - confirm
        # against the original indentation
        self[block] = reader.lfnsInBlock(block)

    return
def tmdbInjectBlock(dbsUrl, datasetPath, blockName, phedexConfig,
                    workingDir="/tmp", nodes=None, storageElements=None):
    """
    _tmdbInjectBlock_

    Util Method for injecting a fileblock into TMDB

    Writes the drop XML returned by makePhEDExDrop into *workingDir*
    and calls tmdbInject with that file, *nodes* and the storage
    elements of the block.

    - *dbsUrl* : DBS URL given to makePhEDExDrop and DBSReader
    - *datasetPath* : dataset path given to makePhEDExDrop
    - *blockName* : name of the file block to inject
    - *phedexConfig* : passed through to tmdbInject
    - *workingDir* : directory the drop file is written to
    - *nodes* : passed through to tmdbInject
    - *storageElements* : locations of the block; looked up through
      DBSReader.listFileBlockLocation when empty or None

    """
    # build a file name out of the block name without "/" and "#"
    fileName = blockName.replace("/", "_")
    fileName = fileName.replace("#", "")
    dropXML = "%s/%s-PhEDExDrop.xml" % (workingDir, fileName)

    xmlContent = makePhEDExDrop(dbsUrl, datasetPath, blockName)
    handle = open(dropXML, 'w')
    try:
        handle.write(xmlContent)
    finally:
        # do not leak the descriptor when writing raises
        handle.close()

    if not storageElements:
        # only contact DBS when no locations were supplied
        storageElements = DBSReader(dbsUrl).listFileBlockLocation(blockName)

    tmdbInject(phedexConfig, dropXML, nodes, *storageElements)
    return
def checkPublication(self): """ check dataset publication in a dbs """ common.logger.info('--->>> Check data publication: dataset '+self.dataset_to_check+' in DBS url '+ self.DBSURL+'\n') # // # // Get API to DBS #// dbsreader = DBSReader(self.DBSURL) # // # // Get list of datasets #// if len(self.dataset_to_check.split('/')) < 4: msg = "the provided dataset name is not correct" raise CrabException(msg) else: primds=self.dataset_to_check.split('/')[1] procds=self.dataset_to_check.split('/')[2] tier=self.dataset_to_check.split('/')[3] datasets=dbsreader.matchProcessedDatasets(primds,tier,procds) if common.debugLevel: print "PrimaryDataset = ", primds print "ProcessedDataset = ", procds print "DataTier = ", tier print "datasets matching your requirements= ", datasets for dataset in datasets: # // # // Get list of blocks for the dataset and their location #// if len(dataset.get('PathList'))==0: print "===== Empty dataset yet /%s/%s with tiers %s"%(dataset.get('PrimaryDataset')['Name'],dataset.get('Name'),dataset.get('TierList')) else: for datasetpath in dataset.get('PathList'): nevttot=0 print "=== dataset %s"%datasetpath ### FEDE ####### if dataset['Description'] != None: print "=== dataset description = ", dataset['Description'] ################ blocks=dbsreader.getFileBlocksInfo(datasetpath) for block in blocks: SEList=dbsreader.listFileBlockLocation(block['Name']) # replace that with DLS query print "===== File block name: %s" %block['Name'] print " File block located at: ", SEList print " File block status: %s" %block['OpenForWriting'] print " Number of files: %s"%block['NumberOfFiles'] print " Number of Bytes: %s"%block['BlockSize'] print " Number of Events: %s"%block['NumberOfEvents'] if common.debugLevel: print "--------- info about files --------" print " Size \t Events \t LFN \t FileStatus " files=dbsreader.listFilesInBlock(block['Name']) for file in files: print "%s %s %s %s"%(file['FileSize'],file['NumberOfEvents'],file['LogicalFileName'],file['Status']) 
nevttot = nevttot + block['NumberOfEvents'] print "\n total events: %s in dataset: %s\n"%(nevttot,datasetpath) if not common.debugLevel: common.logger.info('You can obtain more info about files of the dataset using: crab -checkPublication -USER.dataset_to_check='+self.dataset_to_check+' -USER.dbs_url_for_publication='+self.DBSURL+' -debug')
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    The files of self.inputDataset() are read from the DBS at
    self.dbsUrl, tagged with the locations of their block and split
    with the 'FileBased' algorithm, self.splitSize files per job.

    Returns a list of JobDefinition, one per job produced by the
    splitter.
    """
    #  //
    # // Now create the job definitions
    #//
    logging.debug("SplitSize = %s" % self.splitSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    thefiles = Fileset(name='FilesToSplit')
    reader = DBSReader(self.dbsUrl)
    fileList = reader.dbs.listFiles(
        analysisDataset=self.inputDataset(),
        retriveList=['retrive_block', 'retrive_run'])

    # block name -> locations, so each block is looked up only once
    blocks = {}

    for f in fileList:
        block = f['Block']['Name']
        if block not in blocks:
            blocks[block] = reader.listFileBlockLocation(block)
        f['Block']['StorageElementList'].extend(blocks[block])
        wmbsFile = File(f['LogicalFileName'])
        # plain loop: the adds are wanted for their side effect only
        for location in blocks[block]:
            wmbsFile['locations'].add(location)
        wmbsFile['block'] = block
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(
        fileset=thefiles,
        workflow=work,
        split_algo='FileBased',
        type="Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)

    jobs = jobfactory(files_per_job=self.splitSize)

    jobDefs = []
    for job in jobs.jobs:
        #job.mask.setMaxAndSkipEvents(-1, 0)
        jobDef = JobDefinition()
        jobDef['LFNS'].extend(job.listLFNs())
        jobDef['SkipEvents'] = 0
        jobDef['MaxEvents'] = -1
        for jobFile in job.listFiles():
            jobDef['SENames'].extend(list(jobFile['locations']))
        jobDefs.append(jobDef)

    return jobDefs
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    Every file listed by DBS for self.inputDataset() becomes a File in
    one Fileset, carrying the locations of its block; the Fileset is
    then split 'FileBased' into jobs of self.splitSize files.

    Returns the list of JobDefinition built from those jobs.
    """
    #  //
    # // Now create the job definitions
    #//
    logging.debug("SplitSize = %s" % self.splitSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    thefiles = Fileset(name='FilesToSplit')
    reader = DBSReader(self.dbsUrl)
    fileList = reader.dbs.listFiles(
        analysisDataset=self.inputDataset(),
        retriveList=['retrive_block', 'retrive_run'])

    # cache of block locations keyed by block name
    blocks = {}

    for f in fileList:
        block = f['Block']['Name']
        # "in" replaces the deprecated dict.has_key
        if block not in blocks:
            blocks[block] = reader.listFileBlockLocation(block)
        blockLocations = blocks[block]
        f['Block']['StorageElementList'].extend(blockLocations)
        wmbsFile = File(f['LogicalFileName'])
        for location in blockLocations:
            wmbsFile['locations'].add(location)
        wmbsFile['block'] = block
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(fileset=thefiles,
                        workflow=work,
                        split_algo='FileBased',
                        type="Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)

    jobs = jobfactory(files_per_job=self.splitSize)

    jobDefs = []
    for job in jobs.jobs:
        #job.mask.setMaxAndSkipEvents(-1, 0)
        jobDef = JobDefinition()
        jobDef['LFNS'].extend(job.listLFNs())
        jobDef['SkipEvents'] = 0
        jobDef['MaxEvents'] = -1
        # explicit loop instead of a comprehension built for side effects
        for jobFile in job.listFiles():
            jobDef['SENames'].extend(list(jobFile['locations']))
        jobDefs.append(jobDef)

    return jobDefs
def createJobSplitter(dataset, dbsUrl, onlyClosedBlocks=False,
                      siteWhitelist=[], blockWhitelist=[],
                      withParents=False):
    """
    _createJobSplitter_

    Instantiate a JobSplitter instance for the dataset provided
    and populate it with details from DBS.

    - *dataset* : dataset whose file blocks are loaded
    - *dbsUrl* : URL of the DBS instance to read from
    - *onlyClosedBlocks* : passed to DBSReader.listFileBlocks
    - *siteWhitelist* : when not empty, blocks with no location in it
      are dropped and the others keep only the matching locations
    - *blockWhitelist* : when not empty, only these blocks are kept
    - *withParents* : when true, the parent LFNs of every file are
      read and handed to the block as well

    The list defaults are only read, never modified.

    Returns the populated JobSplitter.
    """
    reader = DBSReader(dbsUrl)
    result = JobSplitter(dataset)
    filterSites = len(siteWhitelist) > 0
    filterBlocks = len(blockWhitelist) > 0

    for blockName in reader.listFileBlocks(dataset, onlyClosedBlocks):
        if filterBlocks and blockName not in blockWhitelist:
            msg = "Excluding block %s based on block whitelist: %s\n" % (
                blockName, blockWhitelist)
            logging.debug(msg)
            continue

        # queried only for blocks that survived the block whitelist
        locations = reader.listFileBlockLocation(blockName)

        if filterSites:
            siteMatches = [site for site in siteWhitelist
                           if site in locations]
            if len(siteMatches) == 0:
                msg = "Excluding block %s based on sites: %s \n" % (
                    blockName, locations)
                logging.debug(msg)
                continue
            locations = siteMatches

        newBlock = result.newFileblock(blockName, *locations)

        # plain truth test, so that this choice and the per-file one
        # below can never disagree
        if withParents:
            blockData = reader.getFileBlockWithParents(blockName)[blockName]
        else:
            blockData = reader.getFileBlock(blockName)[blockName]

        totalEvents = 0
        fileList = set()
        for fileInfo in blockData["Files"]:
            totalEvents += fileInfo["NumberOfEvents"]
            fileList.add(fileInfo["LogicalFileName"])
            if withParents:
                parList = [x["LogicalFileName"]
                           for x in fileInfo["ParentList"]]
                newBlock.addFile(fileInfo["LogicalFileName"],
                                 fileInfo["NumberOfEvents"], parList)
            else:
                newBlock.addFile(fileInfo["LogicalFileName"],
                                 fileInfo["NumberOfEvents"])

        logging.debug("Block %s contains %s events in %s files" % (
            blockName, totalEvents, len(fileList)))

    return result
def loadSites(self, **dbsContacts): """ Get the list of sites hosting the PU from DBS/DLS """ dbsUrl = dbsContacts.get('DBSURL', None) if dbsUrl == None: dbsUrl = getLocalDBSURL() reader = DBSReader(dbsUrl) locations = [] blocks = reader.listFileBlocks(self.dataset, True) for block in blocks: try: locations = reader.listFileBlockLocation(block) except Exception, ex: msg = "Unable to find DLS Locations for Block: %s\n" % block msg += str(ex) logging.warning(msg) continue
def importDatasetWithExistingParents(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed = True): """ _importDataset_ Import a dataset into the local scope DBS. It complains if the parent dataset ar not there!! - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if not str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block,sename) logging.info(sename) continue try: xferData = reader.dbs.listDatasetContents( sourceDatasetPath, block ) except DbsException, ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not read content of dataset:\n ==> %s\n" % ( sourceDatasetPath,) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) try: self.dbs.insertDatasetContents(xferData) except DbsException, ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath,) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg)
def makeBlockList(self, onlyClosedBlocks = False, sites=None,
                  providedOnlyBlocks=None):
    """
    _makeBlockList_

    Generate the list of blocks for the workflow.

    1. Get the list of all blocks from the DBS
    2. Compare to list of blocks in persistency file
    3. Obtain the intersection of the new blocks and the
       providedOnlyBlocks list.
    4. Set OnlyBlocks parameter to intersection obtained.

    - *onlyClosedBlocks* : passed to DBSReader.listFileBlocks
    - *sites* : when not None, only blocks with a location in it
      are kept
    - *providedOnlyBlocks* : comma separated block names, or None to
      take every new block

    Raises RuntimeError when no new block is left after the site
    filter, or when none of the provided blocks is a new block.
    """
    #reader = DBSReader(self.dbsUrl)
    # At this point, blocks should be in local DBS
    localDBS = getLocalDBSURL()
    reader = DBSReader(localDBS)
    dbsBlocks = reader.listFileBlocks(self.inputDataset(), onlyClosedBlocks)

    if self.persistData.blocks != []:
        newBlocks = [x for x in dbsBlocks
                     if x not in self.persistData.blocks]
    else:
        newBlocks = dbsBlocks

    #  //
    # // Skipping blocks without site info
    #//
    msg = "Filtering blocks according to Site information..."
    logging.info(msg)
    blocksAtSites = []
    for block in newBlocks:
        locations = reader.listFileBlockLocation(block)
        if not locations:
            msg = "\nSkipping block: "
            msg += "No site info available for block %s " % block
            logging.info(msg)
            continue
        if sites is not None:
            atWantedSites = [x for x in locations if x in sites]
            if not atWantedSites:
                msg = "\nSkipping block: "
                msg += "Block %s has no replicas in %s" % (
                    block, ", ".join(sites))
                logging.info(msg)
                continue
        blocksAtSites.append(block)
    newBlocks = blocksAtSites

    if len(newBlocks) == 0:
        msg = "No New Blocks found for dataset\n"
        raise RuntimeError(msg)

    #  //
    # // Check presence of provided Blocks in newBlocks
    #//
    blocksToProcess = []
    if providedOnlyBlocks is not None:
        providedOnlyBlocksList = providedOnlyBlocks.split(',')
        msg = "OnlyBlocks setting provided. Processing it..."
        logging.info(msg)
        msg = "OnlyBlocks list contains %s Blocks." % (
            len(providedOnlyBlocksList))
        logging.info(msg)
        blockCount = 1
        for block in providedOnlyBlocksList:
            if block.strip() in newBlocks:
                blocksToProcess.append(block.strip())
                msg = "Block %s: Adding Block %s" % (
                    blockCount, block)
                msg += " to the Whitelist"
                logging.info(msg)
            else:
                msg = "Block %s: Skipping Block %s " % (
                    blockCount, block)
                msg += "It's no New or it has been processed"
                msg += " already."
                logging.info(msg)
            blockCount += 1
    else:
        blocksToProcess = newBlocks
        msg = "OnlyBlocks setting not provided. Processing"
        msg += " all New Blocks for Dataset\n"
        logging.info(msg)

    if len(blocksToProcess) == 0:
        msg = "OnlyBlocks list does not match any New Blocks"
        msg += " found for Dataset\n"
        raise RuntimeError(msg)

    # same "a, b, c" text the str(list)-and-replace code produced for
    # plain strings, without leaking the u prefix of unicode names
    blockList = ", ".join(blocksToProcess)
    self.workflow.parameters['OnlyBlocks'] = blockList

    self.persistData.blocks.extend(blocksToProcess)
    return
def createJobSplitter(dataset, dbsUrl, onlyClosedBlocks=False,
                      siteWhitelist=[], blockWhitelist=[],
                      withParents=False):
    """
    _createJobSplitter_

    Instantiate a JobSplitter instance for the dataset provided
    and populate it with details from DBS.

    - *dataset* : dataset whose file blocks are loaded
    - *dbsUrl* : URL of the DBS instance to read from
    - *onlyClosedBlocks* : passed to DBSReader.listFileBlocks
    - *siteWhitelist* : when not empty, a block needs a location in
      this list and is registered with the matching locations only
    - *blockWhitelist* : when not empty, blocks outside it are skipped
    - *withParents* : when true, files are added together with the
      LFNs of their parents

    Both whitelists are read only; their list defaults are not
    modified.

    Returns the populated JobSplitter.
    """
    reader = DBSReader(dbsUrl)
    result = JobSplitter(dataset)
    filterSites = len(siteWhitelist) > 0
    filterBlocks = len(blockWhitelist) > 0

    for blockName in reader.listFileBlocks(dataset, onlyClosedBlocks):
        if filterBlocks:
            if blockName not in blockWhitelist:
                msg = "Excluding block %s based on block whitelist: %s\n" % (
                    blockName, blockWhitelist)
                logging.debug(msg)
                continue

        # fetched after the block whitelist so that excluded blocks do
        # not cost a location lookup
        locations = reader.listFileBlockLocation(blockName)

        if filterSites:
            siteMatches = [x for x in siteWhitelist if x in locations]
            if len(siteMatches) == 0:
                msg = "Excluding block %s based on sites: %s \n" % (
                    blockName,
                    locations,
                )
                logging.debug(msg)
                continue
            else:
                locations = siteMatches

        newBlock = result.newFileblock(blockName, *locations)

        # truth test (not "== True") to agree with the test used for
        # every file further down
        if withParents:
            blockData = reader.getFileBlockWithParents(blockName)[blockName]
        else:
            blockData = reader.getFileBlock(blockName)[blockName]

        totalEvents = 0
        fileList = set()
        for fileInfo in blockData['Files']:
            totalEvents += fileInfo['NumberOfEvents']
            fileList.add(fileInfo['LogicalFileName'])
            if withParents:
                parList = [
                    x['LogicalFileName'] for x in fileInfo['ParentList']
                ]
                newBlock.addFile(fileInfo['LogicalFileName'],
                                 fileInfo['NumberOfEvents'], parList)
            else:
                newBlock.addFile(fileInfo['LogicalFileName'],
                                 fileInfo['NumberOfEvents'])

        logging.debug("Block %s contains %s events in %s files" % (
            blockName,
            totalEvents,
            len(fileList),
        ))

    return result
def makeFileList(self, onlyClosedBlocks = False, sites=None,
                 providedOnlyBlocks=None, providedOnlyFiles=None):
    """
    _makeFileList_

    Generate the list of files (and of the blocks holding them) for
    the workflow.

    1. Get the list of all blocks and files from the local DBS
    2. Select the files: the providedOnlyFiles list, the list from
       createOnlyFilesFromWorkflow when providedOnlyFiles is 'auto',
       the files of providedOnlyBlocks, or the whole dataset
    3. Drop the files already known to the persistency data
    4. Drop the blocks without site information, or without a replica
       in *sites* when that is given
    5. Set the OnlyBlocks and OnlyFiles parameters and update the
       persistency data

    Raises RuntimeError whenever a step leaves nothing to process.
    """
    #reader = DBSReader(self.dbsUrl)
    # At this point, blocks should be in local DBS
    localDBS = getLocalDBSURL()
    reader = DBSReader(localDBS)

    #  //
    # // Querying list of blocks from DBS
    #//
    msg = "Querying for closed blocks in Local DBS: %s ..." % localDBS
    logging.info(msg)
    dbsBlocks = reader.listFileBlocks(self.inputDataset(),
                                      onlyClosedBlocks)
    msg = "Retrieved %s close blocks from Local DBS" % len(dbsBlocks)
    logging.info(msg)

    #  //
    # // Constructing mapping structures block-file
    #//
    filesToBlocks = {}
    blocksToFiles = {}
    dbsFiles = reader.dbs.listFiles(path=self.inputDataset())
    for dbsfile in dbsFiles:
        blockName = dbsfile['Block']['Name']
        if blockName in dbsBlocks:
            filesToBlocks[dbsfile['LogicalFileName']] = blockName
            blocksToFiles.setdefault(blockName, []).append(
                dbsfile['LogicalFileName'])

    # OnlyFiles?
    if providedOnlyFiles is not None and \
            providedOnlyFiles.strip().lower() != 'auto':
        msg = "Using OnlyFiles list:"
        msg += " %s files." % len(providedOnlyFiles.split(','))
        logging.info(msg)
        onlyFiles = [x.strip() for x in providedOnlyFiles.split(',') if x]
    # OnlyFiles=auto
    elif providedOnlyFiles is not None:
        msg = "Automatically generating OnlyFiles list from DBS..."
        logging.info(msg)
        onlyFiles = self.createOnlyFilesFromWorkflow()
    # OnlyBlocks
    elif providedOnlyBlocks is not None:
        msg = "Using OnlyBLocks list:"
        msg += " %s blocks." % len(providedOnlyBlocks.split(','))
        logging.info(msg)
        onlyFiles = []
        for block in \
                [x.strip() for x in providedOnlyBlocks.split(',') if x]:
            # keyed by the block; indexing with the dbsBlocks list
            # itself raised TypeError.  Blocks without files in the
            # map contribute nothing.
            onlyFiles.extend(blocksToFiles.get(block, []))
    # Processing everything in DBS
    else:
        msg = "Processing whole input dataset..."
        logging.info(msg)
        onlyFiles = []
        for block in dbsBlocks:
            onlyFiles.extend(blocksToFiles.get(block, []))

    if not onlyFiles:
        msg = "No files were found for the input dataset: " + \
            self.inputDataset()
        raise RuntimeError(msg)

    #  //
    # // Filter files that were already processed
    #//
    if self.persistData.blocks:
        msg = "Filtering files that were already processed for this"
        msg += " workflow..."
        logging.info(msg)
        processedFiles = self.persistData.getFiles()
        msg = "Persistency file has %s file(s)" % len(processedFiles)
        logging.info(msg)
        selectedBefore = len(onlyFiles)
        onlyFiles = [x for x in onlyFiles if x not in processedFiles]
        # removed = selected before the filter minus those left
        msg = "%s file(s) were removed" % \
            str(selectedBefore - len(onlyFiles))
        logging.info(msg)

    if not onlyFiles:
        msg = "No New files were found for the input dataset: " + \
            self.inputDataset()
        raise RuntimeError(msg)

    #  //
    # // Filter files in blocks without site info
    #//
    msg = "Filtering blocks according to Site information..."
    logging.info(msg)
    candidateBlocks = {}
    for lfn in onlyFiles:
        candidateBlocks.setdefault(filesToBlocks[lfn], []).append(lfn)
    blocksAtSites = []
    for block in candidateBlocks:
        locations = reader.listFileBlockLocation(block)
        if not locations:
            msg = "Excluding block without site info ==> %s" % block
            logging.info(msg)
            continue
        if sites is not None:
            atWantedSites = [x for x in locations if x in sites]
            if not atWantedSites:
                msg = "Excluding block without replicas"
                # sites first, block after the arrow
                msg += " in %s ==> %s" % (", ".join(sites), block)
                logging.info(msg)
                continue
        blocksAtSites.append(block)
    if len(blocksAtSites) == 0:
        msg = "No block has site information."
        raise RuntimeError(msg)

    #  //
    # // Constructing OnlyBlocks and OnlyFiles list
    #//
    onlyBlocks = {}
    for block in blocksAtSites:
        onlyBlocks[block] = candidateBlocks[block]
    onlyFiles = []
    for block in onlyBlocks:
        onlyFiles.extend(onlyBlocks[block])

    msg = "\n ==> Files to process: %s" % len(onlyFiles)
    msg += "\n ==> Blocks to process: %s" % len(onlyBlocks)
    logging.info(msg)

    blockList = ",".join(onlyBlocks.keys())
    fileList = ",".join(onlyFiles)
    self.workflow.parameters['OnlyBlocks'] = blockList
    self.workflow.parameters['OnlyFiles'] = fileList
    self.persistData.update(onlyBlocks)
    return
def importDatasetWithExistingParents(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True): """ _importDataset_ Import a dataset into the local scope DBS. It complains if the parent dataset ar not there!! - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if not str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str( inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: xferData = reader.dbs.listDatasetContents( sourceDatasetPath, block) except DbsException, ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not read content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) try: self.dbs.insertDatasetContents(xferData) except DbsException, ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg)
# print " matchProcessedDatasets(%s,%s,%s)"%(primds,tier,procds) datasets=dbsreader.matchProcessedDatasets(primds,tier,procds) else: datasets=dbsreader.matchProcessedDatasets("*","*","*") for dataset in datasets: # // # // Get list of blocks for the dataset and their location #// for datasetpath in dataset.get('PathList'): nevttot=0 print "===== dataset %s"%datasetpath blocks=dbsreader.getFileBlocksInfo(datasetpath) for block in blocks: SEList=dbsreader.listFileBlockLocation(block['Name']) # replace that with DLS query print "== File block %s is located at: %s"%(block['Name'],SEList) print "File block name: %s" %block['Name'] print "File block status: %s" %block['OpenForWriting'] print "Number of files: %s"%block['NumberOfFiles'] print "Number of Bytes: %s"%block['BlockSize'] print "Number of Events: %s"%block['NumberOfEvents'] if full: print "--------- info about files --------" print " Size \t Events \t LFN \t FileStatus " files=dbsreader.listFilesInBlock(block['Name']) for file in files: print "%s %s %s %s"%(file['FileSize'],file['NumberOfEvents'],file['LogicalFileName'],file['Status']) nevttot = nevttot + block['NumberOfEvents'] print "\n total events: %s in dataset: %s\n"%(nevttot,datasetpath)
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    Each non-empty block of self.inputDataset() is turned into its own
    Fileset, split with the 'MergeBySize' algorithm (merge_size set to
    self.mergeSize, all_files=True), and every resulting job becomes a
    JobDefinition.

    Returns the list of JobDefinition for all blocks.
    Raises SyntaxError when the splitter yields no job group for a
    block.
    """
    #  //
    # // Now create the job definitions
    #//
    logging.debug("MergeSize = %s" % self.mergeSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    logging.debug("Connection to DBS at: %s" % self.dbsUrl)

    reader = DBSReader(self.dbsUrl)
    blockList = reader.dbs.listBlocks(dataset=self.inputDataset())
    jobDefs = []

    for block in blockList:
        blockName = block['Name']
        logging.debug("Getting files for block %s" % blockName)
        locations = reader.listFileBlockLocation(blockName)
        fileList = reader.dbs.listFiles(blockName=blockName)
        if not fileList:  # Skip empty blocks
            continue

        thefiles = Fileset(name='FilesToSplit')
        for f in fileList:
            f['Block']['StorageElementList'].extend(locations)
            wmbsFile = File(f['LogicalFileName'])
            # plain loop: the adds are wanted for their side effect only
            for location in locations:
                wmbsFile['locations'].add(location)
            wmbsFile['block'] = blockName
            wmbsFile['size'] = f['FileSize']
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset=thefiles,
                            workflow=work,
                            split_algo='MergeBySize',
                            type="Merge")
        logging.debug("Info for Subscription %s" % subs)
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobGroups = jobfactory(
            merge_size=self.mergeSize,  # min in Bytes
            all_files=True  # merge all files
        )
        if not jobGroups:
            # NOTE(review): exception type kept as found; callers may
            # depend on it
            raise SyntaxError

        for jobGroup in jobGroups:
            for job in jobGroup.getJobs():
                jobDef = JobDefinition()
                jobDef['LFNS'].extend(job.getFiles(type='lfn'))
                jobDef['SkipEvents'] = 0
                jobDef['MaxEvents'] = -1
                for jobFile in job.getFiles():
                    jobDef['SENames'].extend(list(jobFile['locations']))
                jobDefs.append(jobDef)

    return jobDefs
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    The blocks of self.inputDataset() are handled one at a time: the
    files of a block are collected in a Fileset together with the
    block locations and merged by size ('MergeBySize', merge_size set
    to self.mergeSize, all_files=True).

    Returns one JobDefinition per job, for all blocks together.
    Raises SyntaxError when a block yields no job group.
    """
    #  //
    # // Now create the job definitions
    #//
    logging.debug("MergeSize = %s" % self.mergeSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    logging.debug("Connection to DBS at: %s" % self.dbsUrl)

    reader = DBSReader(self.dbsUrl)
    blockList = reader.dbs.listBlocks(dataset = self.inputDataset())
    jobDefs = []

    for block in blockList:
        blockName = block['Name']
        logging.debug("Getting files for block %s" % blockName)
        locations = reader.listFileBlockLocation(blockName)
        fileList = reader.dbs.listFiles(blockName = blockName)
        if not fileList: # Skip empty blocks
            continue

        thefiles = Fileset(name='FilesToSplit')
        for f in fileList:
            f['Block']['StorageElementList'].extend(locations)
            wmbsFile = File(f['LogicalFileName'])
            # explicit loop instead of a comprehension built only for
            # its side effects
            for site in locations:
                wmbsFile['locations'].add(site)
            wmbsFile['block'] = blockName
            wmbsFile['size'] = f['FileSize']
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(
            fileset = thefiles,
            workflow = work,
            split_algo = 'MergeBySize',
            type = "Merge")
        logging.debug("Info for Subscription %s" % subs)
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobGroups = jobfactory(
            merge_size=self.mergeSize,                # min in Bytes
            all_files=True                            # merge all files
            )
        if not jobGroups:
            # NOTE(review): SyntaxError is what callers see today; the
            # type is left untouched
            raise SyntaxError

        for jobGroup in jobGroups:
            for job in jobGroup.getJobs():
                jobDef = JobDefinition()
                jobDef['LFNS'].extend(job.getFiles(type='lfn'))
                jobDef['SkipEvents'] = 0
                jobDef['MaxEvents'] = -1
                for mergedFile in job.getFiles():
                    jobDef['SENames'].extend(list(mergedFile['locations']))
                jobDefs.append(jobDef)

    return jobDefs
def importDataset(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed = True, skipNoSiteError=False): """ _importDataset_ Import a dataset into the local scope DBS with full parentage hirerarchy (at least not slow because branches info is dropped). Parents are also imported. This method imports block by block, then each time a block is imported, its parent blocks will be imported first. - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to - *onlyClosed* : Only closed blocks will be imported if set to True - *skipNoSiteError* : If this is True, then this method wont raise an Exception if a block has no site information in sourceDBS. """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) blkCounter=0 for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// blkCounter=blkCounter+1 msg="Importing block %s of %s: %s " % (blkCounter,len(inputBlocks),block) logging.debug(msg) if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": # we don't skip the error raising if not skipNoSiteError: msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) msg = "Block has no locations defined: %s" % block logging.info(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block,sename) logging.info(sename) continue try: self.dbs.dbsMigrateBlock(sourceDBS, targetDBS, block_name=block) 
except DbsException, ex: msg = "Error in DBSWriter.importDataset\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath,) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": # we don't skip the error raising if not skipNoSiteError: msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) msg = "Block has no locations defined: %s" % block logging.info(msg) for sename in locations: self.dbs.addReplicaToBlock(block,sename)
def checkPublication(self): """ check dataset publication in a dbs """ common.logger.info('--->>> Check data publication: dataset ' + self.dataset_to_check + ' in DBS url ' + self.DBSURL + '\n') # // # // Get API to DBS #// dbsreader = DBSReader(self.DBSURL) # // # // Get list of datasets #// if len(self.dataset_to_check.split('/')) < 4: msg = "the provided dataset name is not correct" raise CrabException(msg) else: primds = self.dataset_to_check.split('/')[1] procds = self.dataset_to_check.split('/')[2] tier = self.dataset_to_check.split('/')[3] datasets = dbsreader.matchProcessedDatasets(primds, tier, procds) if common.debugLevel: print "PrimaryDataset = ", primds print "ProcessedDataset = ", procds print "DataTier = ", tier print "datasets matching your requirements= ", datasets for dataset in datasets: # // # // Get list of blocks for the dataset and their location #// if len(dataset.get('PathList')) == 0: print "===== Empty dataset yet /%s/%s with tiers %s" % ( dataset.get('PrimaryDataset')['Name'], dataset.get('Name'), dataset.get('TierList')) else: for datasetpath in dataset.get('PathList'): nevttot = 0 print "=== dataset %s" % datasetpath ### FEDE ####### if dataset['Description'] != None: print "=== dataset description = ", dataset[ 'Description'] ################ blocks = dbsreader.getFileBlocksInfo(datasetpath) for block in blocks: SEList = dbsreader.listFileBlockLocation( block['Name']) # replace that with DLS query print "===== File block name: %s" % block['Name'] print " File block located at: ", SEList print " File block status: %s" % block[ 'OpenForWriting'] print " Number of files: %s" % block[ 'NumberOfFiles'] print " Number of Bytes: %s" % block['BlockSize'] print " Number of Events: %s" % block[ 'NumberOfEvents'] if common.debugLevel: print "--------- info about files --------" print " Size \t Events \t LFN \t FileStatus " files = dbsreader.listFilesInBlock(block['Name']) for file in files: print "%s %s %s %s" % ( file['FileSize'], 
file['NumberOfEvents'], file['LogicalFileName'], file['Status']) nevttot = nevttot + block['NumberOfEvents'] print "\n total events: %s in dataset: %s\n" % ( nevttot, datasetpath) if not common.debugLevel: common.logger.info( 'You can obtain more info about files of the dataset using: crab -checkPublication -USER.dataset_to_check=' + self.dataset_to_check + ' -USER.dbs_url_for_publication=' + self.DBSURL + ' -debug')
def importDataset(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True, skipNoSiteError=False): """ _importDataset_ Import a dataset into the local scope DBS with full parentage hirerarchy (at least not slow because branches info is dropped). Parents are also imported. This method imports block by block, then each time a block is imported, its parent blocks will be imported first. - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to - *onlyClosed* : Only closed blocks will be imported if set to True - *skipNoSiteError* : If this is True, then this method wont raise an Exception if a block has no site information in sourceDBS. """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) blkCounter = 0 for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// blkCounter = blkCounter + 1 msg = "Importing block %s of %s: %s " % (blkCounter, len(inputBlocks), block) logging.debug(msg) if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str( inputBlock['NumberOfFiles']) != "0": # we don't skip the error raising if not skipNoSiteError: msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) msg = "Block has no locations defined: %s" % block logging.info(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: self.dbs.dbsMigrateBlock(sourceDBS, targetDBS, 
block_name=block) except DbsException, ex: msg = "Error in DBSWriter.importDataset\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": # we don't skip the error raising if not skipNoSiteError: msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) msg = "Block has no locations defined: %s" % block logging.info(msg) for sename in locations: self.dbs.addReplicaToBlock(block, sename)