def __call__(self):
    """
    _operator()_

    Load PU dataset information from DBS

    """
    reader = DBSReader(self.dbsUrl)
    blocks = reader.listFileBlocks(self.dataset, False)

    for block in blocks:
        # //
        # // Populate locations
        #//
        locations = reader.listFileBlockLocation(block)
        if locations:
            self.blockSites[block] = locations
        for location in locations:
            if not self.sites.has_key(location):
                self.sites[location] = set()
            self.sites[location].add(block)
        # //
        # // Populate File list for block
        #//
        self[block] = reader.lfnsInBlock(block)
    return
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    """
    # //
    # // Now create the job definitions
    #//
    logging.debug("SplitSize = %s" % self.splitSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    thefiles = Fileset(name='FilesToSplit')
    reader = DBSReader(self.dbsUrl)
    fileList = reader.dbs.listFiles(analysisDataset = self.inputDataset(),
                                    retriveList = ['retrive_block', 'retrive_run'])

    blocks = {}

    for f in fileList:
        block = f['Block']['Name']
        if not blocks.has_key(block):
            blocks[block] = reader.listFileBlockLocation(block)
        f['Block']['StorageElementList'].extend(blocks[block])
        wmbsFile = File(f['LogicalFileName'])
        [ wmbsFile['locations'].add(x) for x in blocks[block] ]
        wmbsFile['block'] = block
        thefiles.addFile(wmbsFile)

    work = Workflow()
    subs = Subscription(fileset = thefiles,
                        workflow = work,
                        split_algo = 'FileBased',
                        type = "Processing")
    splitter = SplitterFactory()
    jobfactory = splitter(subs)
    jobs = jobfactory(files_per_job = self.splitSize)

    jobDefs = []
    for job in jobs.jobs:
        #job.mask.setMaxAndSkipEvents(-1, 0)
        jobDef = JobDefinition()
        jobDef['LFNS'].extend(job.listLFNs())
        jobDef['SkipEvents'] = 0
        jobDef['MaxEvents'] = -1
        [ jobDef['SENames'].extend(list(x['locations']))
          for x in job.listFiles() ]
        jobDefs.append(jobDef)

    return jobDefs
def tmdbInjectBlock(dbsUrl, datasetPath, blockName, phedexConfig,
                    workingDir="/tmp", nodes=None, storageElements=None):
    """
    _tmdbInjectBlock_

    Util Method for injecting a fileblock into TMDB

    """
    fileName = blockName.replace("/", "_")
    fileName = fileName.replace("#", "")
    dropXML = "%s/%s-PhEDExDrop.xml" % (workingDir, fileName)

    xmlContent = makePhEDExDrop(dbsUrl, datasetPath, blockName)
    handle = open(dropXML, 'w')
    handle.write(xmlContent)
    handle.close()

    reader = DBSReader(dbsUrl)
    if not storageElements:
        storageElements = reader.listFileBlockLocation(blockName)

    tmdbInject(phedexConfig, dropXML, nodes, *storageElements)
    return
def checkPublication(self):
    """
    check dataset publication in a dbs
    """
    common.logger.info('--->>> Check data publication: dataset ' + self.dataset_to_check + ' in DBS url ' + self.DBSURL + '\n')
    # //
    # // Get API to DBS
    #//
    dbsreader = DBSReader(self.DBSURL)
    # //
    # // Get list of datasets
    #//
    if len(self.dataset_to_check.split('/')) < 4:
        msg = "the provided dataset name is not correct"
        raise CrabException(msg)
    else:
        primds = self.dataset_to_check.split('/')[1]
        procds = self.dataset_to_check.split('/')[2]
        tier = self.dataset_to_check.split('/')[3]
        datasets = dbsreader.matchProcessedDatasets(primds, tier, procds)
        if common.debugLevel:
            print "PrimaryDataset = ", primds
            print "ProcessedDataset = ", procds
            print "DataTier = ", tier
            print "datasets matching your requirements= ", datasets

    for dataset in datasets:
        # //
        # // Get list of blocks for the dataset and their location
        #//
        if len(dataset.get('PathList')) == 0:
            print "===== Empty dataset yet /%s/%s with tiers %s" % (
                dataset.get('PrimaryDataset')['Name'],
                dataset.get('Name'), dataset.get('TierList'))
        else:
            for datasetpath in dataset.get('PathList'):
                nevttot = 0
                print "=== dataset %s" % datasetpath
                ### FEDE #######
                if dataset['Description'] != None:
                    print "=== dataset description = ", dataset['Description']
                ################
                blocks = dbsreader.getFileBlocksInfo(datasetpath)
                for block in blocks:
                    SEList = dbsreader.listFileBlockLocation(block['Name'])  # replace that with DLS query
                    print "===== File block name: %s" % block['Name']
                    print " File block located at: ", SEList
                    print " File block status: %s" % block['OpenForWriting']
                    print " Number of files: %s" % block['NumberOfFiles']
                    print " Number of Bytes: %s" % block['BlockSize']
                    print " Number of Events: %s" % block['NumberOfEvents']
                    if common.debugLevel:
                        print "--------- info about files --------"
                        print " Size \t Events \t LFN \t FileStatus "
                        files = dbsreader.listFilesInBlock(block['Name'])
                        for file in files:
                            print "%s %s %s %s" % (file['FileSize'],
                                                   file['NumberOfEvents'],
                                                   file['LogicalFileName'],
                                                   file['Status'])
                    nevttot = nevttot + block['NumberOfEvents']
                print "\n total events: %s in dataset: %s\n" % (nevttot, datasetpath)

    if not common.debugLevel:
        common.logger.info('You can obtain more info about files of the dataset using: crab -checkPublication -USER.dataset_to_check=' + self.dataset_to_check + ' -USER.dbs_url_for_publication=' + self.DBSURL + ' -debug')
def manageDatasetBlocks(datasetPath, localDBS, globalDBS, phedexConfig=None,
                        phedexNodes=None):
    """
    _manageDatasetBlocks_

    Trawl through the dataset for all remaining open blocks, and then close
    them, migrate them to global and inject them into PhEDEx if phedexConfig
    is not None, using the optional list of PhEDEx nodes if provided.

    """
    dbs = DBSReader(localDBS)
    blocks = dbs.listFileBlocks(datasetPath)

    for block in blocks:
        if dbs.blockIsOpen(block):
            blockMgr = BlockManager(block, localDBS, globalDBS, datasetPath)
            blockMgr.closeBlock()
            blockMgr.migrateToGlobalDBS()
            if phedexConfig != None:
                blockMgr.injectBlockToPhEDEx(phedexConfig, phedexNodes)

    return
def makePhEDExDrop(dbsUrl, datasetPath, *blockNames):
    """
    _makePhEDExDrop_

    Given a DBS2 Url, dataset name and list of blockNames,
    generate an XML structure for injection

    """
    spec = XMLInjectionSpec(dbsUrl, datasetPath)

    reader = DBSReader(dbsUrl)

    for block in blockNames:
        blockContent = reader.getFileBlock(block)
        isOpen = reader.blockIsOpen(block)

        if isOpen:
            xmlBlock = spec.getFileblock(block, "y")
        else:
            xmlBlock = spec.getFileblock(block, "n")

        for x in blockContent[block]['Files']:
            checksums = {'cksum': x['Checksum']}
            if x.get('Adler32') not in (None, ''):
                checksums['adler32'] = x['Adler32']
            xmlBlock.addFile(x['LogicalFileName'], checksums, x['FileSize'])

    improv = spec.save()
    xmlString = improv.makeDOMElement().toprettyxml()
    return xmlString
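# Usage sketch (not part of the original code): makePhEDExDrop accepts any
# number of block names, so a single drop file can describe several blocks.
# The DBS URL, dataset path and block names below are placeholders.
if __name__ == '__main__':
    exampleUrl = "http://example.com/DBS/servlet/DBSServlet"
    exampleDataset = "/Primary/Processed/RECO"
    xml = makePhEDExDrop(exampleUrl, exampleDataset,
                         exampleDataset + "#block-guid-1",
                         exampleDataset + "#block-guid-2")
    print xml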
def validateDataset(datasetPath, dbsUrl):
    """
    _validateDataset_

    Util method to check that the datasetPath provided
    exists in the dbsUrl provided

    """
    datasetDetails = DatasetConventions.parseDatasetPath(datasetPath)
    for key in ['Primary', 'DataTier', 'Processed']:
        if datasetDetails[key] == None:
            msg = "Invalid Dataset Name: \n ==> %s\n" % datasetPath
            msg += "Does not contain %s information" % key
            raise WorkflowMakerError(msg)

    datasets = []
    try:
        reader = DBSReader(dbsUrl)
        datasets = reader.matchProcessedDatasets(
            datasetDetails['Primary'],
            datasetDetails['DataTier'],
            datasetDetails['Processed'])
    except Exception, ex:
        msg = "Error calling DBS to validate dataset:\n%s\n" % datasetPath
        msg += str(ex)
        raise WorkflowMakerError(msg)
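# Usage sketch (not part of the original module): validate a dataset path
# before building a workflow; the URL and dataset below are placeholders.
if __name__ == '__main__':
    exampleUrl = "http://example.com/DBS/servlet/DBSServlet"
    exampleDataset = "/Primary/Processed/RECO"
    try:
        validateDataset(exampleDataset, exampleUrl)
        print "Dataset %s found in %s" % (exampleDataset, exampleUrl)
    except WorkflowMakerError, ex:
        print "Dataset validation failed: %s" % str(ex)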
def createJobSplitter(dataset, dbsUrl, onlyClosedBlocks=False,
                      siteWhitelist=[], blockWhitelist=[], withParents=False):
    """
    _createJobSplitter_

    Instantiate a JobSplitter instance for the dataset provided
    and populate it with details from DBS.

    """
    reader = DBSReader(dbsUrl)
    result = JobSplitter(dataset)
    filterSites = len(siteWhitelist) > 0
    filterBlocks = len(blockWhitelist) > 0

    for blockName in reader.listFileBlocks(dataset, onlyClosedBlocks):
        locations = reader.listFileBlockLocation(blockName)
        if filterBlocks:
            if blockName not in blockWhitelist:
                msg = "Excluding block %s based on block whitelist: %s\n" % (
                    blockName, blockWhitelist)
                logging.debug(msg)
                continue

        if filterSites:
            siteMatches = filter(lambda x: x in locations, siteWhitelist)
            if len(siteMatches) == 0:
                msg = "Excluding block %s based on sites: %s \n" % (
                    blockName, locations)
                logging.debug(msg)
                continue
            else:
                locations = siteMatches

        newBlock = result.newFileblock(blockName, *locations)

        if withParents == True:
            blockData = reader.getFileBlockWithParents(blockName)[blockName]
        else:
            blockData = reader.getFileBlock(blockName)[blockName]

        totalEvents = 0
        fileList = set()
        for fileInfo in blockData["Files"]:
            totalEvents += fileInfo["NumberOfEvents"]
            fileList.add(fileInfo["LogicalFileName"])
            if withParents:
                parList = [x["LogicalFileName"] for x in fileInfo["ParentList"]]
                newBlock.addFile(fileInfo["LogicalFileName"],
                                 fileInfo["NumberOfEvents"], parList)
            else:
                newBlock.addFile(fileInfo["LogicalFileName"],
                                 fileInfo["NumberOfEvents"])

        logging.debug("Block %s contains %s events in %s files" % (
            blockName, totalEvents, len(fileList)))

    return result
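# Usage sketch (assumption, not from the original source): build a splitter
# for closed blocks of a dataset, restricted to a site whitelist. The DBS
# URL, dataset path and SE name are placeholders.
if __name__ == '__main__':
    splitter = createJobSplitter("/Primary/Processed/RECO",
                                 "http://example.com/DBS/servlet/DBSServlet",
                                 onlyClosedBlocks=True,
                                 siteWhitelist=["se.example.org"],
                                 withParents=False)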
def index(self, dataset):
    html = """<html><body><h2>Local DBS Dataset Listing</h2>\n """
    html += "<h4>Dataset: %s</h4>\n" % dataset
    reader = DBSReader(self.localDBS)
    html += "<h4>Block Details</h4>\n"
    html += "<table>\n"
    html += "<tr><th>Block</th><th>SEName</th><th>Files</th>"
    html += "<th>Events</th></tr>\n"
    try:
        blocks = reader.getFileBlocksInfo(dataset)
    except Exception, ex:
        html += "</table>\n"
        html += "<p> Error accessing dataset information: %s</p>" % str(ex)
        html += """</body></html>"""
        return html
def loadLFNs(self, **dbsContacts):
    """
    Get the list of LFNs from the DBS

    """
    # clear any previously loaded LFNs
    del self[:]

    dbsUrl = dbsContacts.get('DBSURL', None)
    if dbsUrl == None:
        dbsUrl = getLocalDBSURL()

    reader = DBSReader(dbsUrl)
    fileList = reader.getFiles(self.dataset)

    for block in fileList.values():
        result = [x['LogicalFileName'] for x in block['Files']]
        self.extend(result)

    return
def processingComplete(self):
    """
    _processingComplete_

    look at the processing jobs for the workflow, and return True
    if all processing jobs are complete

    """
    intermediateDBS = self.workflowSpec.parameters['DBSURL']
    outputDataset = self.workflowSpec.outputDatasets()[0].name()

    allJobs = WEUtils.jobsForWorkflow(self.workflow, "Merge")
    finishedJobs = WEUtils.jobsForWorkflow(self.workflow, "Merge", "finished")
    totalProcessing = len(allJobs)
    totalComplete = len(finishedJobs)

    logging.info("%s: %s/%s jobs complete" % (self.workflow, totalComplete, totalProcessing))

    if totalProcessing == 0:  # Protection for non-sensical situation
        return False

    if totalComplete < totalProcessing:
        return False

    # Check to make sure local DBS knows about all output files
    try:
        reader = DBSReader(intermediateDBS)
        blockList = reader.getFiles(dataset = outputDataset)
    except:
        logging.info("Dataset not in DBS yet")
        return False

    totalRegistered = 0
    for block in blockList:
        totalRegistered += len(blockList[block]['Files'])

    logging.info("%s: %s/%s jobs registered" % (self.workflow, totalRegistered, totalProcessing))
    if totalRegistered < totalProcessing:
        return False

    return True
def loadSites(self, **dbsContacts):
    """
    Get the list of sites hosting the PU from DBS/DLS

    """
    dbsUrl = dbsContacts.get('DBSURL', None)
    if dbsUrl == None:
        dbsUrl = getLocalDBSURL()

    reader = DBSReader(dbsUrl)

    locations = []

    blocks = reader.listFileBlocks(self.dataset, True)

    for block in blocks:
        try:
            locations = reader.listFileBlockLocation(block)
        except Exception, ex:
            msg = "Unable to find DLS Locations for Block: %s\n" % block
            msg += str(ex)
            logging.warning(msg)
            continue
def splitDatasetByRun(datasetName, dbsUrl):
    """
    _splitDatasetByRun_

    Chop up a dataset into a set of jobs with 1 job per run

    """
    reader = DBSReader(dbsUrl)
    result = []
    for run in listRunsInDataset(reader, datasetName):
        files = listFilesInRun(reader, datasetName, run)
        job = JobDefinition()
        job['LFNS'] = files
        job['RunNumber'] = run
        result.append(job)
    return result
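# Usage sketch (assumption): one JobDefinition per run; the DBS URL and
# dataset path are placeholders.
if __name__ == '__main__':
    perRunJobs = splitDatasetByRun("/Primary/Processed/RECO",
                                   "http://example.com/DBS/servlet/DBSServlet")
    for job in perRunJobs:
        print "Run %s: %s files" % (job['RunNumber'], len(job['LFNS']))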
class DBSWriter:
    """
    _DBSWriter_

    General API for writing data to DBS

    """
    def __init__(self, url, **contact):
        args = {"url": url, "level": 'ERROR'}
        args.update(contact)
        try:
            self.dbs = DbsApi(args)
        except DbsException, ex:
            msg = "Error in DBSWriterError with DbsApi\n"
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)
        self.reader = DBSReader(**args)
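# Usage sketch (assumption, not from the original source): construct a
# DBSWriter; extra keyword arguments are passed through to DbsApi, and the
# companion DBSReader built in __init__ is available as writer.reader.
# The DBS URL and dataset path are placeholders.
if __name__ == '__main__':
    writer = DBSWriter("http://example.com/DBS/servlet/DBSServlet", level='ERROR')
    print writer.reader.listFileBlocks("/Primary/Processed/RECO")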
def importDatasetWithExistingParents(self, sourceDBS, sourceDatasetPath,
                                     targetDBS, onlyClosed=True):
    """
    _importDataset_

    Import a dataset into the local scope DBS.
    It complains if the parent datasets are not there!!

    - *sourceDBS* : URL for input DBS instance
    - *sourceDatasetPath* : Dataset Path to be imported
    - *targetDBS* : URL for DBS to have dataset imported to

    """
    reader = DBSReader(sourceDBS)
    inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed)
    for inputBlock in inputBlocks:
        block = inputBlock['Name']
        # //
        # // Test block does not exist in target
        #//
        if self.reader.blockExists(block):
            # //
            # // block exists
            #// If block is closed dont attempt transfer
            if str(inputBlock['OpenForWriting']) != '1':
                msg = "Block already exists in target DBS and is closed:\n"
                msg += " ==> %s\n" % block
                msg += "Skipping Import of that block"
                logging.warning(msg)
                locations = reader.listFileBlockLocation(block)
                # only empty file blocks can have no location
                if not locations and str(inputBlock['NumberOfFiles']) != "0":
                    msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
                    msg += "Block has no locations defined: %s" % block
                    raise DBSWriterError(msg)
                logging.info("Update block locations to:")
                for sename in locations:
                    self.dbs.addReplicaToBlock(block, sename)
                    logging.info(sename)
                continue

        try:
            xferData = reader.dbs.listDatasetContents(sourceDatasetPath, block)
        except DbsException, ex:
            msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
            msg += "Could not read content of dataset:\n ==> %s\n" % (
                sourceDatasetPath,)
            msg += "Block name:\n ==> %s\n" % block
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)
        try:
            self.dbs.insertDatasetContents(xferData)
        except DbsException, ex:
            msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
            msg += "Could not write content of dataset:\n ==> %s\n" % (
                sourceDatasetPath,)
            msg += "Block name:\n ==> %s\n" % block
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)
def migrateDatasetBlocks(self, inputDBSUrl, datasetPath, blocks):
    """
    _migrateDatasetBlocks_

    Migrate the list of fileblocks provided by blocks, belonging
    to the dataset specified by the dataset path to this DBS instance
    from the inputDBSUrl provided

    - *inputDBSUrl* : URL for connection to input DBS
    - *datasetPath* : Name of dataset in input DBS (must exist in input DBS)
    - *blocks* : list of block names to be migrated (must exist in input DBS)

    """
    if len(blocks) == 0:
        msg = "FileBlocks not provided.\n"
        msg += "You must provide the name of at least one fileblock\n"
        msg += "to be migrated"
        raise DBSWriterError(msg)
    # //
    # // Hook onto input DBSUrl and verify that the dataset & blocks
    #// exist
    reader = DBSReader(inputDBSUrl)
    inputBlocks = reader.listFileBlocks(datasetPath)

    for block in blocks:
        # //
        # // Test block exists at source
        #//
        if block not in inputBlocks:
            msg = "Block name:\n ==> %s\n" % block
            msg += "Not found in input dataset:\n ==> %s\n" % datasetPath
            msg += "In DBS Instance:\n ==> %s\n" % inputDBSUrl
            raise DBSWriterError(msg)
        # //
        # // Test block does not exist in target
        #//
        if self.reader.blockExists(block):
            # //
            # // block exists
            #// If block is closed dont attempt transfer
            if not self.reader.blockIsOpen(block):
                msg = "Block already exists in target DBS and is closed:\n"
                msg += " ==> %s\n" % block
                msg += "Skipping Migration of that block"
                logging.warning(msg)
                continue

        try:
            xferData = reader.dbs.listDatasetContents(datasetPath, block)
        except DbsException, ex:
            msg = "Error in DBSWriter.migrateDatasetBlocks\n"
            msg += "Could not read content of dataset:\n ==> %s\n" % (
                datasetPath,)
            msg += "Block name:\n ==> %s\n" % block
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)

        xferData = _remapBlockParentage(datasetPath, xferData)

        try:
            self.dbs.insertDatasetContents(xferData)
        except DbsException, ex:
            msg = "Error in DBSWriter.migrateDatasetBlocks\n"
            msg += "Could not write content of dataset:\n ==> %s\n" % (
                datasetPath,)
            msg += "Block name:\n ==> %s\n" % block
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)
def makeBlockList(self, onlyClosedBlocks=False, sites=None,
                  providedOnlyBlocks=None):
    """
    _makeBlockList_

    Generate the list of blocks for the workflow.

    1. Get the list of all blocks from the DBS
    2. Compare to list of blocks in persistency file
    3. Obtain the intersection of the new blocks and the
       providedOnlyBlocks list.
    4. Set OnlyBlocks parameter to intersection obtained.

    """
    #reader = DBSReader(self.dbsUrl)
    # At this point, blocks should be in local DBS
    localDBS = getLocalDBSURL()
    reader = DBSReader(localDBS)
    dbsBlocks = reader.listFileBlocks(self.inputDataset(), onlyClosedBlocks)

    if self.persistData.blocks != []:
        remover = lambda x: x not in self.persistData.blocks
        newBlocks = filter(remover, dbsBlocks)
    else:
        newBlocks = dbsBlocks

    # //
    # // Skipping blocks without site info
    #//
    msg = "Filtering blocks according to Site information..."
    logging.info(msg)
    blocksAtSites = []
    for block in newBlocks:
        locations = reader.listFileBlockLocation(block)
        if not locations:
            msg = "\nSkipping block: "
            msg += "No site info available for block %s " % block
            logging.info(msg)
        elif sites is not None:
            locationInSites = False
            for location in locations:
                if location in sites:
                    locationInSites = True
                    break
            if locationInSites:
                blocksAtSites.append(block)
            else:
                msg = "\nSkipping block: "
                msg += "Block %s has no replicas in %s" % (block, ", ".join(sites))
                logging.info(msg)
        else:
            blocksAtSites.append(block)
    newBlocks = blocksAtSites

    if len(newBlocks) == 0:
        msg = "No New Blocks found for dataset\n"
        raise RuntimeError, msg

    # //
    # // Check presence of provided Blocks in newBlocks
    #//
    blocksToProcess = []
    if providedOnlyBlocks is not None:
        providedOnlyBlocksList = providedOnlyBlocks.split(',')
        msg = "OnlyBlocks setting provided. Processing it..."
        logging.info(msg)
        msg = "OnlyBlocks list contains %s Blocks." % (
            len(providedOnlyBlocksList))
        logging.info(msg)
        blockCount = 1
        for block in providedOnlyBlocksList:
            if block.strip() in newBlocks:
                blocksToProcess.append(block.strip())
                msg = "Block %s: Adding Block %s" % (blockCount, block)
                msg += " to the Whitelist"
                logging.info(msg)
            else:
                msg = "Block %s: Skipping Block %s " % (blockCount, block)
                msg += "It's not New or it has been processed"
                msg += " already."
                logging.info(msg)
            blockCount += 1
    else:
        blocksToProcess = newBlocks
        msg = "OnlyBlocks setting not provided. Processing"
        msg += " all New Blocks for Dataset\n"
        logging.info(msg)

    if len(blocksToProcess) == 0:
        msg = "OnlyBlocks list does not match any New Blocks"
        msg += " found for Dataset\n"
        raise RuntimeError, msg

    blockList = str(blocksToProcess)
    blockList = blockList.replace("[", "")
    blockList = blockList.replace("]", "")
    blockList = blockList.replace("\'", "")
    blockList = blockList.replace("\"", "")
    self.workflow.parameters['OnlyBlocks'] = blockList
    self.persistData.blocks.extend(blocksToProcess)
    return
def importDataset(self, sourceDBS, sourceDatasetPath, targetDBS,
                  onlyClosed=True, skipNoSiteError=False):
    """
    _importDataset_

    Import a dataset into the local scope DBS with full parentage
    hierarchy (at least not slow because branches info is dropped).
    Parents are also imported. This method imports block by block, then
    each time a block is imported, its parent blocks will be imported first.

    - *sourceDBS* : URL for input DBS instance
    - *sourceDatasetPath* : Dataset Path to be imported
    - *targetDBS* : URL for DBS to have dataset imported to
    - *onlyClosed* : Only closed blocks will be imported if set to True
    - *skipNoSiteError* : If this is True, then this method won't raise an
                          Exception if a block has no site information in
                          sourceDBS.

    """
    reader = DBSReader(sourceDBS)
    inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed)
    blkCounter = 0
    for inputBlock in inputBlocks:
        block = inputBlock['Name']
        # //
        # // Test block does not exist in target
        #//
        blkCounter = blkCounter + 1
        msg = "Importing block %s of %s: %s " % (blkCounter, len(inputBlocks), block)
        logging.debug(msg)
        if self.reader.blockExists(block):
            # //
            # // block exists
            #// If block is closed dont attempt transfer
            if str(inputBlock['OpenForWriting']) != '1':
                msg = "Block already exists in target DBS and is closed:\n"
                msg += " ==> %s\n" % block
                msg += "Skipping Import of that block"
                logging.warning(msg)
                locations = reader.listFileBlockLocation(block)
                # only empty file blocks can have no location
                if not locations and str(inputBlock['NumberOfFiles']) != "0":
                    # we don't skip the error raising
                    if not skipNoSiteError:
                        msg = "Error in DBSWriter.importDataset\n"
                        msg += "Block has no locations defined: %s" % block
                        raise DBSWriterError(msg)
                    msg = "Block has no locations defined: %s" % block
                    logging.info(msg)
                logging.info("Update block locations to:")
                for sename in locations:
                    self.dbs.addReplicaToBlock(block, sename)
                    logging.info(sename)
                continue

        try:
            self.dbs.dbsMigrateBlock(sourceDBS, targetDBS, block_name=block)
        except DbsException, ex:
            msg = "Error in DBSWriter.importDataset\n"
            msg += "Could not write content of dataset:\n ==> %s\n" % (
                sourceDatasetPath,)
            msg += "Block name:\n ==> %s\n" % block
            msg += "%s\n" % formatEx(ex)
            raise DBSWriterError(msg)

        locations = reader.listFileBlockLocation(block)
        # only empty file blocks can have no location
        if not locations and str(inputBlock['NumberOfFiles']) != "0":
            # we don't skip the error raising
            if not skipNoSiteError:
                msg = "Error in DBSWriter.importDataset\n"
                msg += "Block has no locations defined: %s" % block
                raise DBSWriterError(msg)
            msg = "Block has no locations defined: %s" % block
            logging.info(msg)
        for sename in locations:
            self.dbs.addReplicaToBlock(block, sename)
def makeFileList(self, onlyClosedBlocks=False, sites=None,
                 providedOnlyBlocks=None, providedOnlyFiles=None):
    """
    _makeFileList_

    Generate the list of blocks for the workflow.

    1. Get the list of all blocks from the DBS
    2. Compare to list of blocks in persistency file
    3. Obtain the intersection of the new blocks and the
       providedOnlyBlocks list.
    4. Set OnlyBlocks parameter to intersection obtained.

    """
    #reader = DBSReader(self.dbsUrl)
    # At this point, blocks should be in local DBS
    localDBS = getLocalDBSURL()
    reader = DBSReader(localDBS)

    # //
    # // Querying list of blocks from DBS
    #//
    msg = "Querying for closed blocks in Local DBS: %s ..." % localDBS
    logging.info(msg)
    dbsBlocks = reader.listFileBlocks(self.inputDataset(), onlyClosedBlocks)
    msg = "Retrieved %s closed blocks from Local DBS" % len(dbsBlocks)
    logging.info(msg)

    # //
    # // Constructing mapping structures block-file
    #//
    filesToBlocks = {}
    blocksToFiles = {}
    dbsFiles = reader.dbs.listFiles(path=self.inputDataset())
    for dbsfile in dbsFiles:
        if dbsfile['Block']['Name'] in dbsBlocks:
            filesToBlocks[dbsfile['LogicalFileName']] = \
                dbsfile['Block']['Name']
            blocksToFiles.setdefault(dbsfile['Block']['Name'],
                                     []).append(dbsfile['LogicalFileName'])

    # OnlyFiles?
    if providedOnlyFiles is not None and \
       providedOnlyFiles.strip().lower() != 'auto':
        msg = "Using OnlyFiles list:"
        msg += " %s files." % len(providedOnlyFiles.split(','))
        logging.info(msg)
        onlyFiles = [x.strip() for x in providedOnlyFiles.split(',') if x]
    # OnlyFiles=auto
    elif providedOnlyFiles is not None:
        msg = "Automatically generating OnlyFiles list from DBS..."
        logging.info(msg)
        onlyFiles = self.createOnlyFilesFromWorkflow()
    # OnlyBlocks
    elif providedOnlyBlocks is not None:
        msg = "Using OnlyBlocks list:"
        msg += " %s blocks." % len(providedOnlyBlocks.split(','))
        logging.info(msg)
        onlyFiles = []
        for block in \
                [x.strip() for x in providedOnlyBlocks.split(',') if x]:
            onlyFiles.extend(blocksToFiles[block])
    # Processing everything in DBS
    else:
        msg = "Processing whole input dataset..."
        logging.info(msg)
        onlyFiles = []
        for block in dbsBlocks:
            onlyFiles.extend(blocksToFiles[block])

    if not onlyFiles:
        msg = "No files were found for the input dataset: " + \
            self.inputDataset()
        raise RuntimeError, msg

    # //
    # // Filter files that were already processed
    #//
    if self.persistData.blocks:
        msg = "Filtering files that were already processed for this"
        msg += " workflow..."
        logging.info(msg)
        processedFiles = self.persistData.getFiles()
        msg = "Persistency file has %s file(s)" % len(processedFiles)
        logging.info(msg)
        remover = lambda x: x not in processedFiles
        onlyFiles = filter(remover, onlyFiles)
        msg = "%s file(s) were removed" % \
            str(len(processedFiles) - len(onlyFiles))
        logging.info(msg)

    if not onlyFiles:
        msg = "No New files were found for the input dataset: " + \
            self.inputDataset()
        raise RuntimeError, msg

    # //
    # // Filter files in blocks without site info
    #//
    msg = "Filtering blocks according to Site information..."
    logging.info(msg)
    candidateBlocks = {}
    for file in onlyFiles:
        candidateBlocks.setdefault(filesToBlocks[file], []).append(file)
    blocksAtSites = []
    for block in candidateBlocks:
        locations = reader.listFileBlockLocation(block)
        if not locations:
            msg = "Excluding block without site info ==> %s" % block
            logging.info(msg)
        elif sites is not None:
            locationInSites = False
            for location in locations:
                if location in sites:
                    locationInSites = True
                    break
            if locationInSites:
                blocksAtSites.append(block)
            else:
                msg = "Excluding block without replicas"
                msg += " in %s ==> %s" % (", ".join(sites), block)
                logging.info(msg)
        else:
            blocksAtSites.append(block)

    if len(blocksAtSites) == 0:
        msg = "No block has site information."
        raise RuntimeError, msg

    # //
    # // Constructing OnlyBlocks and OnlyFiles list
    #//
    onlyBlocks = {}
    for block in blocksAtSites:
        onlyBlocks[block] = candidateBlocks[block]

    onlyFiles = []
    for block in onlyBlocks:
        onlyFiles.extend(onlyBlocks[block])

    msg = "\n ==> Files to process: %s" % len(onlyFiles)
    msg += "\n ==> Blocks to process: %s" % len(onlyBlocks)
    logging.info(msg)

    blockList = ",".join(onlyBlocks.keys())
    fileList = ",".join(onlyFiles)
    self.workflow.parameters['OnlyBlocks'] = blockList
    self.workflow.parameters['OnlyFiles'] = fileList
    self.persistData.update(onlyBlocks)
    return
def publishDataset(self, file):
    """
    """
    try:
        jobReport = readJobReport(file)[0]
        self.exit_status = '0'
    except IndexError:
        self.exit_status = '1'
        msg = "Error: Problem with " + file + " file"
        common.logger.info(msg)
        return self.exit_status

    if (len(self.dataset_to_import) != 0):
        for dataset in self.dataset_to_import:
            common.logger.info("--->>> Importing parent dataset in the dbs: " + dataset)
            status_import = self.importParentDataset(self.globalDBS, dataset)
            if (status_import == 1):
                common.logger.info('Problem with parent ' + dataset +
                                   ' import from the global DBS ' + self.globalDBS +
                                   ' to the local one ' + self.DBSURL)
                self.exit_status = '1'
                return self.exit_status
            else:
                common.logger.info('Import ok of dataset ' + dataset)

    if (len(jobReport.files) <= 0):
        self.exit_status = '1'
        msg = "Error: No EDM file to publish in xml file " + file
        common.logger.info(msg)
        return self.exit_status
    else:
        msg = "fjr contains some files to publish"
        common.logger.debug(msg)

    #### datasets creation in dbs
    #// DBS to contact write and read of the same dbs
    dbsReader = DBSReader(self.DBSURL, level='ERROR')
    dbswriter = DBSWriter(self.DBSURL)
    #####

    self.published_datasets = []
    for fileinfo in jobReport.files:
        datasets_info = fileinfo.dataset
        if len(datasets_info) <= 0:
            self.exit_status = '1'
            msg = "Error: No info about dataset in the xml file " + file
            common.logger.info(msg)
            return self.exit_status
        else:
            for dataset in datasets_info:
                #### for production data
                self.processedData = dataset['ProcessedDataset']
                if (dataset['PrimaryDataset'] == 'null'):
                    dataset['PrimaryDataset'] = self.userprocessedData
                elif self.datasetpath.upper() != 'NONE':
                    dataset['ParentDataset'] = self.datasetpath

                dataset['PSetContent'] = self.content
                cfgMeta = {'name': self.pset,
                           'Type': 'user',
                           'annotation': 'user cfg',
                           'version': 'private version'}  # add real name of user cfg
                common.logger.info("PrimaryDataset = %s" % dataset['PrimaryDataset'])
                common.logger.info("ProcessedDataset = %s" % dataset['ProcessedDataset'])
                common.logger.info("<User Dataset Name> = /" + dataset['PrimaryDataset'] +
                                   "/" + dataset['ProcessedDataset'] + "/USER")

                self.dataset_to_check = "/" + dataset['PrimaryDataset'] + "/" + \
                                        dataset['ProcessedDataset'] + "/USER"

                self.published_datasets.append(self.dataset_to_check)

                common.logger.log(10-1, "--->>> Inserting primary: %s processed : %s" %
                                  (dataset['PrimaryDataset'], dataset['ProcessedDataset']))

                #### check if dataset already exists in the DBS
                result = dbsReader.matchProcessedDatasets(dataset['PrimaryDataset'],
                                                          'USER',
                                                          dataset['ProcessedDataset'])
                if (len(result) != 0):
                    result = dbsReader.listDatasetFiles(self.dataset_to_check)

                primary = DBSWriterObjects.createPrimaryDataset(dataset, dbswriter.dbs)
                common.logger.log(10-1, "Primary: %s " % primary)
                print "primary = ", primary

                algo = DBSWriterObjects.createAlgorithm(dataset, cfgMeta, dbswriter.dbs)
                common.logger.log(10-1, "Algo: %s " % algo)

                processed = DBSWriterObjects.createProcessedDataset(primary, algo,
                                                                    dataset, dbswriter.dbs)
                common.logger.log(10-1, "Processed: %s " % processed)
                print "processed = ", processed

                common.logger.log(10-1, "Inserted primary %s processed %s" % (primary, processed))
                #######################################################################################

    common.logger.log(10-1, "exit_status = %s " % self.exit_status)
    return self.exit_status
    sys.exit(1)

if (block != None) and (blockFileList != None) and (datasetPath != None):
    print "\n options --block or --blockFileList or --datasetPath are mutually exclusive"
    print usage
    sys.exit(1)

print ">>>>> DBS URL : %s " % (url,)

import logging
logging.disable(logging.INFO)

# //
# // Get API to DBS
#//
args = {'url': url, 'level': 'ERROR'}
dbsapi = DbsApi(args)
dbsreader = DBSReader(url)

# //
# // Close FileBlock method
#//
def closeDBSFileBlock(ablock):
    print "Closing block %s" % ablock
    dbsblock = DbsFileBlock(Name=ablock)
    dbsapi.closeBlock(dbsblock)

### --block option: close single block
if (block != None):
    closeDBSFileBlock(block)

## --blockFileList option: close list of blocks from a file
if (blockFileList != None):
def processDataset(self):
    """
    _processDataset_

    Import the Dataset contents and create a set of jobs from it

    """
    # //
    # // Now create the job definitions
    #//
    logging.debug("MergeSize = %s" % self.mergeSize)
    logging.debug("AllowedSites = %s" % self.allowedSites)
    logging.debug("Connection to DBS at: %s" % self.dbsUrl)

    reader = DBSReader(self.dbsUrl)
    blockList = reader.dbs.listBlocks(dataset = self.inputDataset())
    jobDefs = []

    for block in blockList:
        blockName = block['Name']
        logging.debug("Getting files for block %s" % blockName)
        locations = reader.listFileBlockLocation(blockName)
        fileList = reader.dbs.listFiles(blockName = blockName)
        if not fileList:  # Skip empty blocks
            continue

        thefiles = Fileset(name='FilesToSplit')
        for f in fileList:
            f['Block']['StorageElementList'].extend(locations)
            wmbsFile = File(f['LogicalFileName'])
            [ wmbsFile['locations'].add(x) for x in locations ]
            wmbsFile['block'] = blockName
            wmbsFile['size'] = f['FileSize']
            thefiles.addFile(wmbsFile)

        work = Workflow()
        subs = Subscription(fileset = thefiles,
                            workflow = work,
                            split_algo = 'MergeBySize',
                            type = "Merge")
        logging.debug("Info for Subscription %s" % subs)
        splitter = SplitterFactory()
        jobfactory = splitter(subs)

        jobGroups = jobfactory(
            merge_size=self.mergeSize,  # min in Bytes
            all_files=True              # merge all files
            )
        if not jobGroups:
            raise(SyntaxError)
        for jobGroup in jobGroups:
            for job in jobGroup.getJobs():
                jobDef = JobDefinition()
                jobDef['LFNS'].extend(job.getFiles(type='lfn'))
                jobDef['SkipEvents'] = 0
                jobDef['MaxEvents'] = -1
                [ jobDef['SENames'].extend(list(x['locations']))
                  for x in job.getFiles() ]
                jobDefs.append(jobDef)

    return jobDefs
    print usage
    sys.exit(1)

if (block != None) and (datasetPath != None):
    print "\n options --block or --datasetPath are mutually exclusive"
    print usage
    sys.exit(1)

print ">>>>> DBS URL : %s" % (url)

import logging
logging.disable(logging.INFO)

# //
# // Get API to DBS
#//
args = {'url': url, 'level': 'ERROR'}
dbsapi = DbsApi(args)

# //
# // Delete dataset
#//
if (datasetPath):
    print "Deleting datasetPath=%s" % datasetPath
    dbsapi.deleteProcDS(datasetPath)

if (block):
    dbsreader = DBSReader(url)
    getdatasetPath = dbsreader.blockToDatasetPath(block)
    print "Deleting block=%s from datasetPath=%s" % (block, getdatasetPath)
    dbsapi.deleteBlock(getdatasetPath, block)
    sys.exit(1)

if datasetPath == None:
    print "--datasetPath option not provided. For example : --datasetPath /primarydataset/processeddataset/datatier"
    print usage
    sys.exit()

print ">>>>> DBS URL : %s" % (url,)

import logging
logging.disable(logging.INFO)

# //
# // Get API to DBS
#//
dbsreader = DBSReader(url)

# //
# // Get list of datasets
#//
if datasetPath:
    primds = datasetPath.split('/')[1]
    procds = datasetPath.split('/')[2]
    tier = datasetPath.split('/')[3]
    # print " matchProcessedDatasets(%s,%s,%s)" % (primds, tier, procds)
    datasets = dbsreader.matchProcessedDatasets(primds, tier, procds)
else:
    datasets = dbsreader.matchProcessedDatasets("*", "*", "*")

for dataset in datasets:
    # //