def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.enabled = config.RucioInjector.enabled # dataset rule creation has a larger polling cycle self.pollRules = config.RucioInjector.pollIntervalRules self.lastRulesExecTime = 0 self.createBlockRules = config.RucioInjector.createBlockRules self.skipRulesForTiers = config.RucioInjector.skipRulesForTiers self.listTiersToInject = config.RucioInjector.listTiersToInject # setup cache for container and blocks (containers can be much longer, make 6 days now) self.containersCache = MemoryCache( config.RucioInjector.cacheExpiration * 3, set()) self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration, set()) self.scope = getattr(config.RucioInjector, "scope", "cms") self.rucioAcct = config.RucioInjector.rucioAccount self.rucio = Rucio(acct=self.rucioAcct, hostUrl=config.RucioInjector.rucioUrl, authUrl=config.RucioInjector.rucioAuthUrl, configDict={'logger': self.logger}) # metadata dictionary information to be added to block/container rules # cannot be a python dictionary, but a JSON string instead self.metaData = json.dumps( dict(agentHost=config.Agent.hostName, userAgent=config.Agent.agentName)) self.testRSEs = config.RucioInjector.RSEPostfix self.filesToRecover = [] logging.info( "Component configured to only inject data for data tiers: %s", self.listTiersToInject) logging.info( "Component configured to skip container rule creation for data tiers: %s", self.skipRulesForTiers) logging.info("Component configured to create block rules: %s", self.createBlockRules)
def _getDatasetLocation(self, dset, blockDict):
    """
    Given a dataset name, query PhEDEx or Rucio and resolve the block location
    :param dset: string with the dataset name
    :param blockDict: dictionary with DBS summary info
    :return: update blockDict in place
    """
    # initialize Rucio here to avoid this authentication on T0-WMAgent
    self.rucio = Rucio(self.rucioAcct)
    blockReplicas = self.rucio.getPileupLockedAndAvailable(dset, account=self.rucioAcct)
    for blockName, blockLocation in viewitems(blockReplicas):
        try:
            blockDict[blockName]['PhEDExNodeNames'] = list(blockLocation)
        except KeyError:
            logging.warning("Block '%s' present in Rucio but not in DBS", blockName)
def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) # dataset rule creation has a larger polling cycle self.pollRules = config.RucioInjector.pollIntervalRules self.lastRulesExecTime = 0 self.createBlockRules = config.RucioInjector.createBlockRules self.containerDiskRuleParams = config.RucioInjector.containerDiskRuleParams self.containerDiskRuleRSEExpr = config.RucioInjector.containerDiskRuleRSEExpr if config.RucioInjector.metaDIDProject not in RUCIO_VALID_PROJECT: msg = "Component configured with an invalid 'project' DID: %s" raise RucioInjectorException(msg % config.RucioInjector.metaDIDProject) self.metaDIDProject = dict(project=config.RucioInjector.metaDIDProject) # setup cache for container and blocks (containers can be much longer, make 6 days now) self.containersCache = MemoryCache(config.RucioInjector.cacheExpiration * 3, set()) self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration, set()) self.scope = getattr(config.RucioInjector, "scope", "cms") self.rucioAcct = config.RucioInjector.rucioAccount self.rucio = Rucio(acct=self.rucioAcct, hostUrl=config.RucioInjector.rucioUrl, authUrl=config.RucioInjector.rucioAuthUrl, configDict={'logger': self.logger}) # metadata dictionary information to be added to block/container rules # cannot be a python dictionary, but a JSON string instead self.metaData = json.dumps(dict(agentHost=config.Agent.hostName, userAgent=config.Agent.agentName)) self.testRSEs = config.RucioInjector.RSEPostfix self.filesToRecover = [] # output data placement has a different behaviour between T0 and Production agents if hasattr(config, "Tier0Feeder"): logging.info("RucioInjector running on a T0 WMAgent") self.isT0agent = True else: self.isT0agent = False logging.info("Component configured to create block rules: %s", self.createBlockRules)
def setUp(self): """ Setup for unit tests """ super(RucioTest, self).setUp() self.myRucio = Rucio(self.acct, hostUrl=self.defaultArgs['host'], authUrl=self.defaultArgs['auth_host'], configDict=self.defaultArgs) self.client = testClient(rucio_host=self.defaultArgs['host'], auth_host=self.defaultArgs['auth_host'], account=self.acct, ca_cert=self.defaultArgs['ca_cert'], auth_type=self.defaultArgs['auth_type'], creds=self.defaultArgs['creds'], timeout=self.defaultArgs['timeout'])
def testGetReplicaInfoForBlocksRucio(self):
    """
    Test `getReplicaInfoForBlocks` method, however not using
    the output compatibility with PhEDEx
    """
    theseArgs = self.defaultArgs.copy()
    theseArgs['phedexCompatible'] = False
    myRucio = Rucio(self.acct,
                    hostUrl=theseArgs['host'],
                    authUrl=theseArgs['auth_host'],
                    configDict=theseArgs)

    res = myRucio.getReplicaInfoForBlocks(dataset=DSET)
    self.assertTrue(isinstance(res, list))
    self.assertTrue(len(res) >= 1)  # at this very moment, there are 11 replicas
    blocks = [item['name'] for item in res]
    self.assertTrue(BLOCK in blocks)
    for item in res:
        self.assertTrue(len(item['replica']) > 0)
def getFromRucio(dataset, logger):
    """
    Use the WMCore Rucio object to fetch all the blocks and files
    for a given container.
    Returns a dictionary keyed by the block name, where the value is
    the number of files in the block.
    """
    rucio = Rucio(acct=RUCIO_ACCT,
                  hostUrl=RUCIO_HOST,
                  authUrl=RUCIO_AUTH,
                  configDict={'logger': logger, 'phedexCompatible': False})

    result = dict()
    for block in rucio.getBlocksInContainer(dataset):
        data = rucio.getDID(block)
        result.setdefault(block, data['length'])
    return result
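
# A minimal usage sketch for the helper above, assuming the module-level
# RUCIO_ACCT/RUCIO_HOST/RUCIO_AUTH constants point at a reachable Rucio
# instance; the container name below is a placeholder, not a real DID.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

container = "/SomePrimary/SomeProcessed-v1/MINIAODSIM"   # placeholder container
blockInfo = getFromRucio(container, logger)
for blockName, numFiles in blockInfo.items():
    logger.info("Block %s contains %d files", blockName, numFiles)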
def setUp(self): """ _setUp_ """ super(WMBSHelperTest, self).setUp() self.testInit = TestInitCouchApp(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection(destroyAllDatabase=True) self.testInit.setupCouch("wmbshelper_t/jobs", "JobDump") self.testInit.setupCouch("wmbshelper_t/fwjrs", "FWJRDump") self.testInit.setupCouch("config_test", "GroupUser", "ConfigCache") os.environ["COUCHDB"] = "wmbshelper_t" self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir", "WMCore.ResourceControl" ], useDefault=False) self.workDir = self.testInit.generateWorkDir() self.wmspec = self.createWMSpec() self.topLevelTask = getFirstTask(self.wmspec) self.inputDataset = self.topLevelTask.inputDataset() self.dataset = self.topLevelTask.getInputDatasetPath() self.dbs = DBSReader(self.inputDataset.dbsurl) self.rucioAcct = "wmcore_transferor" self.rucio = Rucio(self.rucioAcct) self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=threading.currentThread().logger, dbinterface=threading.currentThread().dbi) self.configFile = EmulatorSetup.setupWMAgentConfig() self.config = loadConfigurationFile(self.configFile) self.config.component_("JobSubmitter") self.config.JobSubmitter.submitDir = self.workDir self.config.JobSubmitter.submitScript = os.path.join( getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh') return
def __init__(self, **args):
    PolicyInterface.__init__(self, **args)
    self.workQueueElements = []
    self.wmspec = None
    self.team = None
    self.initialTask = None
    self.splitParams = None
    self.dbs_pool = {}
    self.data = {}
    self.lumi = None
    self.couchdb = None
    self.rejectedWork = []  # List of inputs that were rejected
    self.badWork = []  # list of bad work unit (e.g. without any valid files)
    self.pileupData = {}
    self.cric = CRIC()
    if usingRucio():
        self.rucio = Rucio(self.args['rucioAcct'], configDict={'logger': self.logger})
    else:
        self.phedex = PhEDEx()  # this will go away eventually
def _queryAndCompareWithDBS(self, pileupDict, pileupConfig, dbsUrl):
    """
    pileupDict is a Python dictionary containing particular pileup
    configuration information. Query DBS for each dataset present in both
    the input pileupConfig and the pileupDict, and compare the values.
    """
    self.assertItemsEqual(list(pileupDict), list(pileupConfig))
    reader = DBS3Reader(dbsUrl)
    rucioObj = Rucio(self.rucioAcct)

    # now query DBS and compare the blocks and files from DBS
    # against those returned by the PileupFetcher
    for pileupType, datasets in viewitems(pileupConfig):
        # this is from the pileup configuration produced by PileupFetcher
        blockDict = pileupDict[pileupType]

        for dataset in datasets:
            dbsBlocks = reader.listFileBlocks(dataset=dataset)
            rucioBlocksLocation = rucioObj.getPileupLockedAndAvailable(dataset,
                                                                       account=self.rucioAcct)
            # first, validate the number of blocks and their names
            self.assertItemsEqual(list(blockDict), dbsBlocks)
            self.assertItemsEqual(list(blockDict), list(rucioBlocksLocation))
            # now validate the block location between Rucio and PileupFetcher
            for block, blockLocation in viewitems(blockDict):
                self.assertItemsEqual(blockLocation['PhEDExNodeNames'],
                                      rucioBlocksLocation[block])
                # finally, validate the files
                fileList = []
                # now get list of files in the block
                dbsFiles = reader.listFilesInBlock(block)
                for dbsFile in dbsFiles:
                    fileList.append(dbsFile["LogicalFileName"])
                self.assertItemsEqual(blockDict[block]["FileList"], fileList)
def __init__(self, **args):
    # We need to pop this object instance from args because otherwise
    # the super class blows up when doing a deepcopy(args)
    self.rucio = args.pop("rucioObject", None)
    PolicyInterface.__init__(self, **args)
    self.workQueueElements = []
    self.wmspec = None
    self.team = None
    self.initialTask = None
    self.splitParams = None
    self.dbs_pool = {}
    self.data = {}
    self.lumi = None
    self.couchdb = None
    self.rejectedWork = []  # List of inputs that were rejected
    self.badWork = []  # list of bad work unit (e.g. without any valid files)
    self.pileupData = {}
    self.cric = CRIC()
    # FIXME: for the moment, it will always use the default value
    self.rucioAcct = self.args.get("rucioAcct", "wmcore_transferor")
    if not self.rucio:
        self.rucio = Rucio(self.rucioAcct, configDict={'logger': self.logger})
def __init__(self, msConfig, **kwargs):
    """
    Provides setup for MSTransferor and MSMonitor classes

    :param msConfig: MS service configuration
    :param kwargs: can be used to skip the initialization of specific services, such as:
        logger: logger object
        skipReqMgr: boolean to skip ReqMgr initialization
        skipReqMgrAux: boolean to skip ReqMgrAux initialization
        skipRucio: boolean to skip Rucio initialization
        skipPhEDEx: boolean to skip PhEDEx initialization
    """
    self.logger = getMSLogger(getattr(msConfig, 'verbose', False), kwargs.get("logger"))
    self.msConfig = msConfig
    self.logger.info("Configuration including default values:\n%s", self.msConfig)

    if not kwargs.get("skipReqMgr", False):
        self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
    if not kwargs.get("skipReqMgrAux", False):
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                   httpDict={'cacheduration': 1.0},
                                   logger=self.logger)

    self.phedex = None
    self.rucio = None
    if self.msConfig.get('useRucio', False) and not kwargs.get("skipRucio", False):
        self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                           hostUrl=self.msConfig['rucioUrl'],
                           authUrl=self.msConfig['rucioAuthUrl'],
                           configDict={"logger": self.logger,
                                       "user_agent": "wmcore-microservices"})
    elif not kwargs.get("skipPhEDEx", False):
        # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
        dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                             dbsUrl=dbsUrl, logger=self.logger)
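
# A minimal sketch of the msConfig dictionary read by the constructor above
# (keys taken from the accesses shown; the URLs are placeholders). The class
# name MSCore is assumed from the MSMonitor(MSCore) subclass further below.
msConfig = {
    "verbose": False,                                   # logger verbosity
    "reqmgr2Url": "https://cmsweb.cern.ch/reqmgr2",     # ReqMgr2/ReqMgrAux endpoint
    "useRucio": True,                                   # pick Rucio over PhEDEx
    "rucioAccount": "wmcore_transferor",
    "rucioUrl": "http://cms-rucio.cern.ch",             # placeholder host
    "rucioAuthUrl": "https://cms-rucio-auth.cern.ch",   # placeholder auth host
}

# external services can be selectively skipped, e.g. in unit tests
core = MSCore(msConfig, skipReqMgr=True, skipReqMgrAux=True,
              skipRucio=True, skipPhEDEx=True)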
def __init__(self, **kwargs):
    if not kwargs.get('logger'):
        import logging
        kwargs['logger'] = logging
    self.logger = kwargs['logger']
    self.rucio = Rucio(kwargs.get("rucioAccount", "wmcore_transferor"),
                       configDict=dict(logger=self.logger))
    # this will break all in one test
    self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

    centralurl = kwargs.get("central_logdb_url", "")
    identifier = kwargs.get("log_reporter", "")

    # set the thread name before creating the log db,
    # but only when it is not set already
    myThread = threading.currentThread()
    if myThread.getName() == "MainThread":
        myThread.setName(self.__class__.__name__)

    self.logdb = LogDB(centralurl, identifier, logger=self.logger)
def testConfig(self): """ Test service attributes and the override mechanism """ for key in self.defaultArgs: self.assertEqual(getattr(self.myRucio.cli, key), self.defaultArgs[key]) self.assertTrue(getattr(self.myRucio.cli, "user_agent").startswith("wmcore-client/")) self.assertTrue(getattr(self.client, "user_agent").startswith("rucio-clients/")) newParams = {"host": 'http://cms-rucio-int.cern.ch', "auth_host": 'https://cms-rucio-auth-int.cern.ch', "auth_type": "x509", "account": self.acct, "ca_cert": False, "timeout": 5, "phedexCompatible": False} newKeys = newParams.keys() newKeys.remove("phedexCompatible") rucio = Rucio(newParams['account'], hostUrl=newParams['host'], authUrl=newParams['auth_host'], configDict=newParams) self.assertEqual(getattr(rucio, "phedexCompat"), False) for key in newKeys: self.assertEqual(getattr(rucio.cli, key), newParams[key])
class MSMonitor(MSCore): """ MSMonitor class provide whole logic behind the transferor monitoring module. """ def __init__(self, msConfig, logger=None): super(MSMonitor, self).__init__(msConfig, logger=logger) # update interval is used to check records in CouchDB and update them # after this interval, default 6h self.updateInterval = self.msConfig.get('updateInterval', 6 * 60 * 60) self.rucio = Rucio(acct=self.msConfig['rucioAccount'], hostUrl=self.msConfig['rucioUrl'], authUrl=self.msConfig['rucioAuthUrl'], configDict={"logger": self.logger, "user_agent": "WMCore-MSMonitor"}) def updateCaches(self): """ Fetch some data required for the monitoring logic, e.g.: * all campaign configuration * all transfer records from backend DB :return: True if all of them succeeded, else False """ campaigns = self.reqmgrAux.getCampaignConfig("ALL_DOCS") transferRecords = self.reqmgrAux.getTransferInfo('ALL_DOCS') cdict = {} if not campaigns: self.logger.warning("Failed to fetch campaign configurations") if not transferRecords: self.logger.warning("Failed to fetch transfer records") else: for camp in campaigns: cdict[camp['CampaignName']] = camp return cdict, transferRecords def filterTransferDocs(self, requests, transferDocs): """ Given a list of requests in the `staging` status and all the transfer documents; select the transfer documents that: * match against a workflow in requests * haven't been updated over the last updateInterval seconds :param requests: list of workflow names :param transferDocs: list of transfer documents :return: a filtered out list of transfer documents """ now = time.time() newTransferDocs = [] self.logger.info("Matching %d requests to %d transfer documents...", len(requests), len(transferDocs)) for record in transferDocs: if record['workflowName'] in requests: if now - record['lastUpdate'] > self.updateInterval: newTransferDocs.append(record) msg = "Only %d transfer documents passed the status and timestamp filter." self.logger.info(msg, len(newTransferDocs)) return newTransferDocs def execute(self, reqStatus): """ Executes the MS monitoring logic, see https://github.com/dmwm/WMCore/wiki/ReqMgr2-MicroService-Monitor :param reqStatus: request status to process :return: a summary of the activity of the last cycle """ summary = dict(MONITOR_REPORT) try: # get requests from ReqMgr2 data-service for given status # here with detail=False we get back list of records requests = self.reqmgr2.getRequestByStatus([reqStatus], detail=False) self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) campaigns, transferRecords = self.updateCaches() self.updateReportDict(summary, "total_num_campaigns", len(campaigns)) self.updateReportDict(summary, "total_num_transfers", len(transferRecords)) if not campaigns or not transferRecords: # then wait until the next cycle msg = "Failed to fetch data from one of the data sources. Retrying again in the next cycle" self.logger.error(msg) self.updateReportDict(summary, "error", msg) return summary transferRecords = self.filterTransferDocs(requests, transferRecords) self.updateReportDict(summary, "filtered_transfer_docs", len(transferRecords)) except Exception as ex: # general error msg = 'Unknown exception bootstrapping the MSMonitor thread. 
Error: %s', str(ex) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) return summary try: # keep track of request and their new statuses skippedWorkflows = self.getTransferInfo(transferRecords) requestsToStage = self.getCompletedWorkflows(transferRecords, campaigns) failedDocs = self.updateTransferDocs(transferRecords, skippedWorkflows) self.updateReportDict(summary, "success_transfer_doc_update", len(transferRecords) - len(failedDocs) - len(skippedWorkflows)) self.updateReportDict(summary, "failed_transfer_doc_update", len(failedDocs)) # finally, update statuses for requests for reqName in requestsToStage: if reqName in failedDocs: msg = "Can't proceed with status transition for %s, because" % reqName msg += "the transfer document failed to get updated" self.logger.warning(msg) continue self.change(reqName, 'staged', self.__class__.__name__) self.updateReportDict(summary, "request_status_updated", summary['success_transfer_doc_update'] - summary['failed_transfer_doc_update']) msg = "%s processed %d transfer records, where " % (self.__class__.__name__, len(transferRecords)) msg += "%d completed their data transfers, " % len(requestsToStage) msg += "%d failed to contact the DM system and were skipped in this cycle and " % len(skippedWorkflows) msg += "%d failed to get their transfer documents updated in CouchDB." % len(failedDocs) self.logger.info(msg) except Exception as ex: msg = "Unknown exception processing the transfer records. Error: %s" % str(ex) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) return summary def getTransferInfo(self, transferRecords): """ Contact the data management tool in order to get a status update for the transfer request. :param transferRecords: list of transfer records :return skippedWorkflows: a list of workflow names which a call to the data management system did not succeed """ # FIXME: create concurrent rucio calls using multi_getdata skippedWorkflows = [] tstamp = int(time.time()) for doc in transferRecords: self.logger.debug("Checking transfers for: %s", doc['workflowName']) if not doc['transfers']: # nothing to be done, simply update the document last timestamp doc['lastUpdate'] = tstamp continue try: for rec in doc['transfers']: # obtain new transfer ids and completion for given dataset completion = self._getRucioTransferstatus(rec['transferIDs']) rec['completion'].append(round(completion, 3)) doc['lastUpdate'] = tstamp except Exception as exc: msg = "Unknown exception checking workflow %s. Error: %s" self.logger.exception(msg, doc['workflowName'], str(exc)) skippedWorkflows.append(doc['workflowName']) return skippedWorkflows def _getRucioTransferstatus(self, rulesList): """ Given a list of Rucio rules ID - for a given input data - check the overall transfer status from Rucio :param rulesList: list of rules ID :return: the overall transfers percent completion The Rucio getRule API returns data in the form of: {u'account': u'transfer_ops', u'grouping': u'ALL', u'id': u'40cbe787a42b4f6e991611f6fac3bb11', u'locked': True, u'locks_ok_cnt': 8, u'locks_replicating_cnt': 0, u'locks_stuck_cnt': 0, u'meta': None, etc etc NOTE: completion in Rucio is different than in PhEDEx. PhEDEx gives the percentage value; while Rucio gives the ratio (0 - 1). """ completion = [] for ruleID in rulesList: # if we query by dataset and the subscription was at block level, # we get an empty response. 
So always wildcard the block parameter data = self.rucio.getRule(ruleID) if not data: msg = "Failed to retrieve rule information from Rucio for rule ID: {}".format(ruleID) raise RuntimeError(msg) if data['state'] == "OK": lockCompletion = 100.0 else: totalLocks = data['locks_ok_cnt'] + data['locks_replicating_cnt'] + data['locks_stuck_cnt'] try: lockCompletion = (data['locks_ok_cnt'] / totalLocks) * 100 except ZeroDivisionError: self.logger.warning("Rule does not have any lock counts yet. Rule data: %s", data) lockCompletion = 0 completion.append(lockCompletion) self.logger.info("Rule ID: %s has a completion rate of: %s%%", ruleID, lockCompletion) self.logger.debug("Rule ID: %s, DID: %s, state: %s, grouping: %s, rse_expression: %s", ruleID, data['name'], data['state'], data['grouping'], data['rse_expression']) if not completion: return 0 return sum(completion) / len(completion) def getCompletedWorkflows(self, transfers, campaigns): """ Parse the transfer documents, compare against the campaign settings and decide whether the workflow is completed or not. :param transfers: list of transfers records :param campaigns: dictionary of campaigns :return: completion status """ completedWfs = [] for record in transfers: reqName = record['workflowName'] if not record['transfers']: self.logger.info("%s OK, no input data transfers, move it on.", reqName) completedWfs.append(reqName) continue # check completion of all transfers statuses = [] for transfer in record['transfers']: cdict = campaigns[transfer['campaignName']] # compare against the last completion number, which is from the last cycle execution if transfer['completion'][-1] >= cdict['PartialCopy'] * 100: status = 1 else: status = 0 statuses.append(status) if all(statuses): self.logger.info("%s OK, all transfers completed or above threshold, move it on.", reqName) completedWfs.append(reqName) return completedWfs def updateTransferDocs(self, docs, workflowsToSkip): """ Given a list of transfer documents, update all of them in ReqMgrAux database. :param docs: list of transfer docs :param workflowsToSkip: list of workflow names that should not be updated in CouchDB :return: a list of request names that failed to be updated """ failedWfs = [] for rec in docs: if rec['workflowName'] in workflowsToSkip: self.logger.warning("Not updating transfer record in CouchDB for: %s", rec['workflowName']) continue if not self.reqmgrAux.updateTransferInfo(rec['workflowName'], rec): # then it failed to update the doc, ReqMgrAux client is logging it already failedWfs.append(rec['workflowName']) return failedWfs
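
# The per-rule completion computed in _getRucioTransferstatus reduces to the
# fraction of OK locks, averaged over all rule IDs. A standalone sketch of that
# arithmetic with fabricated lock counts (not real Rucio output):
def ruleCompletion(ruleData):
    """Return the percent completion for a single Rucio rule record."""
    if ruleData['state'] == "OK":
        return 100.0
    totalLocks = (ruleData['locks_ok_cnt'] +
                  ruleData['locks_replicating_cnt'] +
                  ruleData['locks_stuck_cnt'])
    if not totalLocks:
        return 0.0
    return 100.0 * ruleData['locks_ok_cnt'] / totalLocks


# fabricated example: one finished rule and one still replicating
rules = [{'state': "OK", 'locks_ok_cnt': 8, 'locks_replicating_cnt': 0, 'locks_stuck_cnt': 0},
         {'state': "REPLICATING", 'locks_ok_cnt': 3, 'locks_replicating_cnt': 1, 'locks_stuck_cnt': 0}]
overall = sum(ruleCompletion(rule) for rule in rules) / len(rules)
print(round(overall, 3))   # 87.5 -> average of 100% and 75%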
def loggerSetup(logLevel=logging.INFO):
    """
    Return a logger which writes everything to stdout.
    """
    logger = logging.getLogger(__name__)
    outHandler = logging.StreamHandler(sys.stdout)
    outHandler.setFormatter(logging.Formatter("%(asctime)s:%(levelname)s:%(module)s: %(message)s"))
    outHandler.setLevel(logLevel)
    logger.addHandler(outHandler)
    logger.setLevel(logLevel)
    return logger


if __name__ == '__main__':
    args = parseArgs()
    logger = loggerSetup()

    rucio = Rucio(acct=RUCIO_ACCT, hostUrl=RUCIO_URL, authUrl=RUCIO_AUTH_URL,
                  configDict={"logger": logger, "user_agent": "amaltaro/makeRucioRules"})

    rule = {'copies': 1,
            'activity': 'Production Input',
            'lifetime': None,
            'account': RUCIO_ACCT,
            'grouping': "ALL",
            'comment': 'WMCore MSTransferor input data placement'}
    logger.info("\nCreating rule for DID: %s, with RSE: %s and other attrs: %s",
                args.container, args.rse, rule)
    resp = rucio.createReplicationRule(args.container, args.rse, **rule)
    logger.info("Response: %s", resp)
class PileupFetcher(FetcherInterface): """ Pull dataset block/SE : LFN list from DBS for the pileup datasets required by the steps in the job. Save these maps as files in the sandbox """ def __init__(self): """ Prepare module setup """ super(PileupFetcher, self).__init__() # FIXME: find a way to pass the Rucio account name to this fetcher module self.rucioAcct = "wmcore_transferor" self.rucio = Rucio(self.rucioAcct) def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader): """ Method iterates over components of the pileup configuration input and queries DBS for valid files in the dataset, plus some extra information about each file. Information is organized at block level, listing all its files, number of events in the block, and its data location (to be resolved by a different method using either PhEDEx or Rucio), such as: {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": [], "NumberOfEvents": 123}, "BlockB": {"FileList": [], "PhEDExNodeName": []}, ....} """ resultDict = {} # iterate over input pileup types (e.g. "cosmics", "minbias") for pileupType in stepHelper.data.pileup.listSections_(): # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set] datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset") # each dataset input can generally be a list, iterate over dataset names blockDict = {} for dataset in datasets: for fileInfo in dbsReader.getFileListByDataset(dataset=dataset, detail=True): blockDict.setdefault(fileInfo['block_name'], {'FileList': [], 'NumberOfEvents': 0, 'PhEDExNodeNames': []}) blockDict[fileInfo['block_name']]['FileList'].append(fileInfo['logical_file_name']) blockDict[fileInfo['block_name']]['NumberOfEvents'] += fileInfo['event_count'] self._getDatasetLocation(dataset, blockDict) resultDict[pileupType] = blockDict return resultDict def _getDatasetLocation(self, dset, blockDict): """ Given a dataset name, query PhEDEx or Rucio and resolve the block location :param dset: string with the dataset name :param blockDict: dictionary with DBS summary info :return: update blockDict in place """ blockReplicas = self.rucio.getPileupLockedAndAvailable(dset, account=self.rucioAcct) for blockName, blockLocation in viewitems(blockReplicas): try: blockDict[blockName]['PhEDExNodeNames'] = list(blockLocation) except KeyError: logging.warning("Block '%s' present in Rucio but not in DBS", blockName) def _getCacheFilePath(self, stepHelper): fileName = "" for pileupType in stepHelper.data.pileup.listSections_(): datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset") fileName += ("_").join(datasets) # TODO cache is not very effective if the dataset combination is different between workflow # here is possibility of hash value collision cacheFile = "%s/pileupconf-%s.json" % (self.cacheDirectory(), hash(fileName)) return cacheFile def _getStepFilePath(self, stepHelper): stepPath = "%s/%s" % (self.workingDirectory(), stepHelper.name()) fileName = "%s/%s" % (stepPath, "pileupconf.json") return fileName def _writeFile(self, filePath, jsonPU): directory = filePath.rsplit('/', 1)[0] if not os.path.exists(directory): os.mkdir(directory) try: with open(filePath, 'w') as f: f.write(jsonPU) except IOError: m = "Could not save pileup JSON configuration file: '%s'" % filePath raise RuntimeError(m) def _copyFile(self, src, dest): directory = dest.rsplit('/', 1)[0] if not os.path.exists(directory): os.mkdir(directory) shutil.copyfile(src, dest) def _isCacheExpired(self, cacheFilePath, delta=24): """Is the cache expired? 
At delta hours (default 24) in the future. """ # cache can either be a file name or an already opened file object if not os.path.exists(cacheFilePath): return True delta = datetime.timedelta(hours=delta) t = datetime.datetime.now() - delta # cache file mtime has been set to cache expiry time if os.path.getmtime(cacheFilePath) < time.mktime(t.timetuple()): return True return False def _isCacheValid(self, stepHelper): """ Check whether cache is exits TODO: if the cacheDirectory is not inside the Sandbox it should not autormatically deleted. We can add cache refresh policy here """ cacheFile = self._getCacheFilePath(stepHelper) if not self._isCacheExpired(cacheFile, delta=0.5) and os.path.getsize(cacheFile) > 0: # if file already exist don't make a new dbs call and overwrite the file. # just return fileName = self._getStepFilePath(stepHelper) if not os.path.isfile(fileName) or os.path.getsize(fileName) != os.path.getsize(cacheFile): self._copyFile(cacheFile, fileName) return True else: return False def _saveFile(self, stepHelper, jsonPU): cacheFile = self._getCacheFilePath(stepHelper) self._writeFile(cacheFile, jsonPU) fileName = self._getStepFilePath(stepHelper) self._copyFile(cacheFile, fileName) def createPileupConfigFile(self, helper): """ Stores pileup JSON configuration file in the working directory / sandbox. """ if self._isCacheValid(helper): # if file already exist don't make a new dbs call and overwrite the file. # just return return encoder = JSONEncoder() # this should have been set in CMSSWStepHelper along with # the pileup configuration url = helper.data.dbsUrl dbsReader = DBSReader(url) configDict = self._queryDbsAndGetPileupConfig(helper, dbsReader) # create JSON and save into a file jsonPU = encoder.encode(configDict) self._saveFile(helper, jsonPU) def __call__(self, wmTask): """ Method is called when WorkQueue creates the sandbox for a job. Need to look at the pileup configuration in the spec and query dbs to determine the lfns for the files in the datasets and what sites they're located at (WQ creates the job sandbox). wmTask is instance of WMTask.WMTaskHelper """ for step in wmTask.steps().nodeIterator(): helper = WMStep.WMStepHelper(step) # returns e.g. instance of CMSSWHelper # doesn't seem to be necessary ... strangely (some inheritance involved?) # typeHelper = helper.getTypeHelper() if hasattr(helper.data, "pileup"): self.createPileupConfigFile(helper)
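
# For reference, the pileup configuration written to pileupconf.json follows
# the structure described in the docstring above. A small fabricated example
# (dataset, block, file and site names are placeholders):
examplePileupConfig = {
    "mc": {                                                  # pileup type, e.g. "mc"
        "/Primary/Processed-v1/GEN-SIM#block-uuid-1": {      # placeholder block name
            "FileList": [
                "/store/mc/Campaign/Primary/GEN-SIM/v1/0000/file1.root",
                "/store/mc/Campaign/Primary/GEN-SIM/v1/0000/file2.root",
            ],
            "NumberOfEvents": 123,
            "PhEDExNodeNames": ["T1_US_FNAL_Disk", "T2_CH_CERN"],
        },
    },
}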
class RucioTest(EmulatedUnitTestCase): """ Unit tests for Rucio Service module """ def __init__(self, methodName='runTest'): # TODO figure out what's going on with CRIC mock super(RucioTest, self).__init__(methodName=methodName, mockCRIC=False) self.acct = "wmagent_testing" # HACK: do not verify the SSL certificate because docker images # do not contain the CA certificate bundle # Relying on the config file in the jenkins infrastructure is a PITA # so let's make sure to pass all the necessary arguments self.creds = { "client_cert": os.getenv("X509_USER_CERT", "Unknown"), "client_key": os.getenv("X509_USER_KEY", "Unknown") } self.defaultArgs = { "host": 'http://cms-rucio-dev.cern.ch', "auth_host": 'https://cms-rucio-auth-dev.cern.ch', "auth_type": "x509", "account": self.acct, "ca_cert": False, "timeout": 30, "request_retries": 3, "creds": self.creds } def setUp(self): """ Setup for unit tests """ super(RucioTest, self).setUp() self.myRucio = Rucio(self.acct, hostUrl=self.defaultArgs['host'], authUrl=self.defaultArgs['auth_host'], configDict=self.defaultArgs) self.client = testClient(rucio_host=self.defaultArgs['host'], auth_host=self.defaultArgs['auth_host'], account=self.acct, ca_cert=self.defaultArgs['ca_cert'], auth_type=self.defaultArgs['auth_type'], creds=self.defaultArgs['creds'], timeout=self.defaultArgs['timeout']) def tearDown(self): """ Nothing to be done for this case """ pass def testConfig(self): """ Test service attributes and the override mechanism """ for key in self.defaultArgs: self.assertEqual(getattr(self.myRucio.cli, key), self.defaultArgs[key]) self.assertTrue( getattr(self.myRucio.cli, "user_agent").startswith("wmcore-client/")) self.assertTrue( getattr(self.client, "user_agent").startswith("rucio-clients/")) newParams = { "host": 'http://cms-rucio-dev.cern.ch', "auth_host": 'https://cms-rucio-auth-dev.cern.ch', "auth_type": "x509", "account": self.acct, "ca_cert": False, "timeout": 5, "phedexCompatible": False } newKeys = newParams.keys() newKeys.remove("phedexCompatible") rucio = Rucio(newParams['account'], hostUrl=newParams['host'], authUrl=newParams['auth_host'], configDict=newParams) self.assertEqual(getattr(rucio, "phedexCompat"), False) for key in newKeys: self.assertEqual(getattr(rucio.cli, key), newParams[key]) def testGetAccount(self): """ Test whether we can fetch data about a specific rucio account """ res = self.client.get_account(self.acct) res2 = self.myRucio.getAccount(self.acct) self.assertEqual(res['account'], self.acct) self.assertEqual(res['status'], "ACTIVE") self.assertEqual(res['account_type'], "USER") self.assertTrue({"status", "account", "account_type"}.issubset(set(res2.keys()))) self.assertTrue({self.acct, "ACTIVE", "USER"}.issubset(set(res2.values()))) # @attr('integration') def testWhoAmI(self): """ Test user mapping information from the request headers """ res = dict(self.client.whoami()) res2 = dict(self.myRucio.whoAmI()) self.assertTrue({"status", "account"}.issubset(set(res.keys()))) self.assertTrue(set(res.keys()) == set(res2.keys())) def testPing(self): """ Tests server ping """ res = self.client.ping() res2 = self.myRucio.pingServer() self.assertTrue("version" in res) self.assertItemsEqual(res, res2) def testGetBlocksInContainer(self): """ Test `getBlocksInContainer` method, the ability to retrieve blocks inside a container. 
""" # test a CMS dataset that does not exist res = self.myRucio.getBlocksInContainer("Alan") self.assertEqual(res, []) # provide a CMS block instead of a dataset res = self.myRucio.getBlocksInContainer(BLOCK) self.assertEqual(res, []) # finally provide a real CMS dataset res = self.myRucio.getBlocksInContainer(DSET) self.assertTrue(len(res) >= len([BLOCK])) self.assertIn(BLOCK, res) def testGetReplicaInfoForBlocks(self): """ Test `getReplicaInfoForBlocks` method, the ability to retrieve replica locations provided a dataset or block. Same output as PhEDEx. """ res = self.myRucio.getReplicaInfoForBlocks(block=BLOCK) self.assertEqual(len(res['phedex']['block']), 1) block = res['phedex']['block'].pop() self.assertEqual(block['name'], BLOCK) replicas = [item['node'] for item in block['replica']] self.assertTrue(len(replicas) > 0) # same test, but providing a dataset as input (which has 4 blocks) res = self.myRucio.getReplicaInfoForBlocks(dataset=DSET) self.assertTrue(len(res['phedex']['block']) >= 1) # at this very moment, there are 11 replicas blocks = [item['name'] for item in res['phedex']['block']] self.assertTrue(BLOCK in blocks) for item in res['phedex']['block']: self.assertTrue(len(item['replica']) > 0) def testGetReplicaInfoForBlocksRucio(self): """ Test `getReplicaInfoForBlocks` method, however not using the output compatibility with PhEDEx """ theseArgs = self.defaultArgs.copy() theseArgs['phedexCompatible'] = False myRucio = Rucio(self.acct, hostUrl=theseArgs['host'], authUrl=theseArgs['auth_host'], configDict=theseArgs) res = myRucio.getReplicaInfoForBlocks(dataset=DSET) self.assertTrue(isinstance(res, list)) self.assertTrue( len(res) >= 1) # at this very moment, there are 11 replicas blocks = [item['name'] for item in res] self.assertTrue(BLOCK in blocks) for item in res: self.assertTrue(len(item['replica']) > 0) def testGetPFN(self): """ Test `getPFN` method """ self.assertRaises(NotImplementedError, self.myRucio.getPFN)
class RucioTest(EmulatedUnitTestCase): """ Unit tests for Rucio Service module """ def __init__(self, methodName='runTest'): # TODO figure out what's going on with CRIC mock super(RucioTest, self).__init__(methodName=methodName, mockCRIC=False) self.acct = "wma_test" # HACK: do not verify the SSL certificate because docker images # do not contain the CA certificate bundle # Relying on the config file in the jenkins infrastructure is a PITA # so let's make sure to pass all the necessary arguments self.creds = { "client_cert": os.getenv("X509_USER_CERT", "Unknown"), "client_key": os.getenv("X509_USER_KEY", "Unknown") } self.defaultArgs = { "host": 'http://cms-rucio-int.cern.ch', "auth_host": 'https://cms-rucio-auth-int.cern.ch', "auth_type": "x509", "account": self.acct, "ca_cert": False, "timeout": 30, "request_retries": 3, "creds": self.creds } def setUp(self): """ Setup for unit tests """ super(RucioTest, self).setUp() self.myRucio = Rucio(self.acct, hostUrl=self.defaultArgs['host'], authUrl=self.defaultArgs['auth_host'], configDict=self.defaultArgs) self.client = testClient(rucio_host=self.defaultArgs['host'], auth_host=self.defaultArgs['auth_host'], account=self.acct, ca_cert=self.defaultArgs['ca_cert'], auth_type=self.defaultArgs['auth_type'], creds=self.defaultArgs['creds'], timeout=self.defaultArgs['timeout']) def tearDown(self): """ Nothing to be done for this case """ pass def testConfig(self): """ Test service attributes and the override mechanism """ for key in self.defaultArgs: self.assertEqual(getattr(self.myRucio.cli, key), self.defaultArgs[key]) self.assertTrue( getattr(self.myRucio.cli, "user_agent").startswith("wmcore-client/")) self.assertTrue( getattr(self.client, "user_agent").startswith("rucio-clients/")) newParams = { "host": 'http://cms-rucio-int.cern.ch', "auth_host": 'https://cms-rucio-auth-int.cern.ch', "auth_type": "x509", "account": self.acct, "ca_cert": False, "timeout": 5, "phedexCompatible": False } newKeys = newParams.keys() newKeys.remove("phedexCompatible") rucio = Rucio(newParams['account'], hostUrl=newParams['host'], authUrl=newParams['auth_host'], configDict=newParams) self.assertEqual(getattr(rucio, "phedexCompat"), False) for key in newKeys: self.assertEqual(getattr(rucio.cli, key), newParams[key]) def testGetAccount(self): """ Test whether we can fetch data about a specific rucio account """ res = self.client.get_account(self.acct) res2 = self.myRucio.getAccount(self.acct) self.assertEqual(res['account'], self.acct) self.assertEqual(res['status'], "ACTIVE") self.assertEqual(res['account_type'], "USER") self.assertTrue({"status", "account", "account_type"}.issubset(set(res2.keys()))) self.assertTrue({self.acct, "ACTIVE", "USER"}.issubset(set(res2.values()))) def testGetAccountUsage(self): """ Test whether we can fetch data about a specific rucio account """ res = list(self.client.get_account_usage(self.acct)) res2 = self.myRucio.getAccountUsage(self.acct) # I have manually created a rule for this account, so it will be there... 
self.assertEqual(res, res2) # now test against an account that either does not exist or that we cannot access res = self.myRucio.getAccountUsage("admin") self.assertIsNone(res) # @attr('integration') def testWhoAmI(self): """ Test user mapping information from the request headers """ res = dict(self.client.whoami()) res2 = dict(self.myRucio.whoAmI()) self.assertTrue({"status", "account"}.issubset(set(res.keys()))) self.assertTrue(set(res.keys()) == set(res2.keys())) def testPing(self): """ Tests server ping """ res = self.client.ping() res2 = self.myRucio.pingServer() self.assertTrue("version" in res) self.assertItemsEqual(res, res2) def testGetBlocksInContainer(self): """ Test `getBlocksInContainer` method, the ability to retrieve blocks inside a container. """ # test a CMS dataset that does not exist res = self.myRucio.getBlocksInContainer("Alan") self.assertEqual(res, []) # provide a CMS block instead of a dataset res = self.myRucio.getBlocksInContainer(BLOCK) self.assertEqual(res, []) # finally provide a real CMS dataset res = self.myRucio.getBlocksInContainer(DSET) self.assertTrue(len(res) >= len([BLOCK])) self.assertIn(BLOCK, res) def testGetReplicaInfoForBlocks(self): """ Test `getReplicaInfoForBlocks` method, the ability to retrieve replica locations provided a dataset or block. Same output as PhEDEx. """ res = self.myRucio.getReplicaInfoForBlocks(block=BLOCK) self.assertEqual(len(res['phedex']['block']), 1) block = res['phedex']['block'].pop() self.assertEqual(block['name'], BLOCK) replicas = [item['node'] for item in block['replica']] self.assertTrue(len(replicas) > 0) # same test, but providing a dataset as input (which has 4 blocks) res = self.myRucio.getReplicaInfoForBlocks(dataset=DSET) self.assertTrue(len(res['phedex']['block']) >= 1) # at this very moment, there are 11 replicas blocks = [item['name'] for item in res['phedex']['block']] self.assertTrue(BLOCK in blocks) for item in res['phedex']['block']: self.assertTrue(len(item['replica']) > 0) def testGetReplicaInfoForBlocksRucio(self): """ Test `getReplicaInfoForBlocks` method, however not using the output compatibility with PhEDEx """ theseArgs = self.defaultArgs.copy() theseArgs['phedexCompatible'] = False myRucio = Rucio(self.acct, hostUrl=theseArgs['host'], authUrl=theseArgs['auth_host'], configDict=theseArgs) res = myRucio.getReplicaInfoForBlocks(dataset=DSET) self.assertTrue(isinstance(res, list)) self.assertTrue( len(res) >= 1) # at this very moment, there are 11 replicas blocks = [item['name'] for item in res] self.assertTrue(BLOCK in blocks) for item in res: self.assertTrue(len(item['replica']) > 0) def testGetPFN(self): """ Test `getPFN` method """ self.assertRaises(NotImplementedError, self.myRucio.getPFN) def testListContent(self): """ Test `listContent` method, to list content of a given DID """ # listing blocks for a dataset res = self.myRucio.listContent(DSET) self.assertTrue(len(res) > 10) self.assertEqual(res[0]["type"], "DATASET") # listing files for a block res = self.myRucio.listContent(BLOCK) self.assertTrue(len(res) > 10) self.assertEqual(res[0]["type"], "FILE") res = self.myRucio.listContent("/Primary/ProcStr-v1/tier") self.assertItemsEqual(res, []) def testListDataRules(self): """ Test `listContent` method """ res = self.myRucio.listDataRules(DSET) self.assertItemsEqual(res, []) def testGetRule(self): """ Test `getRule` method """ # Badly formatted rule id, raises/catches a general exception res = self.myRucio.getRule("blah") self.assertItemsEqual(res, {}) # Properly formatted rule, but inexistent 
id res = self.myRucio.getRule("1d6ea1d916d5492e81b1bb30ed4aebc0") self.assertItemsEqual(res, {}) # Properly formatted rule, rule manually created res = self.myRucio.getRule("1d6ea1d916d5492e81b1bb30ed4aebc1") self.assertTrue(res) def testMetaDataValidation(self): """ Test the `validateMetaData` validation function """ for thisProj in RUCIO_VALID_PROJECT: response = validateMetaData("any_DID_name", dict(project=thisProj), self.myRucio.logger) self.assertTrue(response) # test with no "project" meta data at all response = validateMetaData("any_DID_name", dict(), self.myRucio.logger) self.assertTrue(response) # now an invalid "project" meta data response = validateMetaData("any_DID_name", dict(project="mistake"), self.myRucio.logger) self.assertFalse(response)
class RucioInjectorPoller(BaseWorkerThread): """ _RucioInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. The logic of this component is: * create a rucio container (or reuse a pre-existent one) * create a CMS block (or reuse a pre-existent one), block gets automatically attached * create file/replicas, which get automatically attached to its block as well * now create a CMS block rule to protect this data * if the block has been inserted into DBS, close the block in Rucio In addition to that, it has logic for rucio container subscription (rule creation), and block rule removal. Those follow a different polling cycle though. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) self.enabled = config.RucioInjector.enabled # dataset rule creation has a larger polling cycle self.pollRules = config.RucioInjector.pollIntervalRules self.lastRulesExecTime = 0 self.createBlockRules = config.RucioInjector.createBlockRules self.skipRulesForTiers = config.RucioInjector.skipRulesForTiers self.listTiersToInject = config.RucioInjector.listTiersToInject # setup cache for container and blocks (containers can be much longer, make 6 days now) self.containersCache = MemoryCache( config.RucioInjector.cacheExpiration * 3, set()) self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration, set()) self.scope = getattr(config.RucioInjector, "scope", "cms") self.rucioAcct = config.RucioInjector.rucioAccount self.rucio = Rucio(acct=self.rucioAcct, hostUrl=config.RucioInjector.rucioUrl, authUrl=config.RucioInjector.rucioAuthUrl, configDict={'logger': self.logger}) # metadata dictionary information to be added to block/container rules # cannot be a python dictionary, but a JSON string instead self.metaData = json.dumps( dict(agentHost=config.Agent.hostName, userAgent=config.Agent.agentName)) self.testRSEs = config.RucioInjector.RSEPostfix self.filesToRecover = [] logging.info( "Component configured to only inject data for data tiers: %s", self.listTiersToInject) logging.info( "Component configured to skip container rule creation for data tiers: %s", self.skipRulesForTiers) logging.info("Component configured to create block rules: %s", self.createBlockRules) def setup(self, parameters): """ _setup_ Create DAO Factory and setup some DAO. """ myThread = threading.currentThread() daofactory = DAOFactory(package="WMComponent.RucioInjector.Database", logger=self.logger, dbinterface=myThread.dbi) self.getUninjected = daofactory(classname="GetUninjectedFiles") self.getMigrated = daofactory(classname="GetMigratedBlocks") self.getUnsubscribedBlocks = daofactory( classname="GetUnsubscribedBlocks") self.setBlockRules = daofactory(classname="SetBlocksRule") self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks") self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted") self.getUnsubscribedDsets = daofactory( classname="GetUnsubscribedDatasets") self.markSubscribed = daofactory(classname="MarkDatasetSubscribed") daofactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=self.logger, dbinterface=myThread.dbi) self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname="SetBlockClosed") @timeFunction def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and inject them into Rucio. """ if not self.enabled: logging.info( "RucioInjector component is disabled in the configuration, exiting." 
) return logging.info("Running Rucio injector poller algorithm...") try: # files that failed to get their status updated in dbsbuffer self._updateLFNState(self.filesToRecover, recovery=True) # get dbsbuffer_file.in_phedex = 0 uninjectedFiles = self.getUninjected.execute() # while we commission Rucio within WM, not all datatiers are supposed # to be injected by this component. Remove any data that we are not # meant to process! uninjectedFiles = filterDataByTier(uninjectedFiles, self.listTiersToInject) # create containers in rucio (and update local cache) containersAdded = self.insertContainers(uninjectedFiles) if self.containersCache.isCacheExpired(): self.containersCache.setCache(containersAdded) else: self.containersCache.addItemToCache(containersAdded) # create blocks. Only update the cache once a rule gets created... blocksAdded = self.insertBlocks(uninjectedFiles) if self.blocksCache.isCacheExpired(): self.blocksCache.setCache(blocksAdded) else: self.blocksCache.addItemToCache(blocksAdded) # create file replicas self.insertReplicas(uninjectedFiles) # now close blocks already uploaded to DBS self.closeBlocks() if self.lastRulesExecTime + self.pollRules <= int(time.time()): self.insertContainerRules() self.insertBlockRules() self.deleteBlocks() except Exception as ex: msg = "Caught unexpected exception in RucioInjector. Details:\n%s" % str( ex) logging.exception(msg) raise RucioInjectorException(msg) return def insertContainers(self, uninjectedData): """ This method will insert containers into Rucio, provided they cannot be found in the local cache. :param uninjectedData: same data as it's returned from the uninjectedFiles :return: set of containers successfully inserted into Rucio """ logging.info("Preparing to insert containers into Rucio...") newContainers = set() for location in uninjectedData: for container in uninjectedData[location]: # same container can be at multiple locations if container not in self.containersCache and container not in newContainers: if self.rucio.createContainer(container): logging.info("Container %s inserted into Rucio", container) newContainers.add(container) else: logging.error("Failed to create container: %s", container) logging.info("Successfully inserted %d containers into Rucio", newContainers) return newContainers def insertBlocks(self, uninjectedData): """ This method will insert blocks into Rucio and attach them to their correspondent containers, when attaching this block, we also need to provide the RSE that it will be available. 
:param uninjectedData: same data as it's returned from the uninjectedFiles :return: a dictionary of successfully inserted blocks and their correspondent location """ logging.info("Preparing to insert blocks into Rucio...") newBlocks = set() for location in uninjectedData: rseName = "%s_Test" % location if self.testRSEs else location for container in uninjectedData[location]: for block in uninjectedData[location][container]: if block not in self.blocksCache: if self.rucio.createBlock(block, rse=rseName): logging.info("Block %s inserted into Rucio", block) newBlocks.add(block) else: logging.error("Failed to create block: %s", block) logging.info("Successfully inserted %d blocks into Rucio", newBlocks) return newBlocks # TODO: this will likely go away once the phedex to rucio migration is over def _isBlockTierAllowed(self, blockName): """ Performs a couple of checks on the block datatier, such as: * is the datatier supposed to be injected by this component * is the datatier supposed to get rules created by this component :return: True if the component can proceed with this block, False otherwise """ endBlock = blockName.rsplit('/', 1)[1] endTier = endBlock.split('#')[0] if endTier not in self.listTiersToInject: return False if endTier in self.skipRulesForTiers: return False return True def insertBlockRules(self): """ Creates a simple replication rule for every single block that is under production in a given site/RSE. Also persist the rule ID in the database. """ if not self.createBlockRules: return logging.info("Preparing to create block rules into Rucio...") unsubBlocks = self.getUnsubscribedBlocks.execute() for item in unsubBlocks: if not self._isBlockTierAllowed(item['blockname']): logging.debug( "Component configured to skip block rule for: %s", item['blockname']) continue rseName = "%s_Test" % item['pnn'] if self.testRSEs else item['pnn'] # DATASET = replicates all files in the same block to the same RSE resp = self.rucio.createReplicationRule( item['blockname'], rseExpression="rse=%s" % rseName, account=self.rucioAcct, grouping="DATASET", comment="WMAgent production site", meta=self.metaData) if resp: msg = "Block rule created for block: %s, at: %s, with rule id: %s" logging.info(msg, item['blockname'], item['pnn'], resp[0]) binds = {'RULE_ID': resp[0], 'BLOCKNAME': item['blockname']} self.setBlockRules.execute(binds) else: logging.error("Failed to create rule for block: %s at %s", item['blockname'], rseName) return def insertReplicas(self, uninjectedData): """ Inserts replicas into Rucio and attach them to its specific block. If the insertion succeeds, also switch their database state to injected. 
:param uninjectedData: dictionary with blocks as key, and RSEs as value """ # FIXME: I think we need a different data struct from the database # this method is very expensive O(n^4) logging.info("Preparing to insert replicas into Rucio...") for location in uninjectedData.keys(): rseName = "%s_Test" % location if self.testRSEs else location for container in uninjectedData[location]: for block in uninjectedData[location][container]: injectData = [] listLfns = [] for fileInfo in uninjectedData[location][container][block][ 'files']: listLfns.append(fileInfo['lfn']) injectData.append( dict(name=fileInfo['lfn'], scope=self.scope, bytes=fileInfo['size'], state="A", adler32=fileInfo['checksum']['adler32'])) if self.rucio.createReplicas(rse=rseName, files=injectData, block=block): logging.info( "Successfully inserted %d files on block %s", len(listLfns), block) self._updateLFNState(listLfns) return def _updateLFNState(self, listLfns, recovery=False): """ Given a list of LFNs, update their state in dbsbuffer table. :param listLfns: list of LFNs :param recovery: True if we are recovering previously injected files :return: nothing """ if not listLfns: return try: self.setStatus.execute(listLfns, 1) except Exception as ex: # save it to try to inject them again in the next cycle self.filesToRecover.extend(listLfns) if 'Deadlock found' in str(ex) or 'deadlock detected' in str(ex): logging.error( "Deadlock during file status update. Retrying again in the next cycle." ) self.filesToRecover.extend(listLfns) else: msg = "Failed to update file status in the database, reason: %s" % str( ex) logging.error(msg) raise RucioInjectorException(msg) else: if recovery: self.filesToRecover = [] def closeBlocks(self): """ Close any blocks that have been migrated to global DBS """ logging.info("Starting closeBlocks method") # in short, dbsbuffer_file.in_phedex = 1 AND dbsbuffer_block.status = 'InDBS' migratedBlocks = self.getMigrated.execute() ### FIXME the data format returned by this DAO for location in migratedBlocks: for container in migratedBlocks[location]: if not self._isContainerTierAllowed(container, checkRulesList=False): continue for block in migratedBlocks[location][container]: if self.rucio.closeBlockContainer(block): self.setBlockClosed.execute(block) else: logging.error( "Failed to close block: %s. Will retry again later.", block) def deleteBlocks(self): """ _deleteBlocks_ Find deletable blocks, then decide if to delete based on: Is there an active subscription for dataset or block ? If yes => set deleted=2 If no => next check Has transfer to all destinations finished ? If yes => request block deletion, approve request, set deleted=1 If no => do nothing (check again next cycle) """ # FIXME: figure out the proper logic for rule block deletion logging.info("Starting deleteBlocks methods --> IMPLEMENT-ME!!!") # TODO: this will likely go away once the phedex to rucio migration is over def _isContainerTierAllowed(self, containerName, checkRulesList=True): """ It compares the container datatier name to check whether the component should inject data for it or not. In addition to that, it can also evaluate whether it's allowed to create rules for such datatier or not. 
:param containerName: string with the name of the container :param checkRulesList: boolean to check or not against the list of tiers to be skipped in the rule creation :return: True if the component can proceed with this container, False otherwise """ endTier = containerName.rsplit('/', 1)[1] if endTier not in self.listTiersToInject: return False if checkRulesList and endTier in self.skipRulesForTiers: return False return True def insertContainerRules(self): """ _insertContainerRules_ Poll the database for datasets meant to be subscribed and create a container level rule to replicate all files to a given RSE """ logging.info("Starting insertContainerRules method") # FIXME also adapt the format returned by this DAO # Check for completely unsubscribed datasets # in short, files in phedex, file status in "GLOBAL" or "InDBS", and subscribed=0 unsubscribedDatasets = self.getUnsubscribedDsets.execute() # Keep a list of subscriptions to tick as subscribed in the database subscriptionsMade = [] # Create the subscription objects and add them to the list # The list takes care of the sorting internally for subInfo in unsubscribedDatasets: rse = subInfo['site'] container = subInfo['path'] if not self._isContainerTierAllowed(container): logging.debug( "Component configured to skip container rule for: %s", container) continue logging.info("Creating container rule for %s against RSE %s", container, rse) rseName = "%s_Test" % rse if self.testRSEs else rse # ALL = replicates all files to the same RSE resp = self.rucio.createReplicationRule( container, rseExpression="rse=%s" % rseName, account=self.rucioAcct, grouping="ALL", comment="WMAgent automatic container rule", meta=self.metaData) if resp: logging.info("Container rule created for %s under rule id: %s", container, resp) subscriptionsMade.append(subInfo['id']) else: logging.error("Failed to create rule for block: %s", container) # Register the result in DBSBuffer if subscriptionsMade: self.markSubscribed.execute(subscriptionsMade) return
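
# Both _isBlockTierAllowed and _isContainerTierAllowed boil down to parsing the
# datatier out of the DID name and checking it against the two configuration
# lists. A standalone sketch with made-up tier lists for illustration:
def tierAllowed(containerName, tiersToInject, skipRulesForTiers, checkRulesList=True):
    """Return True if this container datatier may be injected (and optionally get rules)."""
    datatier = containerName.rsplit('/', 1)[1]
    if datatier not in tiersToInject:
        return False
    if checkRulesList and datatier in skipRulesForTiers:
        return False
    return True


# made-up configuration lists
tiersToInject = ["AOD", "MINIAOD", "NANOAOD"]
skipRulesForTiers = ["NANOAOD"]

print(tierAllowed("/Primary/Processed-v1/MINIAOD", tiersToInject, skipRulesForTiers))   # True
print(tierAllowed("/Primary/Processed-v1/NANOAOD", tiersToInject, skipRulesForTiers))   # False
print(tierAllowed("/Primary/Processed-v1/RAW", tiersToInject, skipRulesForTiers))       # False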
class PileupFetcher(FetcherInterface): """ Pull dataset block/SE : LFN list from DBS for the pileup datasets required by the steps in the job. Save these maps as files in the sandbox """ def __init__(self): """ Prepare module setup """ super(PileupFetcher, self).__init__() if usingRucio(): # Too much work to pass the rucio account name all the way to here # just use the production rucio account for resolving pileup location self.rucio = Rucio("wma_prod", configDict={'phedexCompatible': False}) else: self.phedex = PhEDEx() # this will go away eventually def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader): """ Method iterates over components of the pileup configuration input and queries DBS. Then iterates over results from DBS. There needs to be a list of files and their locations for each dataset name. Use dbsReader the result data structure is a Python dict following dictionary: FileList is a list of LFNs {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []}, "BlockB": {"FileList": [], "PhEDExNodeName": []}, ....} this structure preserves knowledge of where particular files of dataset are physically (list of PNNs) located. DBS only lists sites which have all files belonging to blocks but e.g. BlockA of dataset DS1 may be located at site1 and BlockB only at site2 - it's possible that only a subset of the blocks in a dataset will be at a site. """ resultDict = {} # iterate over input pileup types (e.g. "cosmics", "minbias") for pileupType in stepHelper.data.pileup.listSections_(): # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set] datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset") # each dataset input can generally be a list, iterate over dataset names blockDict = {} for dataset in datasets: blockFileInfo = dbsReader.getFileListByDataset(dataset=dataset, detail=True) for fileInfo in blockFileInfo: blockDict.setdefault(fileInfo['block_name'], { 'FileList': [], 'NumberOfEvents': 0, 'PhEDExNodeNames': [] }) blockDict[fileInfo['block_name']]['FileList'].append( {'logical_file_name': fileInfo['logical_file_name']}) blockDict[fileInfo['block_name']][ 'NumberOfEvents'] += fileInfo['event_count'] self._getDatasetLocation(dataset, blockDict) resultDict[pileupType] = blockDict return resultDict def _getDatasetLocation(self, dset, blockDict): """ Given a dataset name, query PhEDEx or Rucio and resolve the block location :param dset: string with the dataset name :param blockDict: dictionary with DBS summary info :return: update blockDict in place """ node_filter = set(['UNKNOWN', None]) if hasattr(self, "rucio"): # then it's Rucio!! 
blockReplicasInfo = self.rucio.getReplicaInfoForBlocks( dataset=dset) for item in blockReplicasInfo: block = item['name'] try: blockDict[block]['PhEDExNodeNames'] = item['replica'] blockDict[block]['FileList'] = sorted( blockDict[block]['FileList']) except KeyError: logging.warning( "Block '%s' does not have any complete Rucio replica", block) else: blockReplicasInfo = self.phedex.getReplicaPhEDExNodesForBlocks( dataset=dset, complete='y') for block in blockReplicasInfo: nodes = set(blockReplicasInfo[block]) - node_filter try: blockDict[block]['PhEDExNodeNames'] = list(nodes) blockDict[block]['FileList'] = sorted( blockDict[block]['FileList']) except KeyError: logging.warning( "Block '%s' does not have any complete PhEDEx replica", block) def _getCacheFilePath(self, stepHelper): fileName = "" for pileupType in stepHelper.data.pileup.listSections_(): datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset") fileName += ("_").join(datasets) # TODO cache is not very effective if the dataset combination is different between workflow # here is possibility of hash value collision cacheFile = "%s/pileupconf-%s.json" % (self.cacheDirectory(), hash(fileName)) return cacheFile def _getStepFilePath(self, stepHelper): stepPath = "%s/%s" % (self.workingDirectory(), stepHelper.name()) fileName = "%s/%s" % (stepPath, "pileupconf.json") return fileName def _writeFile(self, filePath, jsonPU): directory = filePath.rsplit('/', 1)[0] if not os.path.exists(directory): os.mkdir(directory) try: with open(filePath, 'w') as f: f.write(jsonPU) except IOError: m = "Could not save pileup JSON configuration file: '%s'" % filePath raise RuntimeError(m) def _copyFile(self, src, dest): directory = dest.rsplit('/', 1)[0] if not os.path.exists(directory): os.mkdir(directory) shutil.copyfile(src, dest) def _isCacheExpired(self, cacheFilePath, delta=24): """Is the cache expired? At delta hours (default 24) in the future. """ # cache can either be a file name or an already opened file object if not os.path.exists(cacheFilePath): return True delta = datetime.timedelta(hours=delta) t = datetime.datetime.now() - delta # cache file mtime has been set to cache expiry time if os.path.getmtime(cacheFilePath) < time.mktime(t.timetuple()): return True return False def _isCacheValid(self, stepHelper): """ Check whether cache is exits TODO: if the cacheDirectory is not inside the Sandbox it should not autormatically deleted. We can add cache refresh policy here """ cacheFile = self._getCacheFilePath(stepHelper) if not self._isCacheExpired( cacheFile, delta=0.5) and os.path.getsize(cacheFile) > 0: # if file already exist don't make a new dbs call and overwrite the file. # just return fileName = self._getStepFilePath(stepHelper) if not os.path.isfile(fileName) or os.path.getsize( fileName) != os.path.getsize(cacheFile): self._copyFile(cacheFile, fileName) return True else: return False def _saveFile(self, stepHelper, jsonPU): cacheFile = self._getCacheFilePath(stepHelper) self._writeFile(cacheFile, jsonPU) fileName = self._getStepFilePath(stepHelper) self._copyFile(cacheFile, fileName) def createPileupConfigFile(self, helper): """ Stores pileup JSON configuration file in the working directory / sandbox. """ if self._isCacheValid(helper): # if file already exist don't make a new dbs call and overwrite the file. 
# just return return encoder = JSONEncoder() # this should have been set in CMSSWStepHelper along with # the pileup configuration url = helper.data.dbsUrl dbsReader = DBSReader(url) configDict = self._queryDbsAndGetPileupConfig(helper, dbsReader) # create JSON and save into a file jsonPU = encoder.encode(configDict) self._saveFile(helper, jsonPU) def __call__(self, wmTask): """ Method is called when WorkQueue creates the sandbox for a job. Need to look at the pileup configuration in the spec and query dbs to determine the lfns for the files in the datasets and what sites they're located at (WQ creates the job sandbox). wmTask is instance of WMTask.WMTaskHelper """ for step in wmTask.steps().nodeIterator(): helper = WMStep.WMStepHelper(step) # returns e.g. instance of CMSSWHelper # doesn't seem to be necessary ... strangely (some inheritance involved?) # typeHelper = helper.getTypeHelper() if hasattr(helper.data, "pileup"): self.createPileupConfigFile(helper)
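
# --- Illustrative sketch, not part of the component code ---
# Shape of the JSON document that createPileupConfigFile() writes to pileupconf.json,
# following the structure described in _queryDbsAndGetPileupConfig() above. Dataset,
# block, file and site names below are made up for this example.
examplePileupConfig = {
    "minbias": {
        "/MinBias/Era-v1/GEN-SIM#block-uuid-1": {
            "FileList": [{"logical_file_name": "/store/mc/Era/MinBias/GEN-SIM/v1/0000/file1.root"}],
            "NumberOfEvents": 5000,
            "PhEDExNodeNames": ["T1_US_FNAL_Disk", "T2_CH_CERN"],
        },
    },
}
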
    def executeInternal(self, *args, **kwargs):
        self.logger.info("Data discovery with DBS")  ## to be changed into debug

        dbsurl = self.config.Services.DBSUrl
        if kwargs['task']['tm_dbs_url']:
            dbsurl = kwargs['task']['tm_dbs_url']
        self.dbs = DBSReader(dbsurl)
        self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]

        isUserDataset = self.dbsInstance.split('/')[1] != 'global'
        # where to look up locations in the pre-Rucio world
        PhEDExOrDBS = 'PhEDEx' if not isUserDataset else 'DBS origin site'

        taskName = kwargs['task']['tm_taskname']
        userProxy = kwargs['task']['user_proxy']
        self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)

        inputDataset = kwargs['task']['tm_input_dataset']
        secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)

        self.checkDatasetStatus(inputDataset, kwargs)
        if secondaryDataset:
            self.checkDatasetStatus(secondaryDataset, kwargs)

        try:
            # Get the list of blocks for the locations.
            # The WMCore DBS3 implementation makes one call to DBS for each block
            # when using locations=True, so we use locations=False and look up the location later
            blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
            if secondaryDataset:
                secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
        except DBSReaderError as dbsexc:
            # dataset not found in DBS is a known use case
            if 'No matching data' in str(dbsexc):
                raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
            raise

        ## Create a map for block's locations: for each block get the list of locations.
        ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
        ## locations are found it gets the original locations from DBS. So it should
        ## never be the case at this point that some blocks have no locations.
        ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
        ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
        ##  '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}

        # For now apply Rucio data location only to NANOAOD*
        # in time useRucioForLocations may become a richer expression
        isNano = blocks[0].split("#")[0].split("/")[-1] in ["NANOAOD", "NANOAODSIM"]
        if isNano:
            self.logger.info("NANOAOD* dataset. Will use Rucio for data location")
        useRucioForLocations = isNano
        locationsFoundWithRucio = False

        if not useRucioForLocations:
            self.logger.info("Will not use Rucio for this dataset")
        # if locations should be in Rucio, try it first and fall back to old ways if Rucio calls fail
        # or if they return no locations (possible Rucio teething pain). If Rucio returns a list, trust it.
if useRucioForLocations: locationsMap = {} scope = "cms" # If the dataset is a USER one, use the Rucio user scope to find it # TODO: we need a way to enable users to indicate others user scopes as source if isUserDataset: scope = "user.%s" % kwargs['task']['tm_username'] rucio_config_dict = { "phedexCompatible": True, "auth_type": "x509", "ca_cert": self.config.Services.Rucio_caPath, "logger": self.logger, "creds": { "client_cert": self.config.TaskWorker.cmscert, "client_key": self.config.TaskWorker.cmskey } } try: self.logger.info("Initializing Rucio client") # WMCore is awfully verbose with tempSetLogLevel(logger=self.logger, level=logging.ERROR): rucioClient = Rucio( self.config.Services.Rucio_account, hostUrl=self.config.Services.Rucio_host, authUrl=self.config.Services.Rucio_authUrl, configDict=rucio_config_dict) rucioClient.whoAmI() self.logger.info( "Looking up data location with Rucio in %s scope.", scope) with tempSetLogLevel(logger=self.logger, level=logging.ERROR): locations = rucioClient.getReplicaInfoForBlocks( scope=scope, block=list(blocks)) except Exception as exc: msg = "Rucio lookup failed with\n%s" % str(exc) # TODO when removing fall-back to PhEDEx, this should be a fatal error # raise TaskWorkerException(msg) self.logger.warn(msg) locations = None # TODO when removing fall-back to PhEDEx, above code will raise if it fails, therefore # the following "if" must be removed and the code shifted left if locations: located_blocks = locations['phedex']['block'] for element in located_blocks: if element[ 'replica']: # only fill map for blocks which have at least one location locationsMap.update({ element['name']: [x['node'] for x in element['replica']] }) if locationsMap: locationsFoundWithRucio = True else: msg = "No locations found with Rucio for this dataset" # since NANO* are not in PhEDEx, this should be a fatal error if isNano: raise TaskWorkerException(msg) else: # note it down and try with PhEDEx self.logger.warn(msg) if not locationsFoundWithRucio: # fall back to pre-Rucio methods try: self.logger.info("Looking up data locations using %s", PhEDExOrDBS) locationsMap = self.dbs.listFileBlockLocation( list(blocks), dbsOnly=isUserDataset) except Exception as ex: raise TaskWorkerException( "The CRAB3 server backend could not get the location of the files from dbs nor phedex nor rucio.\n"+\ "This is could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex) ) # only fill map for blocks which have at least one location locationsMap = { key: value for key, value in locationsMap.iteritems() if value } if secondaryDataset: secondaryLocationsMap = {} # see https://github.com/dmwm/CRABServer/issues/6075#issuecomment-641569446 self.logger.info( "Trying data location of secondary blocks with Rucio") try: locations = rucioClient.getReplicaInfoForBlocks( scope=scope, block=list(secondaryBlocks)) except Exception as exc: locations = None secondaryLocationsMap = {} self.logger.warn("Rucio lookup failed with. %s", exc) if locations: located_blocks = locations['phedex']['block'] for element in located_blocks: if element[ 'replica']: # only fill map for blocks which have at least one location secondaryLocationsMap.update({ element['name']: [x['node'] for x in element['replica']] }) if not secondaryLocationsMap: msg = "No locations found with Rucio for secondaryDataset." 
# TODO when removing fall-back to PhEDEx, this should be a fatal error # raise TaskWorkerException(msg) self.logger.warn(msg) self.logger.info( "Trying data location of secondary blocks with PhEDEx") try: secondaryLocationsMap = self.dbs.listFileBlockLocation( list(secondaryBlocks), dbsOnly=isUserDataset) except Exception as ex: raise TaskWorkerException( "The CRAB3 server backend could not get the location of the secondary dataset files from dbs or phedex or rucio.\n" + \ "This is could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" + \ " and contact the experts if the error persists.\nError reason: %s" % str(ex) ) # only fill map for blocks which have at least one location secondaryLocationsMap = { key: value for key, value in secondaryLocationsMap.iteritems() if value } # From now on code is not dependent from having used Rucio or PhEDEx blocksWithLocation = locationsMap.keys() if secondaryDataset: secondaryBlocksWithLocation = secondaryLocationsMap.keys() self.keepOnlyDisks(locationsMap) if not locationsMap: msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset if self.tapeLocations: msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join( sorted(self.tapeLocations)) # submit request to DDM ddmRequest = None ddmServer = self.config.TaskWorker.DDMServer try: ddmRequest = blocksRequest(blocksWithLocation, ddmServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, verbose=False) except HTTPException as hte: self.logger.exception(hte) msg += "\nThe automatic stage-out failed, please try again later. If the error persists contact the experts and provide this error message:" msg += "\nHTTP Error while contacting the DDM server %s:\n%s" % ( ddmServer, str(hte)) msg += "\nHTTP Headers are: %s" % hte.headers msg += "\nYou might want to contact your physics group if you need a disk replica." 
raise TaskWorkerException(msg, retry=True) self.logger.info("Contacted %s using %s and %s, got:\n%s", self.config.TaskWorker.DDMServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, ddmRequest) # The query above returns a JSON with a format {"result": "OK", "message": "Copy requested", "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37", "last_request": "2018-02-26 23:57:37", "request_count": 1}]} if ddmRequest["result"] == "OK": # set status to TAPERECALL tapeRecallStatus = 'TAPERECALL' ddmReqId = ddmRequest["data"][0]["request_id"] configreq = { 'workflow': taskName, 'taskstatus': tapeRecallStatus, 'ddmreqid': ddmReqId, 'subresource': 'addddmreqid', } try: tapeRecallStatusSet = self.server.post( self.restURInoAPI + '/task', data=urllib.urlencode(configreq)) except HTTPException as hte: self.logger.exception(hte) msg = "HTTP Error while contacting the REST Interface %s:\n%s" % ( self.config.TaskWorker.restHost, str(hte)) msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % ( tapeRecallStatus, ddmReqId, taskName) msg += "\nHTTP Headers are: %s" % hte.headers raise TaskWorkerException(msg, retry=True) msg += "\nA disk replica has been requested on %s to CMS DDM (request ID: %d)" % ( ddmRequest["data"][0]["first_request"], ddmReqId) if tapeRecallStatusSet[2] == "OK": self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus) msg += "\nThis task will be automatically submitted as soon as the stage-out is completed." self.uploadWarning(msg, userProxy, taskName) raise TapeDatasetException(msg) else: msg += ", please try again in two days." else: msg += "\nThe disk replica request failed with this error:\n %s" % ddmRequest[ "message"] msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK." raise TaskWorkerException(msg) # will not need lumi info if user has asked for split by file with no run/lumi mask splitAlgo = kwargs['task']['tm_split_algo'] lumiMask = kwargs['task']['tm_split_args']['lumis'] runRange = kwargs['task']['tm_split_args']['runs'] needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != [] # secondary dataset access relies on run/lumi info if secondaryDataset: needLumiInfo = True if needLumiInfo: self.checkBlocksSize( blocksWithLocation ) # Interested only in blocks with locations, 'blocks' may contain invalid ones and trigger an Exception if secondaryDataset: self.checkBlocksSize(secondaryBlocksWithLocation) try: filedetails = self.dbs.listDatasetFileDetails( inputDataset, getParents=True, getLumis=needLumiInfo, validFileOnly=0) if secondaryDataset: moredetails = self.dbs.listDatasetFileDetails( secondaryDataset, getParents=False, getLumis=needLumiInfo, validFileOnly=0) for secfilename, secinfos in moredetails.items(): secinfos['lumiobj'] = LumiList( runsAndLumis=secinfos['Lumis']) self.logger.info( "Beginning to match files from secondary dataset") for dummyFilename, infos in filedetails.items(): infos['Parents'] = [] lumis = LumiList(runsAndLumis=infos['Lumis']) for secfilename, secinfos in moredetails.items(): if lumis & secinfos['lumiobj']: infos['Parents'].append(secfilename) self.logger.info("Done matching files from secondary dataset") kwargs['task']['tm_use_parent'] = 1 except Exception as ex: #TODO should we catch HttpException instead? 
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n" +
                                      "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)" +
                                      " and contact the experts if the error persists.\nError reason: %s" % str(ex))
        # TODO add the PhEDEx nodes so the user can check themselves
        if not filedetails:
            raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        ## Format the output creating the data structures required by WMCore. Filters out invalid files,
        ## files whose block has no location, and figures out the PSN
        result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                                   datasetfiles=filedetails, locations=locationsMap,
                                   tempDir=kwargs['tempDir'])

        if not result.result:
            raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" +
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        self.logger.debug("Got %s files", len(result.result.getFiles()))
        return result
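
# --- Illustrative sketch, not part of the task worker code ---
# How the phedex-compatible reply from rucioClient.getReplicaInfoForBlocks() is
# condensed into locationsMap in executeInternal() above; the reply content below
# is made up for the example.
exampleReply = {"phedex": {"block": [
    {"name": "/Primary/Era-v1/NANOAOD#abc", "replica": [{"node": "T2_CH_CERN"}, {"node": "T1_US_FNAL_Disk"}]},
    {"name": "/Primary/Era-v1/NANOAOD#def", "replica": []},  # no location -> dropped
]}}
exampleLocationsMap = {blk["name"]: [r["node"] for r in blk["replica"]]
                       for blk in exampleReply["phedex"]["block"] if blk["replica"]}
# -> {'/Primary/Era-v1/NANOAOD#abc': ['T2_CH_CERN', 'T1_US_FNAL_Disk']}
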
class StartPolicyInterface(PolicyInterface): """Interface for start policies""" def __init__(self, **args): # We need to pop this object instance from args because otherwise # the super class blows up when doing a deepcopy(args) self.rucio = args.pop("rucioObject", None) PolicyInterface.__init__(self, **args) self.workQueueElements = [] self.wmspec = None self.team = None self.initialTask = None self.splitParams = None self.dbs_pool = {} self.data = {} self.lumi = None self.couchdb = None self.rejectedWork = [] # List of inputs that were rejected self.badWork = [ ] # list of bad work unit (e.g. without any valid files) self.pileupData = {} self.cric = CRIC() # FIXME: for the moment, it will always use the default value self.rucioAcct = self.args.get("rucioAcct", "wmcore_transferor") if not self.rucio: self.rucio = Rucio(self.rucioAcct, configDict={'logger': self.logger}) def split(self): """Apply policy to spec""" raise NotImplementedError def validate(self): """Check params and spec are appropriate for the policy""" raise NotImplementedError def validateCommon(self): """Common validation stuff""" try: Lexicon.requestName(self.wmspec.name()) except Exception as ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Workflow name validation error: %s" % str(ex)) raise error if self.initialTask.siteWhitelist(): if isinstance(self.initialTask.siteWhitelist(), (newstr, bytes)): error = WorkQueueWMSpecError( self.wmspec, 'Invalid site whitelist: Must be tuple/list but is %s' % type(self.initialTask.siteWhitelist())) raise error try: [ Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist() ] except Exception as ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Site whitelist validation error: %s" % str(ex)) raise error else: error = WorkQueueWMSpecError( self.wmspec, "Site whitelist validation error: Empty site whitelist") raise error if self.initialTask.siteBlacklist(): if isinstance(self.initialTask.siteBlacklist(), (newstr, bytes)): error = WorkQueueWMSpecError( self.wmspec, 'Invalid site blacklist: Must be tuple/list but is %s' % type(self.initialTask.siteBlacklist())) raise error try: [ Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist() ] except Exception as ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Site blacklist validation error: %s" % str(ex)) raise error # splitter settings if self.args.get('SliceSize', 1) <= 0: error = WorkQueueWMSpecError( self.wmspec, 'Zero or negative SliceSize parameter') raise error if self.args.get('SubSliceSize', 1) <= 0: error = WorkQueueWMSpecError( self.wmspec, 'Zero or negative SubSliceSize parameter') raise error # check input dataset is valid try: if self.initialTask.getInputDatasetPath(): Lexicon.dataset(self.initialTask.getInputDatasetPath()) except Exception as ex: # can throw many errors e.g. AttributeError, AssertionError etc. error = WorkQueueWMSpecError( self.wmspec, "Dataset validation error: %s" % str(ex)) raise error # if pileup is found, check that they are valid datasets try: pileupDatasets = self.wmspec.listPileupDatasets() for dbsUrl in pileupDatasets: for dataset in pileupDatasets[dbsUrl]: Lexicon.dataset(dataset) except Exception as ex: # can throw many errors e.g. AttributeError, AssertionError etc. 
error = WorkQueueWMSpecError( self.wmspec, "Pileup dataset validation error: %s" % str(ex)) raise error def newQueueElement(self, **args): # DBS Url may not be available in the initial task # but in the pileup data (MC pileup) dbsUrl = self.initialTask.dbsUrl() if dbsUrl is None and self.pileupData: # Get the first DBS found dbsUrl = next(iter(self.wmspec.listPileupDatasets())) args.setdefault('Status', 'Available') args.setdefault('WMSpec', self.wmspec) args.setdefault('Task', self.initialTask) args.setdefault('RequestName', self.wmspec.name()) args.setdefault('TaskName', self.initialTask.name()) args.setdefault('Dbs', dbsUrl) args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist()) args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist()) args.setdefault('StartPolicy', self.wmspec.startPolicy()) args.setdefault('EndPolicy', self.wmspec.endPolicyParameters()) args.setdefault('Priority', self.wmspec.priority()) args.setdefault('PileupData', self.pileupData) if not args['Priority']: args['Priority'] = 0 ele = WorkQueueElement(**args) for data, sites in viewitems(ele['Inputs']): if not sites: raise WorkQueueWMSpecError( self.wmspec, 'Input data has no locations "%s"' % data) # catch infinite splitting loops if len(self.workQueueElements) > self.args.get('maxRequestSize', 1e8): raise WorkQueueWMSpecError( self.wmspec, 'Too many elements (%d)' % self.args.get('MaxRequestElements', 1e8)) self.workQueueElements.append(ele) def __call__(self, wmspec, task, data=None, mask=None, team=None, continuous=False, rucioObj=None): self.wmspec = wmspec # bring in spec specific settings self.args.update(self.wmspec.startPolicyParameters()) self.initialTask = task if data: self.data = data self.mask = mask self.validate() try: pileupDatasets = self.wmspec.listPileupDatasets() if pileupDatasets: self.pileupData = self.getDatasetLocations(pileupDatasets) self.split() # For known exceptions raise custom error that will fail the workflow. except dbsClientException as ex: # A dbs configuration error implies the spec is invalid error = WorkQueueWMSpecError(self.wmspec, "DBS config error: %s" % str(ex)) raise error except AssertionError as ex: # Assertion generally means validation of an input field failed error = WorkQueueWMSpecError(self.wmspec, "Assertion error: %s" % str(ex)) raise error except DBSReaderError as ex: # Hacky way of identifying non-existant data, DbsBadRequest chomped by DBSReader if 'Invalid parameters' in str(ex): data = task.data.input.pythonise_( ) if task.data.input else 'None' msg = """data: %s, mask: %s, pileup: %s. %s""" % ( str(data), str(mask), str(pileupDatasets), str(ex)) error = WorkQueueNoWorkError(self.wmspec, msg) raise error raise # propagate other dbs errors # if we have no new elements and we are not adding work to request # already running, then raise exception if not self.workQueueElements and not continuous: data = task.data.input.pythonise_() if task.data.input else 'None' msg = "Failed to add work. Input data: %s, mask: %s." 
% (str(data), str(mask)) error = WorkQueueNoWorkError(self.wmspec, msg) raise error return self.workQueueElements, self.rejectedWork, self.badWork def dbs(self, dbs_url=None): """Get DBSReader""" from WMCore.WorkQueue.WorkQueueUtils import get_dbs if dbs_url is None: dbs_url = self.initialTask.dbsUrl() return get_dbs(dbs_url) @staticmethod def supportsWorkAddition(): """Indicates if a given policy supports addition of new work""" return False def getMaskedBlocks(self, task, dbs, datasetPath): """ Get the blocks which pass the lumi mask restrictions. For each block return the list of lumis which were ok (given the lumi mask). The data structure returned is the following: { "block1" : {"file1" : LumiList(), "file5" : LumiList(), ...} "block2" : {"file2" : LumiList(), "file7" : LumiList(), ...} } """ # Get the task mask as a LumiList object to make operations easier maskedBlocks = {} taskMask = task.getLumiMask() # for performance reasons, we first get all the blocknames blocks = [ x['block_name'] for x in dbs.dbs.listBlocks(dataset=datasetPath) ] for block in blocks: fileLumis = dbs.dbs.listFileLumis(block_name=block, validFileOnly=1) for fileLumi in fileLumis: lfn = fileLumi['logical_file_name'] runNumber = str(fileLumi['run_num']) lumis = fileLumi['lumi_section_num'] fileMask = LumiList(runsAndLumis={runNumber: lumis}) commonMask = taskMask & fileMask if commonMask: maskedBlocks.setdefault(block, {}) maskedBlocks[block].setdefault(lfn, LumiList()) maskedBlocks[block][lfn] += commonMask return maskedBlocks def modifyPolicyForWorkAddition(self, inboxElement): """Set modifiers to the policy based on the inboxElement information so that after a splitting pass with this policy strictly new work is returned, the inbox element must have information about already existing work""" raise NotImplementedError( "This can't be called on a base StartPolicyInterface object") def newDataAvailable(self, task, inbound): """ Returns True if there is data in the future could be included as an element for the inbound parent. However it doesn't guarantee that the new data will be included if the inbound element is split (i.e. the new data could be open blocks for the Block policy). """ raise NotImplementedError( "This can't be called on a base StartPolicyInterface object") def getDatasetLocations(self, datasets): """ Returns a dictionary with the location of the datasets according to Rucio The definition of "location" here is a union of all sites holding at least part of the dataset (defined by the DATASET grouping). :param datasets: dictionary with a list of dataset names (key'ed by the DBS URL) :return: a dictionary of dataset locations, key'ed by the dataset name """ result = {} for dbsUrl in datasets: for datasetPath in datasets[dbsUrl]: locations = self.rucio.getDataLockedAndAvailable( name=datasetPath, account=self.rucioAcct) result[datasetPath] = self.cric.PNNstoPSNs(locations) return result def blockLocationRucioPhedex(self, blockName): """ Wrapper around Rucio and PhEDEx systems. Fetch the current location of the block name (if Rucio, also consider the locks made on that block) :param blockName: string with the block name :return: a list of RSEs """ location = self.rucio.getDataLockedAndAvailable(name=blockName, account=self.rucioAcct) return location
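
# --- Illustrative sketch, not part of the policy code ---
# Shape of the dictionary returned by getDatasetLocations() above: Rucio returns
# RSE/PNN names for each pileup dataset, which CRIC then maps to processing site
# names (PSNs) via PNNstoPSNs(). The names below are made up for the example.
examplePileupData = {
    "/MinBias/Era-v1/PREMIX": ["T1_US_FNAL", "T2_CH_CERN"],   # PSNs, not PNNs
}
# This is the structure that later ends up in each WorkQueueElement as 'PileupData'.
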
class RucioInjectorPoller(BaseWorkerThread): """ _RucioInjectorPoller_ Poll the DBSBuffer database and inject files as they are created. The logic of this component is: * create a rucio container (or reuse a pre-existent one) * create a CMS block (or reuse a pre-existent one), block gets automatically attached * create file/replicas, which get automatically attached to its block as well * now create a CMS block rule to protect this data * if the block has been inserted into DBS, close the block in Rucio In addition to that, it has logic for rucio container subscription (rule creation), and block rule removal. Those follow a different polling cycle though. """ def __init__(self, config): """ ___init___ Initialise class members """ BaseWorkerThread.__init__(self) # dataset rule creation has a larger polling cycle self.pollRules = config.RucioInjector.pollIntervalRules self.lastRulesExecTime = 0 self.createBlockRules = config.RucioInjector.createBlockRules self.containerDiskRuleParams = config.RucioInjector.containerDiskRuleParams self.containerDiskRuleRSEExpr = config.RucioInjector.containerDiskRuleRSEExpr if config.RucioInjector.metaDIDProject not in RUCIO_VALID_PROJECT: msg = "Component configured with an invalid 'project' DID: %s" raise RucioInjectorException(msg % config.RucioInjector.metaDIDProject) self.metaDIDProject = dict(project=config.RucioInjector.metaDIDProject) # setup cache for container and blocks (containers can be much longer, make 6 days now) self.containersCache = MemoryCache( config.RucioInjector.cacheExpiration * 3, set()) self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration, set()) self.scope = getattr(config.RucioInjector, "scope", "cms") self.rucioAcct = config.RucioInjector.rucioAccount self.rucio = Rucio(acct=self.rucioAcct, hostUrl=config.RucioInjector.rucioUrl, authUrl=config.RucioInjector.rucioAuthUrl, configDict={'logger': self.logger}) # metadata dictionary information to be added to block/container rules # cannot be a python dictionary, but a JSON string instead self.metaData = json.dumps( dict(agentHost=config.Agent.hostName, userAgent=config.Agent.agentName)) self.testRSEs = config.RucioInjector.RSEPostfix self.filesToRecover = [] # output data placement has a different behaviour between T0 and Production agents if hasattr(config, "Tier0Feeder"): logging.info("RucioInjector running on a T0 WMAgent") self.isT0agent = True else: self.isT0agent = False logging.info("Component configured to create block rules: %s", self.createBlockRules) def setup(self, parameters): """ _setup_ Create DAO Factory and setup some DAO. 
""" myThread = threading.currentThread() daofactory = DAOFactory(package="WMComponent.RucioInjector.Database", logger=self.logger, dbinterface=myThread.dbi) self.getUninjected = daofactory(classname="GetUninjectedFiles") self.getMigrated = daofactory(classname="GetMigratedBlocks") self.getUnsubscribedBlocks = daofactory( classname="GetUnsubscribedBlocks") self.setBlockRules = daofactory(classname="SetBlocksRule") self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks") self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted") self.getUnsubscribedDsets = daofactory( classname="GetUnsubscribedDatasets") self.markSubscribed = daofactory(classname="MarkDatasetSubscribed") daofactory = DAOFactory(package="WMComponent.DBS3Buffer", logger=self.logger, dbinterface=myThread.dbi) self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus") self.setBlockClosed = daofactory(classname="SetBlockClosed") @timeFunction def algorithm(self, parameters): """ _algorithm_ Poll the database for uninjected files and inject them into Rucio. """ logging.info("Running Rucio injector poller algorithm...") try: # files that failed to get their status updated in dbsbuffer self._updateLFNState(self.filesToRecover, recovery=True) # get dbsbuffer_file.in_phedex = 0 uninjectedFiles = self.getUninjected.execute() # create containers in rucio (and update local cache) containersAdded = self.insertContainers(uninjectedFiles) if self.containersCache.isCacheExpired(): self.containersCache.setCache(containersAdded) else: self.containersCache.addItemToCache(containersAdded) # create blocks. Only update the cache once a rule gets created... blocksAdded = self.insertBlocks(uninjectedFiles) if self.blocksCache.isCacheExpired(): self.blocksCache.setCache(blocksAdded) else: self.blocksCache.addItemToCache(blocksAdded) # create file replicas self.insertReplicas(uninjectedFiles) # now close blocks already uploaded to DBS self.closeBlocks() if self.lastRulesExecTime + self.pollRules <= int(time.time()): self.insertContainerRules() self.insertBlockRules() self.deleteBlocks() except Exception as ex: msg = "Caught unexpected exception in RucioInjector. Details:\n%s" % str( ex) logging.exception(msg) raise RucioInjectorException(msg) return def insertContainers(self, uninjectedData): """ This method will insert containers into Rucio, provided they cannot be found in the local cache. :param uninjectedData: same data as it's returned from the uninjectedFiles :return: set of containers successfully inserted into Rucio """ logging.info("Preparing to insert containers into Rucio...") newContainers = set() for location in uninjectedData: for container in uninjectedData[location]: # same container can be at multiple locations if container not in self.containersCache and container not in newContainers: if self.rucio.createContainer(container, meta=self.metaDIDProject): logging.info("Container %s inserted into Rucio", container) newContainers.add(container) else: logging.error("Failed to create container: %s", container) logging.info("Successfully inserted %d containers into Rucio", newContainers) return newContainers def insertBlocks(self, uninjectedData): """ This method will insert blocks into Rucio and attach them to their correspondent containers, when attaching this block, we also need to provide the RSE that it will be available. 
        :param uninjectedData: same data as it's returned from the uninjectedFiles
        :return: a dictionary of successfully inserted blocks and their corresponding location
        """
        logging.info("Preparing to insert blocks into Rucio...")
        newBlocks = set()
        for location in uninjectedData:
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    if block not in self.blocksCache:
                        if self.rucio.createBlock(block, rse=rseName, meta=self.metaDIDProject):
                            logging.info("Block %s inserted into Rucio", block)
                            newBlocks.add(block)
                        else:
                            logging.error("Failed to create block: %s", block)
        logging.info("Successfully inserted %d blocks into Rucio", len(newBlocks))
        return newBlocks

    def insertBlockRules(self):
        """
        Creates a simple replication rule for every single block that is
        under production in a given site/RSE. Also persists the rule ID
        in the database.
        """
        if not self.createBlockRules:
            return

        logging.info("Preparing to create block rules into Rucio...")
        unsubBlocks = self.getUnsubscribedBlocks.execute()

        for item in unsubBlocks:
            # first, check if the block has already been created in Rucio
            if not self.rucio.didExist(item['blockname']):
                logging.warning("Block: %s not yet in Rucio. Retrying later..", item['blockname'])
                continue
            kwargs = dict(activity="Production Output", account=self.rucioAcct,
                          grouping="DATASET", comment="WMAgent automatic container rule",
                          ignore_availability=True, meta=self.metaData)
            rseName = "%s_Test" % item['pnn'] if self.testRSEs else item['pnn']
            # DATASET = replicates all files in the same block to the same RSE
            resp = self.rucio.createReplicationRule(item['blockname'],
                                                    rseExpression=rseName, **kwargs)
            if resp:
                msg = "Block rule created for block: %s, at: %s, with rule id: %s"
                logging.info(msg, item['blockname'], item['pnn'], resp[0])
                binds = {'RULE_ID': resp[0], 'BLOCKNAME': item['blockname']}
                self.setBlockRules.execute(binds)
            else:
                logging.error("Failed to create rule for block: %s at %s", item['blockname'], rseName)
        return

    def insertReplicas(self, uninjectedData):
        """
        Inserts replicas into Rucio and attaches them to their specific block.
        If the insertion succeeds, also switch their database state to injected.
        :param uninjectedData: dictionary with blocks as key, and RSEs as value
        """
        # FIXME: I think we need a different data struct from the database
        # this method is very expensive O(n^4)
        logging.info("Preparing to insert replicas into Rucio...")

        for location in uninjectedData:
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    if block not in self.blocksCache:
                        logging.warning("Skipping %d file injection for block that failed to be added into Rucio: %s",
                                        len(uninjectedData[location][container][block]['files']), block)
                        continue
                    injectData = []
                    listLfns = []
                    for fileInfo in uninjectedData[location][container][block]['files']:
                        listLfns.append(fileInfo['lfn'])
                        injectData.append(dict(name=fileInfo['lfn'], scope=self.scope,
                                               bytes=fileInfo['size'], state="A",
                                               adler32=fileInfo['checksum']['adler32']))

                    if self.rucio.createReplicas(rse=rseName, files=injectData, block=block):
                        logging.info("Successfully inserted %d files on block %s", len(listLfns), block)
                        self._updateLFNState(listLfns)
        return

    def _updateLFNState(self, listLfns, recovery=False):
        """
        Given a list of LFNs, update their state in dbsbuffer table.
        :param listLfns: list of LFNs
        :param recovery: True if we are recovering previously injected files
        :return: nothing
        """
        if not listLfns:
            return
        try:
            self.setStatus.execute(listLfns, 1)
        except Exception as ex:
            if 'Deadlock found' in str(ex) or 'deadlock detected' in str(ex):
                logging.error("Deadlock during file status update. Retrying again in the next cycle.")
                # save them to try to inject them again in the next cycle
                self.filesToRecover.extend(listLfns)
            else:
                msg = "Failed to update file status in the database, reason: %s" % str(ex)
                logging.error(msg)
                raise RucioInjectorException(msg)
        else:
            if recovery:
                self.filesToRecover = []

    def closeBlocks(self):
        """
        Close any blocks that have been migrated to global DBS
        """
        logging.info("Starting closeBlocks method")
        # in short, dbsbuffer_file.in_phedex = 1 AND dbsbuffer_block.status = 'InDBS'
        migratedBlocks = self.getMigrated.execute()
        ### FIXME the data format returned by this DAO
        for location in migratedBlocks:
            for container in migratedBlocks[location]:
                for block in migratedBlocks[location][container]:
                    logging.info("Closing block: %s", block)
                    if self.rucio.closeBlockContainer(block):
                        self.setBlockClosed.execute(block)
                    else:
                        logging.error("Failed to close block: %s. Will retry again later.", block)

    def deleteBlocks(self):
        """
        _deleteBlocks_
        Find deletable blocks, then decide whether to delete based on:
        Has the transfer to all destinations finished?
          If yes => Delete rules associated with the block, set deleted=1
          If no  => do nothing (check again next cycle)
        """
        logging.info("Checking if there are block rules to be deleted...")
        # Get the list of blocks that can be deleted
        blockDict = self.findDeletableBlocks.execute(transaction=False)
        if not blockDict:
            logging.info("No candidate blocks found for rule deletion")
            return
        logging.info("Found %d candidate blocks for rule deletion", len(blockDict))

        blocksToDelete = []
        containerDict = {}
        # Populate containerDict, assigning each block to its corresponding container
        for blockName in blockDict:
            container = blockDict[blockName]['dataset']
            # If the container is not in the dictionary, create a new entry for it
            if container not in containerDict:
                # Set of sites to which the container needs to be transferred
                sites = set(x.replace("_MSS", "_Tape") for x in blockDict[blockName]['sites'])
                containerDict[container] = {'blocks': [], 'rse': sites}
            containerDict[container]['blocks'].append(blockName)

        for contName in containerDict:
            cont = containerDict[contName]
            # Check whether the container is not requested at any site.
            # This should never be triggered, but better safe than sorry
            if not cont['rse']:
                logging.warning("No rules for container: %s. Its blocks won't be deleted.", contName)
                continue
            try:
                # Get the RSEs at which each block is available
                availableRSEs = self.rucio.getReplicaInfoForBlocks(block=cont['blocks'])
            except Exception as exc:
                msg = "Failed to get replica info for blocks in container: %s.\n" % contName
                msg += "Will retry again in the next cycle. Error: %s" % str(exc)
                logging.error(msg)
                continue
            for blockRSEs in availableRSEs:
                # If the block is available at every RSE its container needs to be
                # transferred to, the block can be deleted
                blockSites = set(blockRSEs['replica'])
                if cont['rse'].issubset(blockSites):
                    blocksToDelete.append(blockRSEs['name'])

        # Delete agent created rules locking the block
        binds = []
        logging.info("Going to delete %d block rules", len(blocksToDelete))
        for block in blocksToDelete:
            try:
                rules = self.rucio.listDataRules(block, scope=self.scope, account=self.rucioAcct)
            except WMRucioException as exc:
                logging.warning("Unable to retrieve replication rules for block: %s. Will retry in the next cycle.", block)
            else:
                if not rules:
                    logging.info("Block rule for: %s has been deleted by previous cycles", block)
                    binds.append({'DELETED': 1, 'BLOCKNAME': block})
                    continue
                deletedRules = 0
                for rule in rules:
                    if self.rucio.deleteRule(rule['id'], purgeReplicas=True):
                        logging.info("Successfully deleted rule: %s, for block %s.", rule['id'], block)
                        deletedRules += 1
                    else:
                        logging.warning("Failed to delete rule: %s, for block %s. Will retry in the next cycle.", rule['id'], block)
                if deletedRules == len(rules):
                    binds.append({'DELETED': 1, 'BLOCKNAME': block})
                    logging.info("Successfully deleted all rules for block %s.", block)
        self.markBlocksDeleted.execute(binds)
        logging.info("Marked %d blocks as deleted in the database", len(binds))
        return

    def insertContainerRules(self):
        """
        Polls the database for containers meant to be subscribed and creates
        a container level rule to replicate all the files to a given RSE.
        It deals with both Central Production and T0 data rules, which require
        a different approach, such as:
         * Production Tape/Custodial data placement is skipped and data is marked as transferred
         * Production Disk/NonCustodial has a generic RSE expression and some rules override
           from the agent configuration (like number of copies, grouping and weight)
         * T0 Tape is created as defined, with a special rule activity for Tape
         * T0 Disk is created as defined, with a special rule activity for Disk/Export
        """
        logging.info("Starting insertContainerRules method")
        ruleComment = "WMAgent automatic container rule"
        if self.isT0agent:
            ruleComment = "T0 " + ruleComment

        # FIXME also adapt the format returned by this DAO
        # Check for completely unsubscribed datasets that are already marked as in_phedex = 1
        unsubscribedDatasets = self.getUnsubscribedDsets.execute()

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Create the subscription objects and add them to the list
        # The list takes care of the sorting internally
        for subInfo in unsubscribedDatasets:
            rseName = subInfo['site'].replace("_MSS", "_Tape")
            container = subInfo['path']

            # Skip central production Tape rules
            if not self.isT0agent and rseName.endswith("_Tape"):
                logging.info("Bypassing Production container Tape data placement for container: %s and RSE: %s",
                             container, rseName)
                subscriptionsMade.append(subInfo['id'])
                continue

            # then check if the container has already been created in Rucio
            if not self.rucio.didExist(container):
                logging.warning("Container: %s not yet in Rucio. Retrying later..", container)
                continue

            ruleKwargs = dict(ask_approval=False,
                              activity=self._activityMap(rseName),
                              account=self.rucioAcct,
                              grouping="ALL",
                              comment=ruleComment,
                              meta=self.metaData)
            if not rseName.endswith("_Tape"):
                # add extra parameters to the Disk rule as defined in the component configuration
                ruleKwargs.update(self.containerDiskRuleParams)
                if not self.isT0agent:
                    # destination for production Disk rules is always overwritten
                    rseName = self.containerDiskRuleRSEExpr
                    if self.testRSEs:
                        rseName = rseName.replace("cms_type=real", "cms_type=test")
            else:
                # then it's a T0 container placement
                ruleKwargs['priority'] = 4
                if self.testRSEs:
                    rseName = "%s_Test" % rseName

            # Check whether we need to ask for rule approval
            try:
                if self.rucio.requiresApproval(rseName):
                    ruleKwargs['ask_approval'] = True
            except WMRucioException as exc:
                msg = str(exc)
                msg += "\nUnable to check approval requirements. Will retry again in the next cycle."
                logging.error(msg)
                continue

            logging.info("Creating container rule for %s against RSE %s", container, rseName)
            logging.debug("Container rule will be created with keyword args: %s", ruleKwargs)
            try:
                resp = self.rucio.createReplicationRule(container, rseExpression=rseName, **ruleKwargs)
            except Exception:
                msg = "Failed to create container rule for (retrying with approval): %s" % container
                logging.warning(msg)
                ruleKwargs["ask_approval"] = True
                try:
                    resp = self.rucio.createReplicationRule(container, rseExpression=rseName, **ruleKwargs)
                except Exception as exc:
                    msg = "Failed once again to create container rule for: %s " % container
                    msg += "\nWill retry again in the next cycle. Error: %s" % str(exc)
                    logging.error(msg)
                    continue

            if resp:
                logging.info("Container rule created for %s under rule id: %s", container, resp)
                subscriptionsMade.append(subInfo['id'])
            else:
                logging.error("Failed to create rule for container: %s", container)

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade)
            logging.info("%d containers successfully locked in Rucio and local database",
                         len(subscriptionsMade))
        return

    def _activityMap(self, rseName):
        """
        Maps the WMAgent type (Production vs T0) and the RSE name to
        properly set the rule activity field
        :param rseName: a string with the RSE name
        :return: a string with the rule activity
        """
        if not self.isT0agent and not rseName.endswith("_Tape"):
            return "Production Output"
        elif self.isT0agent and rseName.endswith("_Tape"):
            return "T0 Tape"
        elif self.isT0agent:
            return "T0 Export"
        else:
            msg = "This code should never be reached. Report it to the developers. "
            msg += "Trying to create container rule for RSE name: {}".format(rseName)
            raise WMRucioException(msg)
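
# --- Illustrative sketch, not part of the component code ---
# How the Disk container rule parameters from the agent configuration are merged
# into the rule creation call in insertContainerRules() above. The parameter and
# account values shown here are assumptions for the example, not component defaults.
exampleContainerDiskRuleParams = {"copies": 2, "weight": "ddm_quota"}
exampleRuleKwargs = dict(ask_approval=False, activity="Production Output",
                         account="wmagent_prod", grouping="ALL",
                         comment="WMAgent automatic container rule")
exampleRuleKwargs.update(exampleContainerDiskRuleParams)
# exampleRuleKwargs is then passed on as:
#   rucio.createReplicationRule(container, rseExpression=<Disk RSE expression>, **exampleRuleKwargs)
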
class WMBSHelperTest(EmulatedUnitTestCase): def setUp(self): """ _setUp_ """ super(WMBSHelperTest, self).setUp() self.testInit = TestInitCouchApp(__file__) self.testInit.setLogging() self.testInit.setDatabaseConnection(destroyAllDatabase=True) self.testInit.setupCouch("wmbshelper_t/jobs", "JobDump") self.testInit.setupCouch("wmbshelper_t/fwjrs", "FWJRDump") self.testInit.setupCouch("config_test", "GroupUser", "ConfigCache") os.environ["COUCHDB"] = "wmbshelper_t" self.testInit.setSchema(customModules=[ "WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir", "WMCore.ResourceControl" ], useDefault=False) self.workDir = self.testInit.generateWorkDir() self.wmspec = self.createWMSpec() self.topLevelTask = getFirstTask(self.wmspec) self.inputDataset = self.topLevelTask.inputDataset() self.dataset = self.topLevelTask.getInputDatasetPath() self.dbs = DBSReader(self.inputDataset.dbsurl) self.rucioAcct = "wmcore_transferor" self.rucio = Rucio(self.rucioAcct) self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=threading.currentThread().logger, dbinterface=threading.currentThread().dbi) self.configFile = EmulatorSetup.setupWMAgentConfig() self.config = loadConfigurationFile(self.configFile) self.config.component_("JobSubmitter") self.config.JobSubmitter.submitDir = self.workDir self.config.JobSubmitter.submitScript = os.path.join( getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh') return def tearDown(self): """ _tearDown_ Clear out the database. """ self.testInit.clearDatabase() self.testInit.tearDownCouch() self.testInit.delWorkDir() EmulatorSetup.deleteConfig(self.configFile) super(WMBSHelperTest, self).tearDown() return def setupForKillTest(self, baAPI=None): """ _setupForKillTest_ Inject a workflow into WMBS that has a processing task, a merge task and a cleanup task. Inject files into the various tasks at various processing states (acquired, complete, available...). Also create jobs for each subscription in various states. 
""" myThread = threading.currentThread() daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) dummyLocationAction = daoFactory(classname="Locations.New") changeStateAction = daoFactory(classname="Jobs.ChangeState") resourceControl = ResourceControl() resourceControl.insertSite(siteName='site1', pnn='goodse.cern.ch', ceName='site1', plugin="TestPlugin") resourceControl.insertThreshold(siteName='site1', taskType='Processing', \ maxSlots=10000, pendingSlots=10000) userDN = 'someDN' userAction = daoFactory(classname="Users.New") userAction.execute(dn=userDN, group_name='DEFAULT', role_name='DEFAULT') inputFileset = Fileset("input") inputFileset.create() inputFileA = File("lfnA", locations="goodse.cern.ch") inputFileB = File("lfnB", locations="goodse.cern.ch") inputFileC = File("lfnC", locations="goodse.cern.ch") inputFileA.create() inputFileB.create() inputFileC.create() inputFileset.addFile(inputFileA) inputFileset.addFile(inputFileB) inputFileset.addFile(inputFileC) inputFileset.commit() unmergedOutputFileset = Fileset("unmerged") unmergedOutputFileset.create() unmergedFileA = File("ulfnA", locations="goodse.cern.ch") unmergedFileB = File("ulfnB", locations="goodse.cern.ch") unmergedFileC = File("ulfnC", locations="goodse.cern.ch") unmergedFileA.create() unmergedFileB.create() unmergedFileC.create() unmergedOutputFileset.addFile(unmergedFileA) unmergedOutputFileset.addFile(unmergedFileB) unmergedOutputFileset.addFile(unmergedFileC) unmergedOutputFileset.commit() mainProcWorkflow = Workflow(spec="spec1", owner="Steve", name="Main", task="Proc") mainProcWorkflow.create() mainProcMergeWorkflow = Workflow(spec="spec1", owner="Steve", name="Main", task="ProcMerge") mainProcMergeWorkflow.create() mainCleanupWorkflow = Workflow(spec="spec1", owner="Steve", name="Main", task="Cleanup") mainCleanupWorkflow.create() self.mainProcSub = Subscription(fileset=inputFileset, workflow=mainProcWorkflow, type="Processing") self.mainProcSub.create() self.mainProcSub.acquireFiles(inputFileA) self.mainProcSub.completeFiles(inputFileB) procJobGroup = JobGroup(subscription=self.mainProcSub) procJobGroup.create() self.procJobA = Job(name="ProcJobA") self.procJobA["state"] = "new" self.procJobA["location"] = "site1" self.procJobB = Job(name="ProcJobB") self.procJobB["state"] = "executing" self.procJobB["location"] = "site1" self.procJobC = Job(name="ProcJobC") self.procJobC["state"] = "complete" self.procJobC["location"] = "site1" self.procJobA.create(procJobGroup) self.procJobB.create(procJobGroup) self.procJobC.create(procJobGroup) self.mainMergeSub = Subscription(fileset=unmergedOutputFileset, workflow=mainProcMergeWorkflow, type="Merge") self.mainMergeSub.create() self.mainMergeSub.acquireFiles(unmergedFileA) self.mainMergeSub.failFiles(unmergedFileB) mergeJobGroup = JobGroup(subscription=self.mainMergeSub) mergeJobGroup.create() self.mergeJobA = Job(name="MergeJobA") self.mergeJobA["state"] = "exhausted" self.mergeJobA["location"] = "site1" self.mergeJobB = Job(name="MergeJobB") self.mergeJobB["state"] = "cleanout" self.mergeJobB["location"] = "site1" self.mergeJobC = Job(name="MergeJobC") self.mergeJobC["state"] = "new" self.mergeJobC["location"] = "site1" self.mergeJobA.create(mergeJobGroup) self.mergeJobB.create(mergeJobGroup) self.mergeJobC.create(mergeJobGroup) self.mainCleanupSub = Subscription(fileset=unmergedOutputFileset, workflow=mainCleanupWorkflow, type="Cleanup") self.mainCleanupSub.create() self.mainCleanupSub.acquireFiles(unmergedFileA) 
self.mainCleanupSub.completeFiles(unmergedFileB) cleanupJobGroup = JobGroup(subscription=self.mainCleanupSub) cleanupJobGroup.create() self.cleanupJobA = Job(name="CleanupJobA") self.cleanupJobA["state"] = "new" self.cleanupJobA["location"] = "site1" self.cleanupJobB = Job(name="CleanupJobB") self.cleanupJobB["state"] = "executing" self.cleanupJobB["location"] = "site1" self.cleanupJobC = Job(name="CleanupJobC") self.cleanupJobC["state"] = "complete" self.cleanupJobC["location"] = "site1" self.cleanupJobA.create(cleanupJobGroup) self.cleanupJobB.create(cleanupJobGroup) self.cleanupJobC.create(cleanupJobGroup) jobList = [ self.procJobA, self.procJobB, self.procJobC, self.mergeJobA, self.mergeJobB, self.mergeJobC, self.cleanupJobA, self.cleanupJobB, self.cleanupJobC ] changeStateAction.execute(jobList) if baAPI: for job in jobList: job['plugin'] = 'TestPlugin' job['userdn'] = userDN job['usergroup'] = 'DEFAULT' job['userrole'] = 'DEFAULT' job['custom']['location'] = 'site1' baAPI.createNewJobs(wmbsJobs=jobList) # We'll create an unrelated workflow to verify that it isn't affected # by the killing code. bogusFileset = Fileset("dontkillme") bogusFileset.create() bogusFileA = File("bogus/lfnA", locations="goodse.cern.ch") bogusFileA.create() bogusFileset.addFile(bogusFileA) bogusFileset.commit() bogusWorkflow = Workflow(spec="spec2", owner="Steve", name="Bogus", task="Proc") bogusWorkflow.create() self.bogusSub = Subscription(fileset=bogusFileset, workflow=bogusWorkflow, type="Processing") self.bogusSub.create() self.bogusSub.acquireFiles(bogusFileA) return def verifyFileKillStatus(self): """ _verifyFileKillStatus_ Verify that all files were killed correctly. The status of files in Cleanup and LogCollect subscriptions isn't modified. Status of already completed and failed files is not modified. Also verify that the bogus subscription is untouched. 
""" failedFiles = self.mainProcSub.filesOfStatus("Failed") acquiredFiles = self.mainProcSub.filesOfStatus("Acquired") completedFiles = self.mainProcSub.filesOfStatus("Completed") availableFiles = self.mainProcSub.filesOfStatus("Available") bogusAcquiredFiles = self.bogusSub.filesOfStatus("Acquired") self.assertEqual(len(availableFiles), 0, \ "Error: There should be no available files.") self.assertEqual(len(acquiredFiles), 0, \ "Error: There should be no acquired files.") self.assertEqual(len(bogusAcquiredFiles), 1, \ "Error: There should be one acquired file.") self.assertEqual(len(completedFiles), 3, \ "Error: There should be only one completed file.") goldenLFNs = ["lfnA", "lfnB", "lfnC"] for completedFile in completedFiles: self.assertTrue(completedFile["lfn"] in goldenLFNs, \ "Error: Extra completed file.") goldenLFNs.remove(completedFile["lfn"]) self.assertEqual(len(failedFiles), 0, \ "Error: There should be no failed files.") self.assertEqual(len(goldenLFNs), 0, \ "Error: Missing LFN") failedFiles = self.mainMergeSub.filesOfStatus("Failed") acquiredFiles = self.mainMergeSub.filesOfStatus("Acquired") completedFiles = self.mainMergeSub.filesOfStatus("Completed") availableFiles = self.mainMergeSub.filesOfStatus("Available") self.assertEqual(len(acquiredFiles), 0, \ "Error: Merge subscription should have 0 acq files.") self.assertEqual(len(availableFiles), 0, \ "Error: Merge subscription should have 0 avail files.") self.assertEqual(len(failedFiles), 1, \ "Error: Merge subscription should have 1 failed files.") self.assertEqual( list(failedFiles)[0]["lfn"], "ulfnB", "Error: Wrong failed file.") self.assertEqual(len(completedFiles), 2, \ "Error: Merge subscription should have 2 compl files.") goldenLFNs = ["ulfnA", "ulfnC"] for completedFile in completedFiles: self.assertTrue(completedFile["lfn"] in goldenLFNs, \ "Error: Extra complete file.") goldenLFNs.remove(completedFile["lfn"]) self.assertEqual(len(goldenLFNs), 0, \ "Error: Missing LFN") failedFiles = self.mainCleanupSub.filesOfStatus("Failed") acquiredFiles = self.mainCleanupSub.filesOfStatus("Acquired") completedFiles = self.mainCleanupSub.filesOfStatus("Completed") availableFiles = self.mainCleanupSub.filesOfStatus("Available") self.assertEqual(len(failedFiles), 0, \ "Error: Cleanup subscription should have 0 fai files.") self.assertEqual(len(acquiredFiles), 1, \ "Error: There should be only one acquired file.") self.assertEqual(list(acquiredFiles)[0]["lfn"], "ulfnA", \ "Error: Wrong acquired LFN.") self.assertEqual(len(completedFiles), 1, \ "Error: There should be only one completed file.") self.assertEqual(list(completedFiles)[0]["lfn"], "ulfnB", \ "Error: Wrong completed LFN.") self.assertEqual(len(availableFiles), 1, \ "Error: There should be only one available file.") self.assertEqual(list(availableFiles)[0]["lfn"], "ulfnC", \ "Error: Wrong completed LFN.") return def verifyJobKillStatus(self): """ _verifyJobKillStatus_ Verify that jobs are killed correctly. Jobs belonging to Cleanup and LogCollect subscriptions are not killed. The status of jobs that have already finished running is not changed. 
""" self.procJobA.load() self.procJobB.load() self.procJobC.load() self.assertEqual(self.procJobA["state"], "killed", \ "Error: Proc job A should be killed.") self.assertEqual(self.procJobB["state"], "killed", \ "Error: Proc job B should be killed.") self.assertEqual(self.procJobC["state"], "complete", \ "Error: Proc job C should be complete.") self.mergeJobA.load() self.mergeJobB.load() self.mergeJobC.load() self.assertEqual(self.mergeJobA["state"], "exhausted", \ "Error: Merge job A should be exhausted.") self.assertEqual(self.mergeJobB["state"], "cleanout", \ "Error: Merge job B should be cleanout.") self.assertEqual(self.mergeJobC["state"], "killed", \ "Error: Merge job C should be killed.") self.cleanupJobA.load() self.cleanupJobB.load() self.cleanupJobC.load() self.assertEqual(self.cleanupJobA["state"], "new", \ "Error: Cleanup job A should be new.") self.assertEqual(self.cleanupJobB["state"], "executing", \ "Error: Cleanup job B should be executing.") self.assertEqual(self.cleanupJobC["state"], "complete", \ "Error: Cleanup job C should be complete.") return def createTestWMSpec(self): """ _createTestWMSpec_ Create a WMSpec that has a processing, merge, cleanup and skims tasks that can be used by the subscription creation test. """ testWorkload = WMWorkloadHelper(WMWorkload("TestWorkload")) testWorkload.setDashboardActivity("TestReReco") testWorkload.setSpecUrl("/path/to/workload") testWorkload.setOwnerDetails("sfoulkes", "DMWM", {'dn': 'MyDN'}) procTask = testWorkload.newTask("ProcessingTask") procTask.setTaskType("Processing") procTask.setSplittingAlgorithm("FileBased", files_per_job=1) procTaskCMSSW = procTask.makeStep("cmsRun1") procTaskCMSSW.setStepType("CMSSW") procTaskCMSSWHelper = procTaskCMSSW.getTypeHelper() procTask.setTaskType("Processing") procTask.setSiteWhitelist(["site1"]) procTask.setSiteBlacklist(["site2"]) procTask.applyTemplates() procTaskCMSSWHelper.addOutputModule("OutputA", primaryDataset="bogusPrimary", processedDataset="bogusProcessed", dataTier="DataTierA", lfnBase="bogusUnmerged", mergedLFNBase="bogusMerged", filterName=None) mergeTask = procTask.addTask("MergeTask") mergeTask.setInputReference(procTaskCMSSW, outputModule="OutputA", dataTier='DataTierA') mergeTask.setTaskType("Merge") mergeTask.setSplittingAlgorithm("WMBSMergeBySize", min_merge_size=1, max_merge_size=2, max_merge_events=3) mergeTaskCMSSW = mergeTask.makeStep("cmsRun1") mergeTaskCMSSW.setStepType("CMSSW") mergeTaskCMSSWHelper = mergeTaskCMSSW.getTypeHelper() mergeTask.setTaskType("Merge") mergeTask.applyTemplates() mergeTaskCMSSWHelper.addOutputModule("Merged", primaryDataset="bogusPrimary", processedDataset="bogusProcessed", dataTier="DataTierA", lfnBase="bogusUnmerged", mergedLFNBase="bogusMerged", filterName=None) cleanupTask = procTask.addTask("CleanupTask") cleanupTask.setInputReference(procTaskCMSSW, outputModule="OutputA", dataTier="DataTierA") cleanupTask.setTaskType("Merge") cleanupTask.setSplittingAlgorithm("SiblingProcessingBased", files_per_job=50) cleanupTaskCMSSW = cleanupTask.makeStep("cmsRun1") cleanupTaskCMSSW.setStepType("CMSSW") cleanupTask.setTaskType("Cleanup") cleanupTask.applyTemplates() skimTask = mergeTask.addTask("SkimTask") skimTask.setTaskType("Skim") skimTask.setInputReference(mergeTaskCMSSW, outputModule="Merged", dataTier="DataTierA") skimTask.setSplittingAlgorithm("FileBased", files_per_job=1, include_parents=True) skimTaskCMSSW = skimTask.makeStep("cmsRun1") skimTaskCMSSW.setStepType("CMSSW") skimTaskCMSSWHelper = skimTaskCMSSW.getTypeHelper() 
skimTask.setTaskType("Skim") skimTask.applyTemplates() skimTaskCMSSWHelper.addOutputModule("SkimOutputA", primaryDataset="bogusPrimary", processedDataset="bogusProcessed", dataTier="DataTierA", lfnBase="bogusUnmerged", mergedLFNBase="bogusMerged", filterName=None) skimTaskCMSSWHelper.addOutputModule("SkimOutputB", primaryDataset="bogusPrimary", processedDataset="bogusProcessed", dataTier="DataTierB", lfnBase="bogusUnmerged", mergedLFNBase="bogusMerged", filterName=None) return testWorkload def setupMCWMSpec(self): """Setup MC workflow""" self.wmspec = self.createMCWMSpec() self.topLevelTask = getFirstTask(self.wmspec) self.inputDataset = self.topLevelTask.inputDataset() self.dataset = self.topLevelTask.getInputDatasetPath() self.dbs = None # add sites that would normally be added by operator via resource_control locationDAO = self.daoFactory(classname="Locations.New") self.pnns = [] for site in ['T2_XX_SiteA', 'T2_XX_SiteB']: locationDAO.execute(siteName=site, pnn=site) self.pnns.append(site) def createWMSpec(self, name='ReRecoWorkload'): factory = ReRecoWorkloadFactory() rerecoArgs["ConfigCacheID"] = createConfig(rerecoArgs["CouchDBName"]) wmspec = factory.factoryWorkloadConstruction(name, rerecoArgs) wmspec.setSpecUrl("/path/to/workload") wmspec.setSubscriptionInformation(custodialSites=[], nonCustodialSites=[], autoApproveSites=[], priority="Low", custodialSubType="Move") return wmspec def createMCWMSpec(self, name='MonteCarloWorkload'): mcArgs = TaskChainWorkloadFactory.getTestArguments() mcArgs["CouchDBName"] = rerecoArgs["CouchDBName"] mcArgs["Task1"]["ConfigCacheID"] = createConfig(mcArgs["CouchDBName"]) wmspec = taskChainWorkload(name, mcArgs) wmspec.setSpecUrl("/path/to/workload") getFirstTask(wmspec).addProduction(totalevents=10000) return wmspec def getDBS(self, wmspec): topLevelTask = getFirstTask(wmspec) inputDataset = topLevelTask.inputDataset() dbs = DBSReader(inputDataset.dbsurl) # dbsDict = {self.inputDataset.dbsurl : self.dbs} return dbs def createWMBSHelperWithTopTask(self, wmspec, block, mask=None, parentFlag=False, detail=False, commonLocation=None): topLevelTask = getFirstTask(wmspec) wmbs = WMBSHelper(wmspec, topLevelTask.name(), block, mask, cachepath=self.workDir, commonLocation=commonLocation) if block: blockName = block if parentFlag: block = self.dbs.getFileBlockWithParents(blockName) data = self.rucio.getReplicaInfoForBlocks(block=[blockName]) block['PhEDExNodeNames'] = data[0]["replica"] else: block = self.dbs.getFileBlock(blockName) data = self.rucio.getReplicaInfoForBlocks(block=[blockName]) block['PhEDExNodeNames'] = data[0]["replica"] sub, files = wmbs.createSubscriptionAndAddFiles(block=block) if detail: return wmbs, sub, files else: return wmbs def testKillWorkflow(self): """ _testKillWorkflow_ Verify that workflow killing works correctly. """ baAPI = BossAirAPI(config=self.config, insertStates=True) # Create nine jobs self.setupForKillTest(baAPI=baAPI) self.assertEqual(len(baAPI._listRunJobs()), 9) killWorkflow("Main", self.config, self.config) self.verifyFileKillStatus() self.verifyJobKillStatus() self.assertEqual(len(baAPI._listRunJobs()), 8) return def testCreateSubscription(self): """ _testCreateSubscription_ Verify that the subscription creation code works correctly. 
""" resourceControl = ResourceControl() resourceControl.insertSite(siteName='site1', pnn='goodse.cern.ch', ceName='site1', plugin="TestPlugin") resourceControl.insertSite(siteName='site2', pnn='goodse2.cern.ch', ceName='site2', plugin="TestPlugin") testWorkload = self.createTestWMSpec() testTopLevelTask = getFirstTask(testWorkload) testWMBSHelper = WMBSHelper(testWorkload, testTopLevelTask.name(), "SomeBlock", cachepath=self.workDir) testWMBSHelper.createTopLevelFileset() testWMBSHelper._createSubscriptionsInWMBS( testTopLevelTask, testWMBSHelper.topLevelFileset) procWorkflow = Workflow(name="TestWorkload", task="/TestWorkload/ProcessingTask") procWorkflow.load() self.assertEqual(procWorkflow.owner, "sfoulkes", "Error: Wrong owner: %s" % procWorkflow.owner) self.assertEqual(procWorkflow.group, "DMWM", "Error: Wrong group: %s" % procWorkflow.group) self.assertEqual(procWorkflow.wfType, "TestReReco", "Error: Wrong type.") self.assertEqual( procWorkflow.spec, os.path.join(self.workDir, procWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(procWorkflow.outputMap), 1, "Error: Wrong number of WF outputs.") mergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][ "merged_output_fileset"] unmergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][ "output_fileset"] mergedProcOutput.loadData() unmergedProcOutput.loadData() self.assertEqual( mergedProcOutput.name, "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA", "Error: Merged output fileset is wrong.") self.assertEqual( unmergedProcOutput.name, "/TestWorkload/ProcessingTask/unmerged-OutputADataTierA", "Error: Unmerged output fileset is wrong.") mergeWorkflow = Workflow(name="TestWorkload", task="/TestWorkload/ProcessingTask/MergeTask") mergeWorkflow.load() self.assertEqual(mergeWorkflow.owner, "sfoulkes", "Error: Wrong owner.") self.assertEqual( mergeWorkflow.spec, os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(mergeWorkflow.outputMap), 1, "Error: Wrong number of WF outputs.") cleanupWorkflow = Workflow( name="TestWorkload", task="/TestWorkload/ProcessingTask/CleanupTask") cleanupWorkflow.load() self.assertEqual(cleanupWorkflow.owner, "sfoulkes", "Error: Wrong owner.") self.assertEqual( cleanupWorkflow.spec, os.path.join(self.workDir, cleanupWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(cleanupWorkflow.outputMap), 0, "Error: Wrong number of WF outputs.") unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0][ "output_fileset"] unmergedMergeOutput.loadData() self.assertEqual( unmergedMergeOutput.name, "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA", "Error: Unmerged output fileset is wrong.") skimWorkflow = Workflow( name="TestWorkload", task="/TestWorkload/ProcessingTask/MergeTask/SkimTask") skimWorkflow.load() self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.") self.assertEqual( skimWorkflow.spec, os.path.join(self.workDir, skimWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(skimWorkflow.outputMap), 2, "Error: Wrong number of WF outputs.") mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0][ "merged_output_fileset"] unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][ 0]["output_fileset"] mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0][ "merged_output_fileset"] unmergedSkimOutputB = 
skimWorkflow.outputMap["SkimOutputBDataTierB"][ 0]["output_fileset"] mergedSkimOutputA.loadData() mergedSkimOutputB.loadData() unmergedSkimOutputA.loadData() unmergedSkimOutputB.loadData() self.assertEqual( mergedSkimOutputA.name, "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA", "Error: Merged output fileset is wrong: %s" % mergedSkimOutputA.name) self.assertEqual( unmergedSkimOutputA.name, "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA", "Error: Unmerged output fileset is wrong.") self.assertEqual( mergedSkimOutputB.name, "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB", "Error: Merged output fileset is wrong.") self.assertEqual( unmergedSkimOutputB.name, "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB", "Error: Unmerged output fileset is wrong.") topLevelFileset = Fileset(name="TestWorkload-ProcessingTask-SomeBlock") topLevelFileset.loadData() procSubscription = Subscription(fileset=topLevelFileset, workflow=procWorkflow) procSubscription.loadData() self.assertEqual(len(procSubscription.getWhiteBlackList()), 2, "Error: Wrong site white/black list for proc sub.") for site in procSubscription.getWhiteBlackList(): if site["site_name"] == "site1": self.assertEqual(site["valid"], 1, "Error: Site should be white listed.") else: self.assertEqual(site["valid"], 0, "Error: Site should be black listed.") self.assertEqual(procSubscription["type"], "Processing", "Error: Wrong subscription type.") self.assertEqual(procSubscription["split_algo"], "FileBased", "Error: Wrong split algo.") mergeSubscription = Subscription(fileset=unmergedProcOutput, workflow=mergeWorkflow) mergeSubscription.loadData() self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0, "Error: Wrong white/black list for merge sub.") self.assertEqual(mergeSubscription["type"], "Merge", "Error: Wrong subscription type.") self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize", "Error: Wrong split algo.") skimSubscription = Subscription(fileset=unmergedMergeOutput, workflow=skimWorkflow) skimSubscription.loadData() self.assertEqual(skimSubscription["type"], "Skim", "Error: Wrong subscription type.") self.assertEqual(skimSubscription["split_algo"], "FileBased", "Error: Wrong split algo.") return def testTruncatedWFInsertion(self): """ _testTruncatedWFInsertion_ """ resourceControl = ResourceControl() resourceControl.insertSite(siteName='site1', pnn='goodse.cern.ch', ceName='site1', plugin="TestPlugin") resourceControl.insertSite(siteName='site2', pnn='goodse2.cern.ch', ceName='site2', plugin="TestPlugin") testWorkload = self.createTestWMSpec() testTopLevelTask = getFirstTask(testWorkload) testWMBSHelper = WMBSHelper(testWorkload, testTopLevelTask.name(), "SomeBlock", cachepath=self.workDir) testWMBSHelper.createTopLevelFileset() testWMBSHelper._createSubscriptionsInWMBS( testTopLevelTask, testWMBSHelper.topLevelFileset) testWorkload.truncate("ResubmitTestWorkload", "/TestWorkload/ProcessingTask/MergeTask", "someserver", "somedatabase") # create the subscription for multiple top task (MergeTask and CleanupTask for the same block) for task in testWorkload.getTopLevelTask(): testResubmitWMBSHelper = WMBSHelper(testWorkload, task.name(), "SomeBlock2", cachepath=self.workDir) testResubmitWMBSHelper.createTopLevelFileset() testResubmitWMBSHelper._createSubscriptionsInWMBS( task, testResubmitWMBSHelper.topLevelFileset) mergeWorkflow = Workflow(name="ResubmitTestWorkload", 
task="/ResubmitTestWorkload/MergeTask") mergeWorkflow.load() self.assertEqual(mergeWorkflow.owner, "sfoulkes", "Error: Wrong owner.") self.assertEqual( mergeWorkflow.spec, os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(mergeWorkflow.outputMap), 1, "Error: Wrong number of WF outputs.") unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0][ "output_fileset"] unmergedMergeOutput.loadData() self.assertEqual( unmergedMergeOutput.name, "/ResubmitTestWorkload/MergeTask/merged-MergedDataTierA", "Error: Unmerged output fileset is wrong.") skimWorkflow = Workflow( name="ResubmitTestWorkload", task="/ResubmitTestWorkload/MergeTask/SkimTask") skimWorkflow.load() self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.") self.assertEqual( skimWorkflow.spec, os.path.join(self.workDir, skimWorkflow.name, "WMSandbox", "WMWorkload.pkl"), "Error: Wrong spec URL") self.assertEqual(len(skimWorkflow.outputMap), 2, "Error: Wrong number of WF outputs.") mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0][ "merged_output_fileset"] unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][ 0]["output_fileset"] mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0][ "merged_output_fileset"] unmergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][ 0]["output_fileset"] mergedSkimOutputA.loadData() mergedSkimOutputB.loadData() unmergedSkimOutputA.loadData() unmergedSkimOutputB.loadData() self.assertEqual( mergedSkimOutputA.name, "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA", "Error: Merged output fileset is wrong: %s" % mergedSkimOutputA.name) self.assertEqual( unmergedSkimOutputA.name, "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA", "Error: Unmerged output fileset is wrong.") self.assertEqual( mergedSkimOutputB.name, "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB", "Error: Merged output fileset is wrong.") self.assertEqual( unmergedSkimOutputB.name, "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB", "Error: Unmerged output fileset is wrong.") topLevelFileset = Fileset( name="ResubmitTestWorkload-MergeTask-SomeBlock2") topLevelFileset.loadData() mergeSubscription = Subscription(fileset=topLevelFileset, workflow=mergeWorkflow) mergeSubscription.loadData() self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0, "Error: Wrong white/black list for merge sub.") self.assertEqual(mergeSubscription["type"], "Merge", "Error: Wrong subscription type.") self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize", "Error: Wrong split algo.") skimSubscription = Subscription(fileset=unmergedMergeOutput, workflow=skimWorkflow) skimSubscription.loadData() self.assertEqual(skimSubscription["type"], "Skim", "Error: Wrong subscription type.") self.assertEqual(skimSubscription["split_algo"], "FileBased", "Error: Wrong split algo.") return def testReReco(self): """ReReco workflow""" # create workflow block = self.dataset + "#" + BLOCK1 wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files']) self.assertEqual(len(files), 5) def testReRecoBlackRunRestriction(self): """ReReco workflow with Run restrictions""" block = self.dataset + "#" + BLOCK2 self.topLevelTask.setInputRunBlacklist( [181183]) # Set run blacklist to only run in the block wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) files 
= wmbs.validFiles(self.dbs.getFileBlock(block)['Files']) self.assertEqual(len(files), 0) def testReRecoWhiteRunRestriction(self): block = self.dataset + "#" + BLOCK2 self.topLevelTask.setInputRunWhitelist( [181183]) # Set run whitelist to only run in the block wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files']) self.assertEqual(len(files), 1) def testLumiMaskRestrictionsOK(self): block = self.dataset + "#" + BLOCK1 self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = ['181367'] self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = ['57,80'] wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files']) self.assertEqual(len(files), 1) def testLumiMaskRestrictionsKO(self): block = self.dataset + "#" + BLOCK1 self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = [ '123454321' ] self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = [ '123,123' ] wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files']) self.assertEqual(len(files), 0) def testDuplicateFileInsert(self): # using default wmspec block = self.dataset + "#" + BLOCK1 wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) wmbs.topLevelFileset.loadData() numOfFiles = len(wmbs.topLevelFileset.files) # check initially inserted files. dbsFiles = self.dbs.getFileBlock(block)['Files'] self.assertEqual(numOfFiles, len(dbsFiles)) firstFileset = wmbs.topLevelFileset wmbsDao = wmbs.daofactory(classname="Files.InFileset") numOfFiles = len(wmbsDao.execute(firstFileset.id)) self.assertEqual(numOfFiles, len(dbsFiles)) # use the new spec with same inputdataset block = self.dataset + "#" + BLOCK1 wmspec = self.createWMSpec("TestSpec1") dbs = self.getDBS(wmspec) wmbs = self.createWMBSHelperWithTopTask(wmspec, block) # check duplicate insert dbsFiles = dbs.getFileBlock(block) data = self.rucio.getReplicaInfoForBlocks(block=[block]) dbsFiles['PhEDExNodeNames'] = data[0]["replica"] numOfFiles = wmbs.addFiles(dbsFiles) self.assertEqual(numOfFiles, 0) secondFileset = wmbs.topLevelFileset wmbsDao = wmbs.daofactory(classname="Files.InFileset") numOfFiles = len(wmbsDao.execute(secondFileset.id)) self.assertEqual(numOfFiles, len(dbsFiles['Files'])) self.assertNotEqual(firstFileset.id, secondFileset.id) def testDuplicateSubscription(self): """Can't duplicate subscriptions""" siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"] # using default wmspec block = self.dataset + "#" + BLOCK1 wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) wmbs.topLevelFileset.loadData() numOfFiles = len(wmbs.topLevelFileset.files) filesetId = wmbs.topLevelFileset.id subId = wmbs.topLevelSubscription['id'] # check initially inserted files. 
dbsFiles = self.dbs.getFileBlock(block)['Files'] self.assertEqual(numOfFiles, len(dbsFiles)) # Not clear what's supposed to happen here, 2nd test is completely redundant dummyFirstFileset = wmbs.topLevelFileset self.assertEqual(numOfFiles, len(dbsFiles)) # reinsert subscription - shouldn't create anything new wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block) wmbs.topLevelFileset.loadData() self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files)) self.assertEqual(filesetId, wmbs.topLevelFileset.id) self.assertEqual(subId, wmbs.topLevelSubscription['id']) # now do a montecarlo workflow self.setupMCWMSpec() mask = Mask(FirstRun=12, FirstLumi=1234, FirstEvent=12345, LastEvent=999995, LastLumi=12345, LastRun=12) wmbs = self.createWMBSHelperWithTopTask(self.wmspec, None, mask, commonLocation=siteWhitelist) wmbs.topLevelFileset.loadData() numOfFiles = len(wmbs.topLevelFileset.files) filesetId = wmbs.topLevelFileset.id subId = wmbs.topLevelSubscription['id'] # check initially inserted files. # Not clear what's supposed to happen here, 2nd test is completely redundant numDbsFiles = 1 self.assertEqual(numOfFiles, numDbsFiles) dummyFirstFileset = wmbs.topLevelFileset self.assertEqual(numOfFiles, numDbsFiles) # reinsert subscription - shouldn't create anything new wmbs = self.createWMBSHelperWithTopTask(self.wmspec, None, mask, commonLocation=siteWhitelist) wmbs.topLevelFileset.loadData() self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files)) self.assertEqual(filesetId, wmbs.topLevelFileset.id) self.assertEqual(subId, wmbs.topLevelSubscription['id']) def testParentage(self): """ 1. check whether parent files are created in wmbs. 2. check parent files are associated to child. 3. When 2 specs with the same input data (one with parent processing, one without it) is inserted, if one without parent processing inserted first then the other with parent processing insert, it still needs to create parent files although child files are duplicate """ # Swap out the dataset for one that has parents task = next(self.wmspec.taskIterator()) oldDS = task.inputDataset( ) # Copy the old dataset, only will use DBS URL from it task.addInputDataset(name="/Cosmics/ComissioningHI-PromptReco-v1/RECO", primary='Cosmics', processed='ComissioningHI-PromptReco-v1', tier='RECO', dbsurl=oldDS.dbsurl) block = '/Cosmics/ComissioningHI-PromptReco-v1/RECO' + '#5b89ba9c-0dbf-11e1-9b6c-003048caaace' # File creation without parents wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec, block, parentFlag=False, detail=True) self.assertEqual(8, numFiles) wmbs.topLevelFileset.loadData() for child in wmbs.topLevelFileset.files: self.assertEqual(len(child["parents"]), 0) # no parents per child # File creation with parents wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec, block, parentFlag=True, detail=True) self.assertEqual(8, numFiles) wmbs.topLevelFileset.loadData() for child in wmbs.topLevelFileset.files: self.assertEqual(len(child["parents"]), 1) # one parent per child def testMCFakeFileInjection(self): """Inject fake Monte Carlo files into WMBS""" # This test is failing because the name of the couch DB is set to None # in BasicProductionWorkload.getProdArgs() but changing it to # "reqmgr_config_cache_t" from StdBase test arguments does not fix the # situation. 
testDuplicateSubscription probably has the same issue siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"] self.setupMCWMSpec() mask = Mask(FirstRun=12, FirstLumi=1234, FirstEvent=12345, LastEvent=999995, LastLumi=12345, LastRun=12) wmbs = self.createWMBSHelperWithTopTask(self.wmspec, None, mask, commonLocation=siteWhitelist) subscription = wmbs.topLevelSubscription self.assertEqual(1, subscription.exists()) fileset = subscription['fileset'] self.assertEqual(1, fileset.exists()) fileset.loadData() # need to refresh from database self.assertEqual(len(fileset.files), 1) self.assertEqual(len(fileset.parents), 0) self.assertFalse(fileset.open) firstFile = list(fileset.files)[0] self.assertEqual(firstFile['events'], mask['LastEvent'] - mask['FirstEvent'] + 1) # inclusive range self.assertEqual(firstFile['merged'], False) # merged files get added to dbs self.assertEqual(len(firstFile['parents']), 0) # firstFile.loadData() self.assertEqual(sorted(firstFile['locations']), sorted(self.pnns)) self.assertEqual(len(firstFile.getParentLFNs()), 0) self.assertEqual(len(firstFile.getRuns()), 1) run = firstFile.getRuns()[0] self.assertEqual(run.run, mask['FirstRun']) self.assertEqual(run.lumis[0], mask['FirstLumi']) self.assertEqual(run.lumis[-1], mask['LastLumi']) self.assertEqual(len(run.lumis), mask['LastLumi'] - mask['FirstLumi'] + 1)