def printQuotaSummary(self):
    """
    Log an overview of the quota figures kept in `self.nodeUsage`, in Terabytes.

    The header line and the list of out-of-quota RSEs are logged at INFO level,
    while the per-node figures (one line per node, in sorted node-name order)
    are logged at DEBUG level.
    """
    # the line layout never changes, so build it once instead of per node
    template = (" %s:\t\tbytes_limit: %.2f, bytes_used: %.2f, bytes_remaining: %.2f, "
                "quota: %.2f, quota_avail: %.2f")
    # keys of the per-node usage dictionary, in the order they get reported
    reportedFields = ('bytes_limit', 'bytes', 'bytes_remaining', 'quota', 'quota_avail')

    self.logger.info("Summary of the current quotas in Terabytes:")
    for rseName in sorted(self.nodeUsage):
        usage = self.nodeUsage[rseName]
        self.logger.debug(template, rseName, *[teraBytes(usage[field]) for field in reportedFields])
    self.logger.info("List of RSE's out of quota: %s", self.outOfSpaceNodes)
def notifyLargeData(self, aboveWarningThreshold, transferId, wflowName, dataSize, dataIn):
    """
    Send an email alert, and log the same text, when a data placement has been
    flagged as exceeding the warning threshold. Nothing happens otherwise.

    :param aboveWarningThreshold: boolean flag telling whether the threshold was exceeded
    :param transferId: rule/transfer request id, used in the email subject
    :param wflowName: name of the workflow
    :param dataSize: total amount of data subscribed (reported in TB via teraBytes)
    :param dataIn: short summary of the workflow data; its 'type' and 'name'
        keys are used in the message
    """
    if not aboveWarningThreshold:
        return

    emailSubject = "[MS] Large pending data transfer under request id: {}".format(transferId)
    emailMsg = "".join([
        "Workflow: {}\nhas a large amount of ".format(wflowName),
        "data subscribed: {} TB,\n".format(teraBytes(dataSize)),
        "for {} data: {}.".format(dataIn['type'], dataIn['name']),
    ])
    self.emailAlert.send(emailSubject, emailMsg)
    self.logger.info(emailMsg)
def unifiedUnused(self):
    """
    FIXME FIXME TODO
    Port of the original Unified logic, parked in its own method until it is
    decided which parts are still needed and the whole thing gets refactored.

    For every workflow it gathers dataset/block/size/event/lumi figures, the
    nodes hosting the blocks and the outcome of the site white list lookup.

    :return: list of dictionaries, one per workflow
    """
    # FIXME: these two placeholders only exist to keep pylint happy
    requestNames = []
    uConfig = {}
    # TODO: once this gets wired up, the names should come from the workflow
    # objects, i.e. something like: [r.getName() for r in workflows]

    # TODO: the logic below shows the original unified port and it should be
    # revisited wrt new proposal specs and unified codebase

    startTime = time.time()

    # workflows for the list of requests
    stepTime = time.time()
    requestWorkflows = self._getRequestWorkflows(requestNames).values()
    self.logger.debug(elapsedTime(stepTime, "### getWorkflows"))

    # workflow summaries, and from them all the datasets that must be processed
    winfo = workflowsInfo(requestWorkflows)
    datasets = []
    for summary in winfo.values():
        datasets.extend(summary['datasets'])

    dbsUrl = self.msConfig['dbsUrl']

    # dataset information (blocks and sizes)
    stepTime = time.time()
    datasetBlocks, datasetSizes, _datasetTransfers = dbsInfo(datasets, dbsUrl)
    self.logger.debug(elapsedTime(stepTime, "### dbsInfo"))

    # nodes hosting the blocks of our datasets
    stepTime = time.time()
    blockNodes = phedexInfo(datasets, self.msConfig['phedexUrl'])
    self.logger.debug(elapsedTime(stepTime, "### phedexInfo"))

    # events and lumis information for our datasets
    stepTime = time.time()
    eventsLumis = eventsLumisInfo(datasets, self.msConfig['dbsUrl'])
    self.logger.debug(elapsedTime(stepTime, "### eventsLumisInfo"))

    # request specs are fetched once and handed over to _getSiteWhiteList as cache
    reqSpecs = self._getRequestSpecs(requestNames)

    # SiteInfo is expensive to build, so create it once and re-use it
    siteInfo = SiteInfo(uConfig)

    requestsToProcess = []
    loopStart = time.time()
    totBlocks = 0
    totEvents = 0
    totSize = 0
    totCpuT = 0
    for wflow in requestWorkflows:
        for wname, wspec in wflow.items():
            cput = getComputingTime(wspec, eventsLumis=eventsLumis,
                                    dbsUrl=self.msConfig['dbsUrl'], logger=self.logger)
            ncopies = getNCopies(cput)

            wflowInfo = winfo[wname]
            ndatasets = len(wflowInfo['datasets'])
            npileups = len(wflowInfo['pileups'])

            # per-workflow accumulators
            nblocks = 0
            nevts = 0
            nlumis = 0
            size = 0
            nodes = set()
            for dsetName in wflowInfo['datasets']:
                dsetBlocks = datasetBlocks[dsetName]
                for blockName in dsetBlocks:
                    nodes.update(blockNodes.get(blockName, []))
                nblocks += len(dsetBlocks)
                size += datasetSizes[dsetName]
                evtLumi = eventsLumis.get(dsetName, {'num_event': 0, 'num_lumi': 0})
                nevts += evtLumi['num_event']
                nlumis += evtLumi['num_lumi']

            # grand totals across all the workflows
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput

            sites = json.dumps(sorted(nodes))
            self.logger.debug("### %s", wname)
            self.logger.debug(
                "%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)

            # find out which sites can serve this workflow
            whiteListStart = time.time()
            whiteList = self._getSiteWhiteList(uConfig, wspec, siteInfo, reqSpecs)
            lheInput, primary, parent, secondary, allowedSites = whiteList

            # primary and secondary are always handled as lists from here on
            primary = primary if isinstance(primary, list) else [primary]
            secondary = secondary if isinstance(secondary, list) else [secondary]
            wflowDatasets = primary + secondary
            wflowDatasetsBlocks = [blk
                                   for dset in wflowDatasets
                                   for blk in datasetBlocks.get(dset, [])]

            requestsToProcess.append({
                'name': wname,
                'datasets': wflowDatasets,
                'blocks': wflowDatasetsBlocks,
                'npileups': npileups,
                'size': size,
                'nevents': nevts,
                'nlumis': nlumis,
                'cput': cput,
                'ncopies': ncopies,
                'sites': sites,
                'allowedSites': allowedSites,
                'parent': parent,
                'lheInput': lheInput,
                'primary': primary,
                'secondary': secondary,
            })
            self.logger.debug(elapsedTime(whiteListStart, "### getSiteWhiteList"))

    self.logger.debug(
        "total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
        len(winfo), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT)
    self.logger.debug(elapsedTime(loopStart, '### workflows info'))
    self.logger.debug(elapsedTime(startTime, '### total time'))
    return requestsToProcess
def unified(svc, requestRecords, logger):
    """
    Unified Transferor box: collect, for every workflow behind the given request
    records, the data figures and the site white list information.

    :param svc: service object handed over to getSiteWhiteList
    :param requestRecords: list of request records (each one providing a 'name' key),
        see definition in requestRecord
    :param logger: logger object used for the debug/timing messages
    :return: list of dictionaries, one per workflow
    """
    # get aux info for dataset/blocks from inputs/parents/pileups
    # make subscriptions based on site white/black lists
    logger.debug("### unified transferor")

    requestNames = [record['name'] for record in requestRecords]

    # TODO: the logic below shows the original unified port and it should be
    # revisited wrt new proposal specs and unified codebase

    startTime = time.time()

    # workflows for the list of requests
    stepTime = time.time()
    workflows = getRequestWorkflows(requestNames).values()
    logger.debug(elapsedTime(stepTime, "### getWorkflows"))

    # workflow summaries, and from them all the datasets that must be processed
    winfo = workflowsInfo(workflows)
    datasets = []
    for summary in winfo.values():
        datasets.extend(summary['datasets'])

    # dataset information (blocks and sizes)
    stepTime = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    logger.debug(elapsedTime(stepTime, "### dbsInfo"))

    # nodes hosting the blocks of our datasets
    stepTime = time.time()
    blockNodes = phedexInfo(datasets)
    logger.debug(elapsedTime(stepTime, "### phedexInfo"))

    # events and lumis information for our datasets
    stepTime = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    logger.debug(elapsedTime(stepTime, "### eventsLumisInfo"))

    # request specs are fetched once and handed over to getSiteWhiteList as cache
    specNames = []
    for wflow in workflows:
        for wspec in wflow.values():
            specNames.append(wspec['RequestName'])
    reqSpecs = getRequestSpecs(specNames)

    # SiteInfo is expensive to build, so create it once and re-use it
    siteInfo = SiteInfo()

    requestsToProcess = []
    totBlocks = 0
    totEvents = 0
    totSize = 0
    totCpuT = 0
    loopStart = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            wflowInfo = winfo[wname]
            ndatasets = len(wflowInfo['datasets'])
            npileups = len(wflowInfo['pileups'])

            # per-workflow accumulators
            nblocks = 0
            nevts = 0
            nlumis = 0
            size = 0
            nodes = set()
            for dsetName in wflowInfo['datasets']:
                dsetBlocks = datasetBlocks[dsetName]
                for blockName in dsetBlocks:
                    nodes.update(blockNodes.get(blockName, []))
                nblocks += len(dsetBlocks)
                size += datasetSizes[dsetName]
                evtLumi = eventsLumis.get(dsetName, {'num_event': 0, 'num_lumi': 0})
                nevts += evtLumi['num_event']
                nlumis += evtLumi['num_lumi']

            # grand totals across all the workflows
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput

            sites = json.dumps(sorted(nodes))
            logger.debug("### %s", wname)
            logger.debug("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                         ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)

            # find out which sites can serve this workflow
            whiteListStart = time.time()
            whiteList = getSiteWhiteList(svc, wspec, siteInfo, reqSpecs)
            lheInput, primary, parent, secondary, allowedSites = whiteList

            # note: 'datasets' and 'blocks' hold the information collected for
            # all the workflows, not only for the workflow of this record
            requestsToProcess.append({
                'name': wname,
                'datasets': datasets,
                'blocks': datasetBlocks,
                'npileups': npileups,
                'size': size,
                'nevents': nevts,
                'nlumis': nlumis,
                'cput': cput,
                'ncopies': ncopies,
                'sites': sites,
                'allowedSites': allowedSites,
                'parent': parent,
                'lheInput': lheInput,
                'primary': primary,
                'secondary': secondary,
            })
            logger.debug(elapsedTime(whiteListStart, "### getSiteWhiteList"))

    logger.debug("total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
                 len(winfo), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT)
    logger.debug(elapsedTime(loopStart, '### workflows info'))
    logger.debug(elapsedTime(startTime, '### total time'))
    return requestsToProcess
def makeTransferRequest(self, wflow):
    """
    Send request to PhEDEx and return status of request subscription
    This method does the following:
      1. return if there is no workflow data to be transferred
      2. check if the data input campaign is in the database, skip if not
      3. _getValidSites: using the workflow site lists and the campaign configuration,
         find a common list of sites (converted to PNNs). If the PNN is out of quota,
         it's also removed from this list
      4. create the transfer record dictionary
      5. for every final node
         5.1. if it's a pileup dataset, pick a random node and subscribe the whole dataset
         5.2. else, retrieve chunks of blocks to be subscribed (evenly distributed)
         5.3. update node usage with the amount of data subscribed
      6. re-evaluate nodes with quota exceeded
      7. return the transfer record, with a list of transfer IDs

    As soon as one subscription fails, no further subscription is attempted for
    this workflow: the records built so far are returned together with a False
    status, such that a later successful subscription cannot hide the failure.

    :param wflow: workflow object
    :return: boolean whether it succeeded or not, and a list of transfer records
        (as created by newTransferRec, with 'transferIDs' converted to a list)
    """
    response = []
    success = True
    if not (wflow.getParentBlocks() or wflow.getPrimaryBlocks() or wflow.getSecondarySummary()):
        self.logger.info("Request %s does not have any further data to transfer", wflow.getName())
        return success, response

    self.logger.info("Handling data subscriptions for request: %s", wflow.getName())

    for dataIn in wflow.getDataCampaignMap():
        if dataIn["type"] == "parent":
            msg = "Skipping 'parent' data subscription (done with the 'primary' data), for: %s" % dataIn
            self.logger.info(msg)
            continue
        if dataIn["type"] == "secondary" and dataIn['name'] not in wflow.getSecondarySummary():
            # secondary already in place
            continue
        if dataIn['campaign'] not in self.campaigns:
            msg = "Data placement can't proceed because campaign '%s' was not found." % dataIn["campaign"]
            msg += " Skipping this workflow until the campaign gets created."
            self.logger.warning(msg)
            return False, response

        nodes = self._getValidSites(wflow, dataIn)
        if not nodes:
            msg = "There are no RSEs with available space for %s. " % wflow.getName()
            msg += "Skipping this workflow until RSEs get enough free space"
            self.logger.warning(msg)
            return False, response

        transRec = newTransferRec(dataIn)
        for blocks, dataSize, idx in self._decideDataDestination(wflow, dataIn, len(nodes)):
            if not blocks and dataIn["type"] == "primary":
                # no valid files in any blocks, it will likely fail in global workqueue
                return success, response
            if blocks:
                subLevel = "block"
                data = {dataIn['name']: blocks}
            else:
                # then it's a dataset level subscription
                subLevel = "dataset"
                data = None

            subscription = PhEDExSubscription(datasetPathList=dataIn['name'],
                                              nodeList=nodes[idx],
                                              group=self.msConfig['quotaAccount'],
                                              level=subLevel,
                                              priority="low",
                                              request_only=self.msConfig["phedexRequestOnly"],
                                              blocks=data,
                                              comments="WMCore MicroService automated subscription")
            msg = "Creating '%s' level subscription for %s dataset: %s" % (subscription.level,
                                                                           dataIn['type'],
                                                                           dataIn['name'])
            if wflow.getParentDataset():
                msg += ", where parent blocks have also been added for dataset: %s" % wflow.getParentDataset()
            self.logger.info(msg)

            if self.msConfig.get('enableDataTransfer', True):
                # Force request-only subscription
                # to any data transfer going above some threshold (do not auto-approve).
                # A missing (or None) threshold disables this check, instead of
                # raising a TypeError when None gets compared to a number.
                warningThreshold = self.msConfig.get('warningTransferThreshold') or 0
                aboveWarningThreshold = warningThreshold > 0 and dataSize > warningThreshold
                if aboveWarningThreshold and subscription.request_only != "y":
                    subscription.request_only = "y"

                # Then make the data subscription, for real!!!
                success, transferId = self._subscribeData(subscription, wflow.getName(), dataIn['name'])
                if not success:
                    # this only leaves the inner loop, the failure is
                    # handled once more right after it
                    break
                if transferId:
                    transRec['transferIDs'].add(transferId)

                # Warn about data transfer subscriptions going above some threshold
                if aboveWarningThreshold:
                    emailSubject = "[MS] Large pending data transfer under request id: {transferid}".format(
                        transferid=transferId)
                    emailMsg = "Workflow: {}\nhas a large amount of ".format(wflow.getName())
                    emailMsg += "data subscribed: {} TB,\n".format(teraBytes(dataSize))
                    emailMsg += "for {} data: {}.".format(dataIn['type'], dataIn['name'])
                    self.emailAlert.send(emailSubject, emailMsg)
                    self.logger.info(emailMsg)

                # and update some instance caches
                self.rseQuotas.updateNodeUsage(nodes[idx], dataSize)
                if subLevel == 'dataset':
                    self.dsetCounter += 1
                else:
                    self.blockCounter += len(blocks)
            else:
                self.logger.info("DRY-RUN: making subscription: %s", subscription)

        # the record is kept even when incomplete, such that the transfer IDs
        # already created are still reported back to the caller
        transRec['transferIDs'] = list(transRec['transferIDs'])
        response.append(transRec)

        if not success:
            # do not move on to the next input data, otherwise a later successful
            # subscription would overwrite the failed status
            break

    # once the workflow has been completely processed, update the node usage
    self.rseQuotas.evaluateQuotaExceeded()
    return success, response
def requestsInfo(reqmgrAuxSvc, state='assignment-approved'):
    """
    Helper function to collect information about all the requests known to
    workqueue in the given state, printing a summary for each workflow.

    NOTE(review): a function with this same name and body appears once more
    further down in this file; if both live in the same module, the later
    definition is the one in effect. To be confirmed and de-duplicated.

    :param reqmgrAuxSvc: service object handed over to getSiteWhiteList
    :param state: request state passed to workqueueRequests
    :return: dictionary of workflow information, keyed by workflow name
    """
    # list of known requests in workqueue
    requestJobs = workqueueRequests(state)
    requestNames = requestJobs.keys()

    # workflows for the list of requests
    time0 = orig = time.time()
    workflows = getRequestWorkflows(requestNames).values()
    elapsedTime(time0, "### getWorkflows")

    # workflow summaries, and from them all the datasets that must be processed
    winfo = workflowsInfo(workflows)
    datasets = []
    for summary in winfo.values():
        datasets.extend(summary['datasets'])

    # dataset information (blocks and sizes)
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    elapsedTime(time0, "### dbsInfo")

    # nodes hosting the blocks of our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    elapsedTime(time0, "### phedexInfo")

    # events and lumis information for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    elapsedTime(time0, "### eventsLumisInfo")

    # request specs are fetched once and handed over to getSiteWhiteList as cache
    specNames = []
    for wflow in workflows:
        for wspec in wflow.values():
            specNames.append(wspec['RequestName'])
    reqSpecs = getRequestSpecs(specNames)

    # SiteInfo is expensive to build, so create it once and re-use it
    siteInfo = SiteInfo()

    requests = {}
    totBlocks = 0
    totEvents = 0
    totSize = 0
    totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            wflowInfo = winfo[wname]
            ndatasets = len(wflowInfo['datasets'])
            npileups = len(wflowInfo['pileups'])

            # per-workflow accumulators
            nblocks = 0
            nevts = 0
            nlumis = 0
            size = 0
            nodes = set()
            for dsetName in wflowInfo['datasets']:
                dsetBlocks = datasetBlocks[dsetName]
                for blockName in dsetBlocks:
                    nodes.update(blockNodes.get(blockName, []))
                nblocks += len(dsetBlocks)
                size += datasetSizes[dsetName]
                evtLumi = eventsLumis.get(dsetName, {'num_event': 0, 'num_lumi': 0})
                nevts += evtLumi['num_event']
                nlumis += evtLumi['num_lumi']

            # grand totals across all the workflows
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput

            sites = json.dumps(sorted(nodes))
            njobs = requestJobs[wname]
            print("\n### %s" % wname)
            print("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s"
                  % (ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites))

            # find out which sites can serve this workflow
            t0 = time.time()
            whiteList = getSiteWhiteList(wspec, siteInfo, reqmgrAuxSvc, reqSpecs)
            lheInput, primary, parent, secondary, allowedSites = whiteList

            # note: 'datasets' and 'blocks' hold the information collected for
            # all the workflows, not only for the workflow of this record
            requests[wname] = {
                'name': wname,
                'datasets': datasets,
                'blocks': datasetBlocks,
                'npileups': npileups,
                'size': size,
                'njobs': njobs,
                'nevents': nevts,
                'nlumis': nlumis,
                'cput': cput,
                'ncopies': ncopies,
                'sites': sites,
                'allowedSites': allowedSites,
                'parent': parent,
                'lheInput': lheInput,
                'primary': primary,
                'secondary': secondary,
            }
            print("sites", allowedSites)
            elapsedTime(t0, "getSiteWhiteList")

    print("\ntotal # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)"
          % (len(winfo), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT))
    elapsedTime(tst0, 'workflows info')
    elapsedTime(orig)
    return requests
def requestsInfo(reqmgrAuxSvc, state='assignment-approved'):
    """
    Gather, and print to stdout, the data figures and the site white list of
    every workflow that workqueue knows about in the given request state.

    NOTE(review): a function with this same name and body also appears earlier
    in this file; if both live in the same module, this later definition
    replaces the earlier one. To be confirmed and de-duplicated.

    :param reqmgrAuxSvc: service object handed over to getSiteWhiteList
    :param state: request state passed to workqueueRequests
    :return: dictionary of workflow information, keyed by workflow name
    """
    # requests known to workqueue; the value is reported below as 'njobs'
    jobsPerRequest = workqueueRequests(state)

    # workflows for those requests
    orig = time.time()
    time0 = orig
    workflows = getRequestWorkflows(jobsPerRequest.keys()).values()
    elapsedTime(time0, "### getWorkflows")

    # workflow summaries, flattened into the list of datasets to be processed
    winfo = workflowsInfo(workflows)
    datasets = [dset for info in winfo.values() for dset in info['datasets']]

    # blocks and sizes for each dataset
    time0 = time.time()
    dbsData = dbsInfo(datasets)
    datasetBlocks, datasetSizes = dbsData
    elapsedTime(time0, "### dbsInfo")

    # nodes holding each block
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    elapsedTime(time0, "### phedexInfo")

    # number of events and lumis for each dataset
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    elapsedTime(time0, "### eventsLumisInfo")

    # specs for all the requests, re-used later by getSiteWhiteList as cache
    reqSpecs = getRequestSpecs([spec['RequestName']
                                for wflow in workflows
                                for spec in wflow.values()])

    # SiteInfo is expensive to build, so a single instance is shared
    siteInfo = SiteInfo()

    requests = {}
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    noEventsLumis = {'num_event': 0, 'num_lumi': 0}
    for wflow in workflows:
        for wname, wspec in wflow.items():
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])

            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    nodes |= set(blockNodes.get(blk, []))
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, noEventsLumis)
                nevts += edata['num_event']
                nlumis += edata['num_lumi']

            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput

            sites = json.dumps(sorted(nodes))
            njobs = jobsPerRequest[wname]
            print("\n### %s" % wname)
            summaryValues = (ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)
            print("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s"
                  % summaryValues)

            # sites which are able to serve this workflow
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites = getSiteWhiteList(
                wspec, siteInfo, reqmgrAuxSvc, reqSpecs)

            # 'datasets' and 'blocks' carry what was collected for all the workflows
            requests[wname] = dict(name=wname, datasets=datasets, blocks=datasetBlocks,
                                   npileups=npileups, size=size, njobs=njobs,
                                   nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies,
                                   sites=sites, allowedSites=allowedSites, parent=parent,
                                   lheInput=lheInput, primary=primary, secondary=secondary)
            print("sites", allowedSites)
            elapsedTime(t0, "getSiteWhiteList")

    totalValues = (len(winfo), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT)
    print("\ntotal # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)"
          % totalValues)
    elapsedTime(tst0, 'workflows info')
    elapsedTime(orig)
    return requests