Example #1
 def printQuotaSummary(self):
     """
     Print a summary of the current quotas, space usage and space available
     """
     self.logger.info("Summary of the current quotas in Terabytes:")
     for node in sorted(self.nodeUsage.keys()):
         msg = "  %s:\t\tbytes_limit: %.2f, bytes_used: %.2f, bytes_remaining: %.2f, "
         msg += "quota: %.2f, quota_avail: %.2f"
         self.logger.debug(
             msg, node, teraBytes(self.nodeUsage[node]['bytes_limit']),
             teraBytes(self.nodeUsage[node]['bytes']),
             teraBytes(self.nodeUsage[node]['bytes_remaining']),
             teraBytes(self.nodeUsage[node]['quota']),
             teraBytes(self.nodeUsage[node]['quota_avail']))
     self.logger.info("List of RSE's out of quota: %s",
                      self.outOfSpaceNodes)
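
# The examples above and below rely on a teraBytes() helper to convert raw byte
# counts into terabytes. A minimal sketch (an assumption, not the actual WMCore
# implementation), using decimal terabytes (1 TB = 10**12 bytes):
def teraBytes(sizeBytes):
    """Convert a size in bytes to terabytes (decimal, 1 TB = 10**12 bytes)."""
    return sizeBytes / float(1000 ** 4)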
Example #2
 def notifyLargeData(self, aboveWarningThreshold, transferId, wflowName, dataSize, dataIn):
     """
     Evaluates whether the amount of data placed is too large; if so, sends an email
     notification to a few people
     :param aboveWarningThreshold: boolean flag saying whether the threshold was exceeded or not
     :param transferId: rule/transfer request id
     :param wflowName: name of the workflow
     :param dataSize: total amount of data subscribed
     :param dataIn: short summary of the workflow data
     """
     # Warn about data transfer subscriptions going above some threshold
     if aboveWarningThreshold:
         emailSubject = "[MS] Large pending data transfer under request id: {}".format(transferId)
         emailMsg = "Workflow: {}\nhas a large amount of ".format(wflowName)
         emailMsg += "data subscribed: {} TB,\n".format(teraBytes(dataSize))
         emailMsg += "for {} data: {}.""".format(dataIn['type'], dataIn['name'])
         self.emailAlert.send(emailSubject, emailMsg)
         self.logger.info(emailMsg)
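
# Hypothetical caller sketch for notifyLargeData(), mirroring the threshold
# check done inline in makeTransferRequest() further below. The names msConfig,
# dataSize, transferId, wflow and dataIn follow the surrounding examples.
threshold = self.msConfig.get('warningTransferThreshold')
aboveWarningThreshold = threshold > 0. and dataSize > threshold
self.notifyLargeData(aboveWarningThreshold, transferId, wflow.getName(), dataSize, dataIn)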
Example #3
    def unifiedUnused(self):
        """
        FIXME FIXME TODO
        Leave this code in a different method until we evaluate what
        is needed and what is not, and refactor this thing...
        """
        # FIXME making pylint happy, remove these assignments
        requestNames = []
        uConfig = {}

        # requestNames = [r.getName() for r in workflows]
        # TODO: the logic below shows original unified port and it should be
        #       revisited wrt new proposal specs and unified codebase

        # get workflows from list of requests
        orig = time.time()
        time0 = time.time()
        requestWorkflows = self._getRequestWorkflows(requestNames)
        requestWorkflows = requestWorkflows.values()
        self.logger.debug(elapsedTime(time0, "### getWorkflows"))

        # get workflows info summaries and collect datasets we need to process
        winfo = workflowsInfo(requestWorkflows)
        datasets = [d for row in winfo.values() for d in row['datasets']]

        # find dataset info
        time0 = time.time()
        datasetBlocks, datasetSizes, _datasetTransfers = dbsInfo(
            datasets, self.msConfig['dbsUrl'])
        self.logger.debug(elapsedTime(time0, "### dbsInfo"))

        # find block nodes information for our datasets
        time0 = time.time()
        blockNodes = phedexInfo(datasets, self.msConfig['phedexUrl'])
        self.logger.debug(elapsedTime(time0, "### phedexInfo"))

        # find events-lumis info for our datasets
        time0 = time.time()
        eventsLumis = eventsLumisInfo(datasets, self.msConfig['dbsUrl'])
        self.logger.debug(elapsedTime(time0, "### eventsLumisInfo"))

        # get specs for all requests and re-use them later in getSiteWhiteList as a cache
        reqSpecs = self._getRequestSpecs(requestNames)

        # get a SiteInfo instance once and re-use it later; it is a time-consuming object to create
        siteInfo = SiteInfo(uConfig)

        requestsToProcess = []
        tst0 = time.time()
        totBlocks = totEvents = totSize = totCpuT = 0
        for wflow in requestWorkflows:
            for wname, wspec in wflow.items():
                time0 = time.time()
                cput = getComputingTime(wspec,
                                        eventsLumis=eventsLumis,
                                        dbsUrl=self.msConfig['dbsUrl'],
                                        logger=self.logger)
                ncopies = getNCopies(cput)

                attrs = winfo[wname]
                ndatasets = len(attrs['datasets'])
                npileups = len(attrs['pileups'])
                nblocks = nevts = nlumis = size = 0
                nodes = set()
                for dataset in attrs['datasets']:
                    blocks = datasetBlocks[dataset]
                    for blk in blocks:
                        for node in blockNodes.get(blk, []):
                            nodes.add(node)
                    nblocks += len(blocks)
                    size += datasetSizes[dataset]
                    edata = eventsLumis.get(dataset, {
                        'num_event': 0,
                        'num_lumi': 0
                    })
                    nevts += edata['num_event']
                    nlumis += edata['num_lumi']
                totBlocks += nblocks
                totEvents += nevts
                totSize += size
                totCpuT += cput
                sites = json.dumps(sorted(list(nodes)))
                self.logger.debug("### %s", wname)
                self.logger.debug(
                    "%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                    ndatasets, nblocks, size, teraBytes(size), nevts, nlumis,
                    cput, ncopies, sites)
                # find out which site can serve given workflow request
                t0 = time.time()
                lheInput, primary, parent, secondary, allowedSites \
                    = self._getSiteWhiteList(uConfig, wspec, siteInfo, reqSpecs)
                if not isinstance(primary, list):
                    primary = [primary]
                if not isinstance(secondary, list):
                    secondary = [secondary]
                wflowDatasets = primary + secondary
                wflowDatasetsBlocks = []
                for dset in wflowDatasets:
                    for item in datasetBlocks.get(dset, []):
                        wflowDatasetsBlocks.append(item)
                rdict = dict(name=wname,
                             datasets=wflowDatasets,
                             blocks=wflowDatasetsBlocks,
                             npileups=npileups,
                             size=size,
                             nevents=nevts,
                             nlumis=nlumis,
                             cput=cput,
                             ncopies=ncopies,
                             sites=sites,
                             allowedSites=allowedSites,
                             parent=parent,
                             lheInput=lheInput,
                             primary=primary,
                             secondary=secondary)
                requestsToProcess.append(rdict)
                self.logger.debug(elapsedTime(t0, "### getSiteWhiteList"))
        self.logger.debug(
            "total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
            len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize,
            teraBytes(totSize), totCpuT)
        self.logger.debug(elapsedTime(tst0, '### workflows info'))
        self.logger.debug(elapsedTime(orig, '### total time'))
        return requestsToProcess
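
# All timing statements above assume an elapsedTime() helper. A minimal sketch
# (an assumption, not the actual implementation) returning a message that the
# caller can pass to a logger:
import time

def elapsedTime(time0, msg='', ndigits=1):
    """Return a message reporting the wall-clock time elapsed since time0."""
    return "%s, elapsed time %s sec" % (msg, round(time.time() - time0, ndigits))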
Example #4
def unified(svc, requestRecords, logger):
    """
    Unified Transferror box

    Input parameters:
    :param svc: services object passed down to getSiteWhiteList
    :param requestRecords: list of request records, see definition in requestRecord
    :param logger: logger
    """
    # get aux info for dataset/blocks from inputs/parents/pileups
    # make subscriptions based on site white/black lists
    logger.debug("### unified transferor")

    requests = [r['name'] for r in requestRecords]

    ### TODO: the logic below shows original unified port and it should be
    ###       revisited wrt new proposal specs and unified codebase

    # get workflows from list of requests 
    orig = time.time()
    time0 = time.time()
    requestWorkflows = getRequestWorkflows(requests)
    workflows = requestWorkflows.values()
    logger.debug(elapsedTime(time0, "### getWorkflows"))

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(workflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    logger.debug(elapsedTime(time0, "### dbsInfo"))

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    logger.debug(elapsedTime(time0, "### phedexInfo"))

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    logger.debug(elapsedTime(time0, "### eventsLumisInfo"))

    # get specs for all requests and re-use them later in getSiteWhiteList as a cache
    requests = [v['RequestName'] for w in workflows for v in w.values()]
    reqSpecs = getRequestSpecs(requests)

    # get a SiteInfo instance once and re-use it later; it is a time-consuming object to create
    siteInfo = SiteInfo()

    requestsToProcess = []
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {'num_event': 0, 'num_lumi': 0})
                nevts += edata['num_event']
                nlumis += edata['num_lumi']
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            logger.debug("### %s", wname)
            logger.debug("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s", ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)
            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                = getSiteWhiteList(svc, wspec, siteInfo, reqSpecs)
            rdict = dict(name=wname, datasets=datasets, blocks=datasetBlocks,
                         npileups=npileups, size=size,
                         nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies,
                         sites=sites, allowedSites=allowedSites, parent=parent,
                         lheInput=lheInput, primary=primary, secondary=secondary)
            requestsToProcess.append(rdict)
            logger.debug(elapsedTime(t0, "### getSiteWhiteList"))
    logger.debug("total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)", len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT)
    logger.debug(elapsedTime(tst0, '### workflows info'))
    logger.debug(elapsedTime(orig, '### total time'))

    return requestsToProcess
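
# Hypothetical usage sketch for unified(): in this example only the 'name' key
# of each request record is read; svc is the services object set up by the caller,
# and the record names below are placeholders for illustration only.
import logging

logger = logging.getLogger(__name__)
requestRecords = [{'name': 'request_A'}, {'name': 'request_B'}]  # minimal illustrative records
requestsToProcess = unified(svc, requestRecords, logger)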
Example #5
    def makeTransferRequest(self, wflow):
        """
        Send request to PhEDEx and return status of request subscription
        This method does the following:
          1. return if there is no workflow data to be transferred
          2. check if the data input campaign is in the database, skip if not
          3. _getValidSites: using the workflow site lists and the campaign configuration,
             find a common list of sites (converted to PNNs). If the PNN is out of quota,
             it's also removed from this list
          4. create the transfer record dictionary
          5. for every final node
             5.1. if it's a pileup dataset, pick a random node and subscribe the whole dataset
             5.2. else, retrieve chunks of blocks to be subscribed (evenly distributed)
             5.3. update node usage with the amount of data subscribed
          6. re-evaluate nodes with quota exceeded
          7. return the transfer record, with a list of transfer IDs
        :param wflow: workflow object
        :return: boolean whether it succeeded or not, and a subscription dictionary {"dataset":transferIDs}
        """
        response = []
        success = True
        if not (wflow.getParentBlocks() or wflow.getPrimaryBlocks() or wflow.getSecondarySummary()):
            self.logger.info("Request %s does not have any further data to transfer", wflow.getName())
            return success, response

        self.logger.info("Handling data subscriptions for request: %s", wflow.getName())

        for dataIn in wflow.getDataCampaignMap():
            if dataIn["type"] == "parent":
                msg = "Skipping 'parent' data subscription (done with the 'primary' data), for: %s" % dataIn
                self.logger.info(msg)
                continue
            elif dataIn["type"] == "secondary" and dataIn['name'] not in wflow.getSecondarySummary():
                # secondary already in place
                continue
            if dataIn['campaign'] not in self.campaigns:
                msg = "Data placement can't proceed because campaign '%s' was not found." % dataIn["campaign"]
                msg += " Skipping this workflow until the campaign gets created."
                self.logger.warning(msg)
                return False, response

            nodes = self._getValidSites(wflow, dataIn)
            if not nodes:
                msg = "There are no RSEs with available space for %s. " % wflow.getName()
                msg += "Skipping this workflow until RSEs get enough free space"
                self.logger.warning(msg)
                return False, response

            transRec = newTransferRec(dataIn)
            for blocks, dataSize, idx in self._decideDataDestination(wflow, dataIn, len(nodes)):
                if not blocks and dataIn["type"] == "primary":
                    # no valid files in any blocks, it will likely fail in global workqueue
                    return success, response
                if blocks:
                    subLevel = "block"
                    data = {dataIn['name']: blocks}
                else:
                    # then it's a dataset level subscription
                    subLevel = "dataset"
                    data = None

                subscription = PhEDExSubscription(datasetPathList=dataIn['name'],
                                                  nodeList=nodes[idx],
                                                  group=self.msConfig['quotaAccount'],
                                                  level=subLevel,
                                                  priority="low",
                                                  request_only=self.msConfig["phedexRequestOnly"],
                                                  blocks=data,
                                                  comments="WMCore MicroService automated subscription")
                msg = "Creating '%s' level subscription for %s dataset: %s" % (subscription.level,
                                                                               dataIn['type'],
                                                                               dataIn['name'])
                if wflow.getParentDataset():
                    msg += ", where parent blocks have also been added for dataset: %s" % wflow.getParentDataset()
                self.logger.info(msg)

                if self.msConfig.get('enableDataTransfer', True):
                    # Force request-only subscription
                    # to any data transfer going above some threshold (do not auto-approve)
                    aboveWarningThreshold = self.msConfig.get('warningTransferThreshold') > 0. and \
                        dataSize > self.msConfig.get('warningTransferThreshold')
                    if aboveWarningThreshold and subscription.request_only != "y":
                        subscription.request_only = "y"

                    # Then make the data subscription, for real!!!
                    success, transferId = self._subscribeData(subscription, wflow.getName(), dataIn['name'])
                    if not success:
                        break
                    if transferId:
                        transRec['transferIDs'].add(transferId)

                    # Warn about data transfer subscriptions going above some threshold
                    if aboveWarningThreshold:
                        emailSubject = "[MS] Large pending data transfer under request id: {transferid}".format(
                            transferid=transferId)
                        emailMsg = "Workflow: {}\nhas a large amount of ".format(wflow.getName())
                        emailMsg += "data subscribed: {} TB,\n".format(teraBytes(dataSize))
                        emailMsg += "for {} data: {}.""".format(dataIn['type'], dataIn['name'])
                        self.emailAlert.send(emailSubject, emailMsg)
                        self.logger.info(emailMsg)

                    # and update some instance caches
                    self.rseQuotas.updateNodeUsage(nodes[idx], dataSize)
                    if subLevel == 'dataset':
                        self.dsetCounter += 1
                    else:
                        self.blockCounter += len(blocks)
                else:
                    self.logger.info("DRY-RUN: making subscription: %s", subscription)

            transRec['transferIDs'] = list(transRec['transferIDs'])
            response.append(transRec)

        # once the workflow has been completely processed, update the node usage
        self.rseQuotas.evaluateQuotaExceeded()
        return success, response
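
# Sketch of the newTransferRec() helper assumed above. Only the 'transferIDs'
# set is required by the code shown; the remaining keys are illustrative
# assumptions about the record layout, not the actual implementation.
def newTransferRec(dataIn):
    """Create a fresh transfer record for a workflow data/campaign entry."""
    return {"dataset": dataIn['name'],
            "dataType": dataIn['type'],
            "campaignName": dataIn['campaign'],
            "transferIDs": set()}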
Example #6
def requestsInfo(reqmgrAuxSvc, state='assignment-approved'):
    """
    Helper function to get information about all requests
    in the given ReqMgr state (default: assignment-approved)
    """
    # get list of known requests in workqueue
    requestJobs = workqueueRequests(state)
    requests = requestJobs.keys()

    # get workflows from list of requests 
    time0 = orig = time.time()
    requestWorkflows = getRequestWorkflows(requests)
    workflows = requestWorkflows.values()
    elapsedTime(time0, "### getWorkflows")

#     time0 = orig = time.time()
#     workflows = getWorkflows(state)
#     elapsedTime(time0, "### getWorkflows")

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(workflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    elapsedTime(time0, "### dbsInfo")

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    elapsedTime(time0, "### phedexInfo")

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    elapsedTime(time0, "### eventsLumisInfo")

    # get specs for all requests and re-use them later in getSiteWhiteList as a cache
    requests = [v['RequestName'] for w in workflows for v in w.values()]
    reqSpecs = getRequestSpecs(requests)

    # get a SiteInfo instance once and re-use it later; it is a time-consuming object to create
    siteInfo = SiteInfo()

    requests = {}
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {'num_event':0, 'num_lumi':0})
                nevts += edata['num_event']
                nlumis += edata['num_lumi']
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            njobs = requestJobs[wname]
            print("\n### %s" % wname)
            print("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s" \
                    % (ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites))
            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                    = getSiteWhiteList(wspec, siteInfo, reqmgrAuxSvc, reqSpecs)
            rdict = dict(name=wname, datasets=datasets, blocks=datasetBlocks,\
                    npileups=npileups, size=size, njobs=njobs,\
                    nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies,\
                    sites=sites, allowedSites=allowedSites, parent=parent,\
                    lheInput=lheInput, primary=primary, secondary=secondary)
            requests[wname] = rdict
            print("sites", allowedSites)
            elapsedTime(t0, "getSiteWhiteList")
    print("\ntotal # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)" \
            % (len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT))
    elapsedTime(tst0, 'workflows info')
    elapsedTime(orig)
    return requests
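
# Shape of the workflowsInfo() summary assumed by the loop above: a mapping of
# workflow name to per-workflow attributes. The values below are placeholders
# for illustration only; real summaries may carry additional keys.
winfoExample = {
    "workflow_name": {
        "datasets": ["/PrimaryDS/Campaign-v1/TIER"],
        "pileups": ["/PileupDS/Campaign-v1/TIER"],
    }
}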
Example #7
def requestsInfo(reqmgrAuxSvc, state='assignment-approved'):
    """
    Helper function to get information about all requests
    in the given ReqMgr state (default: assignment-approved)
    """
    # get list of known requests in workqueue
    requestJobs = workqueueRequests(state)
    requests = requestJobs.keys()

    # get workflows from list of requests
    time0 = orig = time.time()
    requestWorkflows = getRequestWorkflows(requests)
    workflows = requestWorkflows.values()
    elapsedTime(time0, "### getWorkflows")

    #     time0 = orig = time.time()
    #     workflows = getWorkflows(state)
    #     elapsedTime(time0, "### getWorkflows")

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(workflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    elapsedTime(time0, "### dbsInfo")

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    elapsedTime(time0, "### phedexInfo")

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    elapsedTime(time0, "### eventsLumisInfo")

    # get specs for all requests and re-use them later in getSiteWhiteList as a cache
    requests = [v['RequestName'] for w in workflows for v in w.values()]
    reqSpecs = getRequestSpecs(requests)

    # get a SiteInfo instance once and re-use it later; it is a time-consuming object to create
    siteInfo = SiteInfo()

    requests = {}
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {
                    'num_event': 0,
                    'num_lumi': 0
                })
                nevts += edata['num_event']
                nlumis += edata['num_lumi']
            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            njobs = requestJobs[wname]
            print("\n### %s" % wname)
            print("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s" \
                  % (ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites))
            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                = getSiteWhiteList(wspec, siteInfo, reqmgrAuxSvc, reqSpecs)
            rdict = dict(name=wname, datasets=datasets, blocks=datasetBlocks, \
                         npileups=npileups, size=size, njobs=njobs, \
                         nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies, \
                         sites=sites, allowedSites=allowedSites, parent=parent, \
                         lheInput=lheInput, primary=primary, secondary=secondary)
            requests[wname] = rdict
            print("sites", allowedSites)
            elapsedTime(t0, "getSiteWhiteList")
    print("\ntotal # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)" \
          % (len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT))
    elapsedTime(tst0, 'workflows info')
    elapsedTime(orig)
    return requests