Example #1
    def _decideDataDestination(self, wflow, dataIn, numNodes):
        """
        Given a global list of blocks and the campaign configuration,
        decide which blocks have to be transferred and to where.
        :param wflow: workflow object
        :param dataIn: dictionary with a summary of the data to be placed
        :param numNodes: amount of nodes/RSEs that can receive data
        :return: yields a block list, the total chunk size and a node index
        """
        # FIXME: implement multiple copies (MaxCopies > 1)
        blockList = []
        dsetName = dataIn["name"]

        ### NOTE: data placement is done on a block basis
        if dataIn["type"] == "primary":
            # DQMHarvest workflows are an exception: the whole dataset must be
            # placed at a single location
            if wflow.getReqType() == "DQMHarvest":
                numNodes = 1
            # if there is no parent data, just make one big rule for all the primary data
            # against all RSEs available for the workflow (intersection with PU data)
            if not wflow.getParentBlocks():
                numNodes = 1
            listBlockSets, listSetsSize = wflow.getChunkBlocks(numNodes)
            if not listBlockSets:
                self.logger.warning(
                    "  found 0 primary/parent blocks for dataset: %s, moving on...",
                    dsetName)
                yield blockList, 0, 0
            for idx, blocksSet in enumerate(listBlockSets):
                self.logger.info(
                    "Have a chunk of %d blocks (%s GB) for dataset: %s",
                    len(blocksSet), gigaBytes(listSetsSize[idx]), dsetName)
                yield blocksSet, listSetsSize[idx], idx
        ### NOTE: data placement is done on a dataset basis
        elif dataIn["type"] == "secondary":
            # secondary datasets are transferred as a whole, until better days...
            dsetSize = wflow.getSecondarySummary()
            dsetSize = dsetSize[dsetName]['dsetSize']
            # randomly pick one of the PNNs to put the whole pileup dataset in
            idx = randint(0, numNodes - 1)
            self.logger.info("Have whole PU dataset: %s (%s GB)", dsetName,
                             gigaBytes(dsetSize))
            yield blockList, dsetSize, idx
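
The generator above is meant to be consumed chunk by chunk, pairing each yielded tuple with an RSE. The following standalone sketch illustrates that consumption pattern; placeChunks, the block names and the RSE names are made up, they are not part of the class.

def placeChunks(chunks, rseList):
    """Pair each yielded (blockSet, chunkSize, nodeIdx) tuple with an RSE name."""
    rules = []
    for blockSet, chunkSize, nodeIdx in chunks:
        if not blockSet:
            # an empty block set with size 0 means there was nothing to place
            continue
        rules.append((rseList[nodeIdx], sorted(blockSet), chunkSize))
    return rules

# hypothetical chunks, in the shape yielded by _decideDataDestination
chunks = [({"blockA", "blockB"}, 3 * 10 ** 9, 0), ({"blockC"}, 10 ** 9, 1)]
print(placeChunks(chunks, ["T1_US_FNAL_Disk", "T2_CH_CERN"]))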
Example #2
    def _checkPrimaryDataVolume(self, wflow, wflowPnns):
        """
        Calculate the total data volume already available in the
        restricted list of PNNs, such that we can minimize primary/
        parent data transfers
        :param wflow: a workflow object
        :param wflowPnns: set with the allowed PNNs to receive data
        :return: a set with the PNN that already hosts most of the data
        """
        msg = "Checking primary data volume for: %s, allowed PNNs: %s"
        self.logger.info(msg, wflow.getName(), wflowPnns)

        volumeByPNN = dict()
        for pnn in wflowPnns:
            volumeByPNN.setdefault(pnn, 0)

        for methodName in ("getPrimaryBlocks", "getParentBlocks"):
            inputBlocks = getattr(wflow, methodName)()
            self.logger.info("Request %s has %d initial blocks from %s",
                             wflow.getName(), len(inputBlocks), methodName)

            for block, blockDict in viewitems(inputBlocks):
                blockLocation = self._diskPNNs(blockDict['locations'])
                commonLocation = wflowPnns & set(blockLocation)
                if not commonLocation:
                    continue
                for pnn in commonLocation:
                    volumeByPNN[pnn] += blockDict['blockSize']

        maxSize = 0
        finalPNN = set()
        self.logger.info("Primary/parent data volume currently available:")
        for pnn, size in viewitems(volumeByPNN):
            self.logger.info("  PNN: %s\t\tData volume: %s GB", pnn,
                             gigaBytes(size))
            if size > maxSize:
                maxSize = size
                finalPNN = {pnn}
            elif size == maxSize:
                finalPNN.add(pnn)
        self.logger.info(
            "The PNN that would require less data to be transferred is: %s",
            finalPNN)
        if len(finalPNN) > 1:
            # randomly pick one site from the list. It could pick the one with the
            # highest available quota, but that might overload that one site...
            # make sure the result is still a set object
            finalPNN = choice(list(finalPNN))
            finalPNN = {finalPNN}
            self.logger.info("Randomly picked PNN: %s as final location",
                             finalPNN)

        return finalPNN
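
The selection rule above (keep the PNN(s) holding the most data, break ties randomly, always return a set) can be isolated into a small standalone function. This is an illustrative sketch only; pickLargestPNN and the PNN names are hypothetical.

from random import choice

def pickLargestPNN(volumeByPNN):
    """Return a one-element set with the PNN holding the most data; ties are broken randomly."""
    maxSize = 0
    finalPNN = set()
    for pnn, size in volumeByPNN.items():
        if size > maxSize:
            maxSize = size
            finalPNN = {pnn}
        elif size == maxSize:
            finalPNN.add(pnn)
    if len(finalPNN) > 1:
        finalPNN = {choice(list(finalPNN))}
    return finalPNN

# ties are resolved randomly, so either of the two 3000 GB sites may be returned (always as a set)
print(pickLargestPNN({"T1_US_FNAL_Disk": 3000, "T2_CH_CERN": 3000, "T2_DE_DESY": 1000}))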
Example #3
    def __init__(self, dataAcct, quotaFraction, **kwargs):
        """
        Executes a basic setup, including proper logging.
        :param dataAcct: string with the Rucio account
        :param quotaFraction: floating point number representing the fraction of the quota
        :param kwargs: the supported keyword arguments are:
          minimumThreshold: integer defining the minimum available space required, in bytes
          verbose: logger verbosity
          logger: logger object
        """
        self.dataAcct = dataAcct
        self.quotaFraction = quotaFraction

        self.minimumSpace = kwargs["minimumThreshold"]
        self.logger = getMSLogger(kwargs.get("verbose"), kwargs.get("logger"))
        msg = "RSEQuotas started with parameters: dataAcct=%s, quotaFraction=%s, "
        msg += "minimumThreshold=%s GB"
        self.logger.info(msg, dataAcct, quotaFraction,
                         gigaBytes(self.minimumSpace))

        self.nodeUsage = {}
        self.availableRSEs = set()
        self.outOfSpaceNodes = set()
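
As a rough illustration of how the two knobs above are typically meant to interact (an assumption, not code taken from this class): a site's total quota would be scaled down by quotaFraction, and anything below minimumThreshold would be treated as unusable. The helper and all values below are hypothetical and expressed in bytes.

TERA = 1000 ** 4  # bytes in a terabyte (decimal)

def usableQuota(totalQuota, quotaFraction, minimumThreshold):
    """Hypothetical helper: usable share of a quota, or 0 if below the minimum."""
    usable = totalQuota * quotaFraction
    return usable if usable >= minimumThreshold else 0

print(usableQuota(100 * TERA, 0.8, 2 * TERA))  # 80 TB usable
print(usableQuota(2 * TERA, 0.5, 2 * TERA))    # 1 TB, below the 2 TB minimum -> 0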
Example #4
    def getChunkBlocks(self, numChunks=1):
        """
        Break down the input and parent blocks by a given number
        of chunks (usually the amount of sites available for data
        placement).
        :param numChunks: integer representing the number of chunks to be created
        :return: two lists:
          * a list of sets, where each set corresponds to a group of blocks to be
            transferred to a single location;
          * a list of integers with the total size of each chunk in the list
            above (same order).
        """
        if numChunks == 1:
            thisChunk = set()
            thisChunk.update(list(self.getPrimaryBlocks()))
            thisChunkSize = sum([
                blockInfo['blockSize']
                for blockInfo in viewvalues(self.getPrimaryBlocks())
            ])
            if self.getParentDataset():
                thisChunk.update(list(self.getParentBlocks()))
                thisChunkSize += sum([
                    blockInfo['blockSize']
                    for blockInfo in viewvalues(self.getParentBlocks())
                ])
            # keep same data structure as multiple chunks, so list of lists
            return [thisChunk], [thisChunkSize]

        # create a list of blocks sorted by their sizes, in descending order
        sortedPrimary = sorted(viewitems(self.getPrimaryBlocks()),
                               key=lambda item: item[1]['blockSize'],
                               reverse=True)
        if len(sortedPrimary) < numChunks:
            msg = "There are less blocks than chunks to create. "
            msg += "Reducing numChunks from %d to %d" % (numChunks,
                                                         len(sortedPrimary))
            self.logger.info(msg)
            numChunks = len(sortedPrimary)
        chunkSize = sum(item[1]['blockSize']
                        for item in sortedPrimary) // numChunks

        self.logger.info("Found %d blocks and the avg chunkSize is: %s GB",
                         len(sortedPrimary), gigaBytes(chunkSize))
        # list of sets with the block names
        blockChunks = []
        # list of integers with the total block sizes in each chunk (same order as above)
        sizeChunks = []
        for i in range(numChunks):
            thisChunk = set()
            thisChunkSize = 0
            idx = 0
            while True:
                self.logger.debug("Chunk: %d and idx: %s and length: %s", i,
                                  idx, len(sortedPrimary))
                if not sortedPrimary or idx >= len(sortedPrimary):
                    # then all blocks have been distributed
                    break
                elif not thisChunkSize:
                    # then this site/chunk is empty, assign a block to it
                    thisChunk.add(sortedPrimary[idx][0])
                    thisChunkSize += sortedPrimary[idx][1]['blockSize']
                    sortedPrimary.pop(idx)
                elif thisChunkSize + sortedPrimary[idx][1][
                        'blockSize'] <= chunkSize:
                    thisChunk.add(sortedPrimary[idx][0])
                    thisChunkSize += sortedPrimary[idx][1]['blockSize']
                    sortedPrimary.pop(idx)
                else:
                    idx += 1
            if thisChunk:
                blockChunks.append(thisChunk)
                sizeChunks.append(thisChunkSize)

        # now take care of the leftovers... in a round-robin style....
        while sortedPrimary:
            for chunkNum in range(numChunks):
                blockChunks[chunkNum].add(sortedPrimary[0][0])
                sizeChunks[chunkNum] += sortedPrimary[0][1]['blockSize']
                sortedPrimary.pop(0)
                if not sortedPrimary:
                    break
        self.logger.info("Created %d primary data chunks out of %d chunks",
                         len(blockChunks), numChunks)
        self.logger.info("    with chunk size distribution: %s", sizeChunks)

        if not self.getParentDataset():
            return blockChunks, sizeChunks

        # now add the parent blocks; considering that the input blocks were evenly
        # distributed, I'd expect the same to automatically happen to the parents...
        childParent = self.getChildToParentBlocks()
        parentsSize = self.getParentBlocks()
        for chunkNum in range(numChunks):
            parentSet = set()
            for child in blockChunks[chunkNum]:
                parentSet.update(childParent[child])

            # now with the final list of parents in hand, update the list
            # of blocks within the chunk and update the chunk size as well
            blockChunks[chunkNum].update(parentSet)
            for parent in parentSet:
                sizeChunks[chunkNum] += parentsSize[parent]['blockSize']
        self.logger.info(
            "Created %d primary+parent data chunks out of %d chunks",
            len(blockChunks), numChunks)
        self.logger.info("    with chunk size distribution: %s", sizeChunks)
        return blockChunks, sizeChunks
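
The chunking strategy above (sort blocks by size, greedily fill each chunk up to the average chunk size, then spread leftovers round-robin) can be illustrated with a standalone sketch operating on a plain {blockName: size} mapping. chunkBlocks and the block names are hypothetical; the real method works on the workflow's block dictionaries and also folds in the parent blocks.

def chunkBlocks(blockSizes, numChunks):
    """Greedy partition of blocks into roughly equally sized chunks."""
    sortedBlocks = sorted(blockSizes.items(), key=lambda item: item[1], reverse=True)
    numChunks = min(numChunks, len(sortedBlocks)) or 1
    targetSize = sum(size for _, size in sortedBlocks) // numChunks
    chunks, sizes = [], []
    for _ in range(numChunks):
        thisChunk, thisSize = set(), 0
        idx = 0
        while idx < len(sortedBlocks):
            name, size = sortedBlocks[idx]
            if not thisSize or thisSize + size <= targetSize:
                # empty chunk always takes a block; otherwise only if it still fits
                thisChunk.add(name)
                thisSize += size
                sortedBlocks.pop(idx)
            else:
                idx += 1
        if thisChunk:
            chunks.append(thisChunk)
            sizes.append(thisSize)
    # blocks that did not fit anywhere are spread round-robin
    while sortedBlocks:
        for i in range(len(chunks)):
            if not sortedBlocks:
                break
            name, size = sortedBlocks.pop(0)
            chunks[i].add(name)
            sizes[i] += size
    return chunks, sizes

# two chunks of 50 each: ({'b1', 'b4'}, {'b2', 'b3'})
print(chunkBlocks({"b1": 40, "b2": 30, "b3": 20, "b4": 10}, 2))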
Example #5
    def checkPUDataLocation(self, wflow):
        """
        Check the workflow configuration - in terms of AAA - and the secondary
        pileup distribution; if possible, remove the pileup dataset(s) from the
        next step, where data placement happens.
        If the workflow has XRootD/AAA enabled, the data location can be outside
        of the SiteWhitelist.
        :param wflow: workflow object
        """
        pileupInput = wflow.getSecondarySummary()
        if not pileupInput:
            # nothing to be done here
            return

        wflowPnns = self._getPNNsFromPSNs(wflow.getSitelist())
        secondaryAAA = wflow.getReqParam("TrustPUSitelists")
        msg = "Checking secondary data location for request: {}, ".format(
            wflow.getName())
        msg += "TrustPUSitelists: {}, request white/black list PNNs: {}".format(
            secondaryAAA, wflowPnns)
        self.logger.info(msg)

        if secondaryAAA:
            # what matters is to have pileup dataset(s) available in ANY disk storage
            for dset, dsetDict in listitems(
                    pileupInput):  # dict can change size here
                datasetLocation = self._diskPNNs(dsetDict['locations'])
                msg = "it has secondary: %s, total size: %s GB, disk locations: %s"
                self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']),
                                 datasetLocation)
                if datasetLocation:
                    self.logger.info(
                        "secondary dataset %s already in place through AAA: %s",
                        dset, datasetLocation)
                    pileupInput.pop(dset)
                else:
                    self.logger.info(
                        "secondary dataset %s not available even through AAA",
                        dset)
        else:
            if len(pileupInput) == 1:
                for dset, dsetDict in listitems(
                        pileupInput):  # dict can change size here
                    datasetLocation = self._diskPNNs(dsetDict['locations'])
                    msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                    self.logger.info(msg, dset,
                                     gigaBytes(dsetDict['dsetSize']),
                                     datasetLocation)
                    commonLocation = wflowPnns & set(datasetLocation)
                    if commonLocation:
                        msg = "secondary dataset: %s already in place. "
                        msg += "Common locations with site white/black list is: %s"
                        self.logger.info(msg, dset, commonLocation)
                        pileupInput.pop(dset)
                        wflow.setPURSElist(commonLocation)
                    else:
                        self.logger.info(
                            "secondary: %s will need data placement!!!", dset)
            elif len(pileupInput) >= 2:
                # then make sure multiple pileup datasets are available at the same location
                # Note: avoid transferring the biggest one
                largestSize = 0
                largestDset = ""
                for dset, dsetDict in viewitems(pileupInput):
                    if dsetDict['dsetSize'] > largestSize:
                        largestSize = dsetDict['dsetSize']
                        largestDset = dset
                datasetLocation = self._diskPNNs(
                    pileupInput[largestDset]['locations'])
                msg = "it has multiple pileup datasets, the largest one is: %s,"
                msg += "total size: %s GB, current disk locations: %s"
                self.logger.info(msg, largestDset, gigaBytes(largestSize),
                                 datasetLocation)
                commonLocation = wflowPnns & set(datasetLocation)
                if commonLocation:
                    self.logger.info(
                        "Largest secondary dataset %s already in place: %s",
                        largestDset, datasetLocation)
                    pileupInput.pop(largestDset)
                    wflow.setPURSElist(commonLocation)
                else:
                    self.logger.info(
                        "Largest secondary dataset %s is not available in a common location. This is BAD!",
                        largestDset)
                # now iterate normally through the pileup datasets
                for dset, dsetDict in listitems(
                        pileupInput):  # dict can change size here
                    datasetLocation = self._diskPNNs(dsetDict['locations'])
                    msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                    self.logger.info(msg, dset,
                                     gigaBytes(dsetDict['dsetSize']),
                                     datasetLocation)
                    commonLocation = wflowPnns & set(datasetLocation)
                    if not commonLocation:
                        msg = "secondary dataset: %s not in any common location. Its current locations are: %s"
                        self.logger.info(msg, dset, datasetLocation)
                    elif commonLocation and not wflow.getPURSElist():
                        # then it's the first pileup dataset available within the SiteWhitelist,
                        # force its common location for the workflow from now on
                        msg = "secondary dataset: %s already in place: %s, common location: %s"
                        msg += ". Forcing the whole workflow to this new common location."
                        self.logger.info(msg, dset, datasetLocation,
                                         commonLocation)
                        pileupInput.pop(dset)
                        wflow.setPURSElist(commonLocation)
                    else:
                        # pileup RSE list has already been defined. Get the new common location
                        newCommonLocation = commonLocation & wflow.getPURSElist(
                        )
                        if newCommonLocation:
                            msg = "secondary dataset: %s already in place. "
                            msg += "New common locations with site white/black list is: %s"
                            self.logger.info(msg, dset, newCommonLocation)
                            pileupInput.pop(dset)
                            wflow.setPURSElist(newCommonLocation)
                        else:
                            msg = "secondary dataset: %s is currently available within the site white/black list: %s"
                            msg += " But there is no common location with the other(s) pileup datasets: %s"
                            msg += " It will need data placement!!!"
                            self.logger.info(msg, dset, commonLocation,
                                             wflow.getPURSElist())

        # check if there are remaining pileup datasets to be placed;
        # we need to figure out their location NOW!
        if wflow.getSecondarySummary() and not wflow.getPURSElist():
            pnns = self._findFinalPULocation(wflow)
            wflow.setPURSElist(pnns)
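
The core of the multi-pileup handling above is an intersection-narrowing rule: every pileup dataset that overlaps with the currently allowed PNNs shrinks that allowed set further, while datasets with no overlap are flagged for data placement. Below is a standalone sketch of that rule with hypothetical dataset and PNN names.

allowedPNNs = {"T1_US_FNAL_Disk", "T2_CH_CERN", "T2_DE_DESY"}
pileupLocations = {
    "/PU-MinBias/Hypothetical-v1/PREMIX": {"T1_US_FNAL_Disk", "T2_CH_CERN"},
    "/PU-Neutrino/Hypothetical-v1/PREMIX": {"T2_CH_CERN"},
}

commonLocation = set(allowedPNNs)
for dset, locations in pileupLocations.items():
    overlap = commonLocation & locations
    if overlap:
        # this pileup is already available; restrict the workflow to the overlap
        commonLocation = overlap
    else:
        print("%s has no common location and will need data placement" % dset)
print("Pileup RSE list narrowed down to: %s" % commonLocation)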
Example #6
    def makeTapeSubscriptions(self, workflow):
        """
        Makes the output data placement to the Tape endpoints. It works with either
        PhEDEx or Rucio (configurable). It also relies on the Unified configuration
        to decide whether a given datatier can go to tape, and where it can be auto-approved.
        :param workflow: a MSOutputTemplate object representing a workflow
        :return: the MSOutputTemplate object itself (with the necessary updates in place)
        """
        # if anything fails along the way, set it back to "pending"
        transferStatus = "done"

        # this RSE name will be used for all output datasets to be subscribed
        # within this workflow
        dataBytesForTape = self._getDataVolumeForTape(workflow)
        tapeRSE, requiresApproval = self._getTapeDestination(dataBytesForTape)
        self.logger.info(
            "Workflow: %s, total output size: %s GB, against RSE: %s",
            workflow['RequestName'], gigaBytes(dataBytesForTape), tapeRSE)
        for dMap in workflow['OutputMap']:
            if not self.canDatasetGoToTape(dMap, workflow):
                continue

            # assign the tape destination chosen above to this output dataset
            dMap['TapeDestination'] = tapeRSE
            ruleAttrs = {
                'activity': 'Production Output',
                'account': self.msConfig['rucioAccount'],
                'copies': 1,
                'grouping': "ALL",
                'ask_approval': requiresApproval,
                'comment': 'WMCore MSOutput output data placement'
            }
            msg = "Creating Rucio TAPE rule for container: {} and RSE: {}".format(
                dMap['Dataset'], dMap['TapeDestination'])
            self.logger.info(msg)

            if self.msConfig['enableDataPlacement']:
                resp = self.rucio.createReplicationRule(
                    dMap['Dataset'], dMap['TapeDestination'], **ruleAttrs)
                if not resp:
                    # then the call failed
                    transferStatus = "pending"
                elif len(resp) == 1:
                    dMap['TapeRuleID'] = resp[0]
                elif len(resp) > 1:
                    msg = "Tape rule creation returned multiple rule IDs and it needs to be investigated!!! "
                    msg += "For DID: {}, rseExpr: {} and rucio account: {}".format(
                        dMap['Dataset'], dMap['TapeDestination'],
                        ruleAttrs['account'])
                    self.logger.critical(msg)
                    return workflow
            else:
                msg = "DRY-RUN RUCIO: skipping tape rule creation for DID: {}, ".format(
                    dMap['Dataset'])
                msg += "rseExpr: {} and standard parameters: {}".format(
                    dMap['TapeDestination'], ruleAttrs)
                self.logger.info(msg)

        # Finally, update the MSOutput template document with either partial or
        # complete transfer ids
        self.docKeyUpdate(workflow, OutputMap=workflow['OutputMap'])
        workflow.updateTime()
        # NOTE: updating the TransferStatus at this stage is a bit trickier; we
        # cannot bypass bad disk data placements!
        if transferStatus == "done" and workflow['TransferStatus'] == "done":
            self.logger.info(
                "All the tape requests succeeded for: %s. Marking it as 'done'",
                workflow['RequestName'])
        elif transferStatus == "done" and workflow[
                'TransferStatus'] == "pending":
            self.logger.info(
                "All the tape requests succeeded for: %s, but disk ones are still pending",
                workflow['RequestName'])
        elif transferStatus == "pending" and workflow[
                'TransferStatus'] == "done":
            self.logger.info(
                "Tape requests partially successful for: %s. Marking it as 'pending'",
                workflow['RequestName'])
            self.docKeyUpdate(workflow, TransferStatus='pending')
        else:
            self.logger.info(
                "Tape requests partially successful for: %s. Keeping it as 'pending'",
                workflow['RequestName'])

        return workflow
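
The status bookkeeping at the end boils down to a simple rule: the workflow stays (or becomes) "done" only when both the tape subscriptions above and the earlier disk placements are "done"; a "pending" on either side keeps it "pending". A minimal sketch of that decision follows; resolveTransferStatus is hypothetical, the method above inlines this logic together with the logging and the document update.

def resolveTransferStatus(tapeStatus, diskStatus):
    """Hypothetical helper mirroring the decision table above."""
    if tapeStatus == "done" and diskStatus == "done":
        return "done"
    return "pending"

assert resolveTransferStatus("done", "done") == "done"
assert resolveTransferStatus("done", "pending") == "pending"
assert resolveTransferStatus("pending", "done") == "pending"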