def _decideDataDestination(self, wflow, dataIn, numNodes):
    """
    Given a global list of blocks and the campaign configuration,
    decide which blocks have to be transferred and to where.
    :param wflow: workflow object
    :param dataIn: dictionary with a summary of the data to be placed
    :param numNodes: amount of nodes/RSEs that can receive data
    :return: yield a block list, the total chunk size and a node index
    """
    # FIXME: implement multiple copies (MaxCopies > 1)
    blockList = []
    dsetName = dataIn["name"]
    ### NOTE: data placement done on a block basis
    if dataIn["type"] == "primary":
        # except for DQMHarvest workflows, which must have a data placement of the
        # whole dataset within the same location
        if wflow.getReqType() == "DQMHarvest":
            numNodes = 1
        # if there is no parent data, just make one big rule for all the primary data
        # against all RSEs available for the workflow (intersection with PU data, if any)
        if not wflow.getParentBlocks():
            numNodes = 1
        listBlockSets, listSetsSize = wflow.getChunkBlocks(numNodes)
        if not listBlockSets:
            self.logger.warning("  found 0 primary/parent blocks for dataset: %s, moving on...", dsetName)
            yield blockList, 0, 0
        for idx, blocksSet in enumerate(listBlockSets):
            self.logger.info("Have a chunk of %d blocks (%s GB) for dataset: %s",
                             len(blocksSet), gigaBytes(listSetsSize[idx]), dsetName)
            yield blocksSet, listSetsSize[idx], idx
    ### NOTE: data placement done on a dataset basis
    elif dataIn["type"] == "secondary":
        # secondary datasets are transferred as a whole, until better days...
        dsetSize = wflow.getSecondarySummary()
        dsetSize = dsetSize[dsetName]['dsetSize']
        # randomly pick one of the PNNs to hold the whole pileup dataset
        idx = randint(0, numNodes - 1)
        self.logger.info("Have whole PU dataset: %s (%s GB)", dsetName, gigaBytes(dsetSize))
        yield blockList, dsetSize, idx
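
# --- Illustrative sketch (not WMCore code): the consumer side of the generator above.
# It shows the expected yield contract: one (blockSet, sizeBytes, nodeIndex) tuple per
# chunk of primary/parent data, and a single tuple with an *empty* block set when a
# whole secondary (pileup) dataset has to be placed. The "rses" list and the "place"
# callback are hypothetical stand-ins for the real transfer-request machinery.
def _consume_destinations_sketch(destinations, rses, place):
    """Pair every yielded chunk with the RSE its node index points to."""
    for blockSet, sizeBytes, nodeIdx in destinations:
        if not sizeBytes:
            # nothing to transfer for this chunk (e.g. zero blocks found)
            continue
        # an empty blockSet means "place the whole dataset at this RSE"
        place(blockSet, sizeBytes, rses[nodeIdx])

# e.g.: _consume_destinations_sketch(self._decideDataDestination(wflow, dataIn, len(rses)),
#                                    rses, placeDataCallback)   # names assumed
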
def _checkPrimaryDataVolume(self, wflow, wflowPnns):
    """
    Calculate the total data volume already available in the
    restricted list of PNNs, such that we can minimize primary/
    parent data transfers
    :param wflow: a workflow object
    :param wflowPnns: set with the allowed PNNs to receive data
    :return: the PNN which contains most of the data already in
    """
    msg = "Checking primary data volume for: %s, allowed PNNs: %s"
    self.logger.info(msg, wflow.getName(), wflowPnns)

    volumeByPNN = dict()
    for pnn in wflowPnns:
        volumeByPNN.setdefault(pnn, 0)

    for methodName in ("getPrimaryBlocks", "getParentBlocks"):
        inputBlocks = getattr(wflow, methodName)()
        self.logger.info("Request %s has %d initial blocks from %s",
                         wflow.getName(), len(inputBlocks), methodName)
        for block, blockDict in viewitems(inputBlocks):
            blockLocation = self._diskPNNs(blockDict['locations'])
            commonLocation = wflowPnns & set(blockLocation)
            if not commonLocation:
                continue
            for pnn in commonLocation:
                volumeByPNN[pnn] += blockDict['blockSize']

    maxSize = 0
    finalPNN = set()
    self.logger.info("Primary/parent data volume currently available:")
    for pnn, size in viewitems(volumeByPNN):
        self.logger.info("  PNN: %s\t\tData volume: %s GB", pnn, gigaBytes(size))
        if size > maxSize:
            maxSize = size
            finalPNN = {pnn}
        elif size == maxSize:
            finalPNN.add(pnn)
    self.logger.info("The PNN that would require less data to be transferred is: %s", finalPNN)

    if len(finalPNN) > 1:
        # magically picks one site from the list. It could pick the one with highest
        # available quota, but that might overload that one site...
        # make sure it's a set object
        finalPNN = choice(list(finalPNN))
        finalPNN = {finalPNN}
        self.logger.info("Randomly picked PNN: %s as final location", finalPNN)

    return finalPNN
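
# --- Illustrative sketch (not WMCore code): the site-selection rule implemented above,
# reduced to a plain {pnn: bytes_already_on_site} dict. The PNN already holding the most
# data wins and ties are broken at random, mirroring the choice() call in
# _checkPrimaryDataVolume.
from random import choice as _choice_sketch

def _pick_largest_volume_pnn_sketch(volume_by_pnn):
    """Return a one-element set with the PNN that already hosts the most data."""
    if not volume_by_pnn:
        return set()
    max_size = max(volume_by_pnn.values())
    candidates = [pnn for pnn, size in volume_by_pnn.items() if size == max_size]
    return {_choice_sketch(candidates)}

# _pick_largest_volume_pnn_sketch({"T1_US_FNAL_Disk": 3e12, "T2_CH_CERN": 1e12})
# -> {"T1_US_FNAL_Disk"}
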
def __init__(self, dataAcct, quotaFraction, **kwargs):
    """
    Executes a basic setup, including proper logging.
    :param dataAcct: string with the Rucio account
    :param quotaFraction: floating point number representing the fraction of the quota
    :param kwargs: the supported keyword arguments are:
      minimumThreshold: integer value defining the minimum available space required
      verbose: logger verbosity
      logger: logger object
    """
    self.dataAcct = dataAcct
    self.quotaFraction = quotaFraction
    self.minimumSpace = kwargs["minimumThreshold"]
    self.logger = getMSLogger(kwargs.get("verbose"), kwargs.get("logger"))

    msg = "RSEQuotas started with parameters: dataAcct=%s, quotaFraction=%s, "
    msg += "minimumThreshold=%s GB"
    self.logger.info(msg, dataAcct, quotaFraction, gigaBytes(self.minimumSpace))

    self.nodeUsage = {}
    self.availableRSEs = set()
    self.outOfSpaceNodes = set()
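
# --- Illustrative sketch (not WMCore code): how this constructor might be driven from
# a MicroService configuration. The account name and the literal numbers below are
# made-up values, shown only to document the expected units (bytes for the minimum
# threshold, logged as GB; a 0-1 fraction for the quota).
# rseQuotas = RSEQuotas("wmcore_transferor",                 # Rucio data account (assumed name)
#                       quotaFraction=0.8,                   # use at most 80% of each RSE quota
#                       minimumThreshold=2 * (1000 ** 4),    # require at least 2 TB free, in bytes
#                       verbose=True, logger=logger)
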
def getChunkBlocks(self, numChunks=1):
    """
    Break down the input and parent blocks by a given number
    of chunks (usually the amount of sites available for data
    placement).
    :param numChunks: integer representing the number of chunks to be created
    :return: it returns two lists:
      * a list of sets, where each set corresponds to a set of blocks to be
        transferred to a single location;
      * and a list of integers, which references the total size of each chunk
        in the list above (same order).
    """
    if numChunks == 1:
        thisChunk = set()
        thisChunk.update(list(self.getPrimaryBlocks()))
        thisChunkSize = sum([blockInfo['blockSize'] for blockInfo in viewvalues(self.getPrimaryBlocks())])
        if self.getParentDataset():
            thisChunk.update(list(self.getParentBlocks()))
            thisChunkSize += sum([blockInfo['blockSize'] for blockInfo in viewvalues(self.getParentBlocks())])
        # keep the same data structure as for multiple chunks, so a list of lists
        return [thisChunk], [thisChunkSize]

    # create a list of blocks sorted by descending block size
    sortedPrimary = sorted(viewitems(self.getPrimaryBlocks()),
                           key=lambda item: item[1]['blockSize'], reverse=True)
    if len(sortedPrimary) < numChunks:
        msg = "There are fewer blocks than chunks to create. "
        msg += "Reducing numChunks from %d to %d" % (numChunks, len(sortedPrimary))
        self.logger.info(msg)
        numChunks = len(sortedPrimary)
    chunkSize = sum(item[1]['blockSize'] for item in sortedPrimary) // numChunks
    self.logger.info("Found %d blocks and the avg chunkSize is: %s GB",
                     len(sortedPrimary), gigaBytes(chunkSize))

    # list of sets with the block names
    blockChunks = []
    # list of integers with the total block sizes in each chunk (same order as above)
    sizeChunks = []
    for i in range(numChunks):
        thisChunk = set()
        thisChunkSize = 0
        idx = 0
        while True:
            self.logger.debug("Chunk: %d and idx: %s and length: %s", i, idx, len(sortedPrimary))
            if not sortedPrimary or idx >= len(sortedPrimary):
                # then all blocks have been distributed
                break
            elif not thisChunkSize:
                # then this site/chunk is empty, assign a block to it
                thisChunk.add(sortedPrimary[idx][0])
                thisChunkSize += sortedPrimary[idx][1]['blockSize']
                sortedPrimary.pop(idx)
            elif thisChunkSize + sortedPrimary[idx][1]['blockSize'] <= chunkSize:
                thisChunk.add(sortedPrimary[idx][0])
                thisChunkSize += sortedPrimary[idx][1]['blockSize']
                sortedPrimary.pop(idx)
            else:
                idx += 1
        if thisChunk:
            blockChunks.append(thisChunk)
            sizeChunks.append(thisChunkSize)

    # now take care of the leftovers... in a round-robin style...
    while sortedPrimary:
        for chunkNum in range(numChunks):
            blockChunks[chunkNum].add(sortedPrimary[0][0])
            sizeChunks[chunkNum] += sortedPrimary[0][1]['blockSize']
            sortedPrimary.pop(0)
            if not sortedPrimary:
                break
    self.logger.info("Created %d primary data chunks out of %d chunks",
                     len(blockChunks), numChunks)
    self.logger.info("  with chunk size distribution: %s", sizeChunks)

    if not self.getParentDataset():
        return blockChunks, sizeChunks

    # now add the parent blocks; given that the input blocks were evenly
    # distributed, the same is expected to happen automatically for the parents...
    childParent = self.getChildToParentBlocks()
    parentsSize = self.getParentBlocks()
    for chunkNum in range(numChunks):
        parentSet = set()
        for child in blockChunks[chunkNum]:
            parentSet.update(childParent[child])
        # now, with the final list of parents in hand, update the list
        # of blocks within the chunk and update the chunk size as well
        blockChunks[chunkNum].update(parentSet)
        for parent in parentSet:
            sizeChunks[chunkNum] += parentsSize[parent]['blockSize']
    self.logger.info("Created %d primary+parent data chunks out of %d chunks",
                     len(blockChunks), numChunks)
    self.logger.info("  with chunk size distribution: %s", sizeChunks)
    return blockChunks, sizeChunks
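
# --- Illustrative sketch (not WMCore code): the chunking strategy above reduced to a
# plain {blockName: sizeBytes} dict. Blocks are sorted by descending size, greedily
# packed up to the average chunk size, and leftovers are handed out round-robin,
# mirroring getChunkBlocks() without the parent-block bookkeeping.
def _chunk_blocks_sketch(block_sizes, num_chunks):
    """Return (list of block-name sets, list of chunk sizes in the same order)."""
    sorted_blocks = sorted(block_sizes.items(), key=lambda x: x[1], reverse=True)
    num_chunks = min(num_chunks, len(sorted_blocks)) or 1
    target = sum(size for _, size in sorted_blocks) // num_chunks
    chunks, sizes = [], []
    for _ in range(num_chunks):
        this_chunk, this_size, idx = set(), 0, 0
        while idx < len(sorted_blocks):
            name, size = sorted_blocks[idx]
            if not this_size or this_size + size <= target:
                # empty chunk always takes a block; otherwise pack up to the target size
                this_chunk.add(name)
                this_size += size
                sorted_blocks.pop(idx)
            else:
                idx += 1
        if this_chunk:
            chunks.append(this_chunk)
            sizes.append(this_size)
    # leftovers (blocks too big to fit anywhere) go round-robin over the chunks
    for i, (name, size) in enumerate(sorted_blocks):
        chunks[i % len(chunks)].add(name)
        sizes[i % len(chunks)] += size
    return chunks, sizes

# _chunk_blocks_sketch({"b1": 60, "b2": 50, "b3": 30, "b4": 10}, 2)
# -> ([{"b1", "b3", "b4"}, {"b2"}], [100, 50])
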
def checkPUDataLocation(self, wflow):
    """
    Check the workflow configuration - in terms of AAA - and the secondary
    pileup distribution; and if possible, remove the pileup dataset from the
    next step where data is placed.
    If the workflow has XRootD/AAA enabled, data location can be outside of
    the SiteWhitelist.
    :param wflow: workflow object
    """
    pileupInput = wflow.getSecondarySummary()
    if not pileupInput:
        # nothing to be done here
        return

    wflowPnns = self._getPNNsFromPSNs(wflow.getSitelist())
    secondaryAAA = wflow.getReqParam("TrustPUSitelists")
    msg = "Checking secondary data location for request: {}, ".format(wflow.getName())
    msg += "TrustPUSitelists: {}, request white/black list PNNs: {}".format(secondaryAAA, wflowPnns)
    self.logger.info(msg)

    if secondaryAAA:
        # what matters is to have the pileup dataset(s) available in ANY disk storage
        for dset, dsetDict in listitems(pileupInput):  # dict can change size here
            datasetLocation = self._diskPNNs(dsetDict['locations'])
            msg = "it has secondary: %s, total size: %s GB, disk locations: %s"
            self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
            if datasetLocation:
                self.logger.info("secondary dataset %s already in place through AAA: %s",
                                 dset, datasetLocation)
                pileupInput.pop(dset)
            else:
                self.logger.info("secondary dataset %s not available even through AAA", dset)
    else:
        if len(pileupInput) == 1:
            for dset, dsetDict in listitems(pileupInput):  # dict can change size here
                datasetLocation = self._diskPNNs(dsetDict['locations'])
                msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
                commonLocation = wflowPnns & set(datasetLocation)
                if commonLocation:
                    msg = "secondary dataset: %s already in place. "
                    msg += "Common locations with the site white/black list: %s"
                    self.logger.info(msg, dset, commonLocation)
                    pileupInput.pop(dset)
                    wflow.setPURSElist(commonLocation)
                else:
                    self.logger.info("secondary: %s will need data placement!!!", dset)
        elif len(pileupInput) >= 2:
            # then make sure multiple pileup datasets are available at the same location
            # Note: avoid transferring the largest one
            largestSize = 0
            largestDset = ""
            for dset, dsetDict in viewitems(pileupInput):
                if dsetDict['dsetSize'] > largestSize:
                    largestSize = dsetDict['dsetSize']
                    largestDset = dset
            datasetLocation = self._diskPNNs(pileupInput[largestDset]['locations'])
            msg = "it has multiple pileup datasets, the largest one is: %s, "
            msg += "total size: %s GB, current disk locations: %s"
            self.logger.info(msg, largestDset, gigaBytes(largestSize), datasetLocation)
            commonLocation = wflowPnns & set(datasetLocation)
            if commonLocation:
                self.logger.info("Largest secondary dataset %s already in place: %s",
                                 largestDset, datasetLocation)
                pileupInput.pop(largestDset)
                wflow.setPURSElist(commonLocation)
            else:
                self.logger.info("Largest secondary dataset %s not available in a common location. This is BAD!",
                                 largestDset)
            # now iterate normally through the pileup datasets
            for dset, dsetDict in listitems(pileupInput):  # dict can change size here
                datasetLocation = self._diskPNNs(dsetDict['locations'])
                msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
                commonLocation = wflowPnns & set(datasetLocation)
                if not commonLocation:
                    msg = "secondary dataset: %s not in any common location. Its current locations are: %s"
                    self.logger.info(msg, dset, datasetLocation)
                elif commonLocation and not wflow.getPURSElist():
                    # then it's the first pileup dataset available within the SiteWhitelist,
                    # force its common location for the workflow from now on
                    msg = "secondary dataset: %s already in place: %s, common location: %s"
                    msg += ". Forcing the whole workflow to this new common location."
                    self.logger.info(msg, dset, datasetLocation, commonLocation)
                    pileupInput.pop(dset)
                    wflow.setPURSElist(commonLocation)
                else:
                    # the pileup RSE list has already been defined. Get the new common location
                    newCommonLocation = commonLocation & wflow.getPURSElist()
                    if newCommonLocation:
                        msg = "secondary dataset: %s already in place. "
                        msg += "New common locations with the site white/black list: %s"
                        self.logger.info(msg, dset, newCommonLocation)
                        pileupInput.pop(dset)
                        wflow.setPURSElist(newCommonLocation)
                    else:
                        msg = "secondary dataset: %s is currently available within the site white/black list: %s,"
                        msg += " but there is no common location with the other pileup dataset(s): %s."
                        msg += " It will need data placement!!!"
                        self.logger.info(msg, dset, commonLocation, wflow.getPURSElist())

    # check whether there are remaining pileups to be placed;
    # we need to figure out their location NOW!
    if wflow.getSecondarySummary() and not wflow.getPURSElist():
        pnns = self._findFinalPULocation(wflow)
        wflow.setPURSElist(pnns)
def makeTapeSubscriptions(self, workflow):
    """
    Makes the output data placement to the Tape endpoints. It works either
    with PhEDEx or with Rucio, configurable. It also relies on the Unified
    configuration to decide whether a given datatier can go to tape, and
    where it can be auto-approved.
    :param workflow: a MSOutputTemplate object representing a workflow
    :return: the MSOutputTemplate object itself (with the necessary updates in place)
    """
    # if anything fails along the way, set it back to "pending"
    transferStatus = "done"

    # this RSE name will be used for all output datasets to be subscribed
    # within this workflow
    dataBytesForTape = self._getDataVolumeForTape(workflow)
    tapeRSE, requiresApproval = self._getTapeDestination(dataBytesForTape)
    self.logger.info("Workflow: %s, total output size: %s GB, against RSE: %s",
                     workflow['RequestName'], gigaBytes(dataBytesForTape), tapeRSE)

    for dMap in workflow['OutputMap']:
        if not self.canDatasetGoToTape(dMap, workflow):
            continue

        dMap['TapeDestination'] = tapeRSE
        ruleAttrs = {'activity': 'Production Output',
                     'account': self.msConfig['rucioAccount'],
                     'copies': 1,
                     'grouping': "ALL",
                     'ask_approval': requiresApproval,
                     'comment': 'WMCore MSOutput output data placement'}
        msg = "Creating Rucio TAPE rule for container: {} and RSE: {}".format(dMap['Dataset'],
                                                                              dMap['TapeDestination'])
        self.logger.info(msg)
        if self.msConfig['enableDataPlacement']:
            resp = self.rucio.createReplicationRule(dMap['Dataset'], dMap['TapeDestination'], **ruleAttrs)
            if not resp:
                # then the call failed
                transferStatus = "pending"
            elif len(resp) == 1:
                dMap['TapeRuleID'] = resp[0]
            elif len(resp) > 1:
                msg = "Tape rule creation returned multiple rule IDs and it needs to be investigated!!! "
                msg += "For DID: {}, rseExpr: {} and rucio account: {}".format(dMap['Dataset'],
                                                                               dMap['TapeDestination'],
                                                                               ruleAttrs['account'])
                self.logger.critical(msg)
                return workflow
        else:
            msg = "DRY-RUN RUCIO: skipping tape rule creation for DID: {}, ".format(dMap['Dataset'])
            msg += "rseExpr: {} and standard parameters: {}".format(dMap['TapeDestination'], ruleAttrs)
            self.logger.info(msg)

    # Finally, update the MSOutput template document with either partial or
    # complete transfer ids
    self.docKeyUpdate(workflow, OutputMap=workflow['OutputMap'])
    workflow.updateTime()
    # NOTE: updating the TransferStatus at this stage is a bit trickier, we
    # cannot bypass bad disk data placements!
    if transferStatus == "done" and workflow['TransferStatus'] == "done":
        self.logger.info("All the tape requests succeeded for: %s. Marking it as 'done'",
                         workflow['RequestName'])
    elif transferStatus == "done" and workflow['TransferStatus'] == "pending":
        self.logger.info("All the tape requests succeeded for: %s, but disk ones are still pending",
                         workflow['RequestName'])
    elif transferStatus == "pending" and workflow['TransferStatus'] == "done":
        self.logger.info("Tape requests partially successful for: %s. Marking it as 'pending'",
                         workflow['RequestName'])
        self.docKeyUpdate(workflow, TransferStatus='pending')
    else:
        self.logger.info("Tape requests partially successful for: %s. Keeping it as 'pending'",
                         workflow['RequestName'])

    return workflow
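
# --- Illustrative sketch (not WMCore code): the status-merging rule applied at the end
# of makeTapeSubscriptions(): a workflow document only ends up (or stays) "done" when
# both the disk and the tape placements succeeded; any failure keeps it "pending".
def _merge_transfer_status_sketch(tape_status, disk_status):
    """Return the overall TransferStatus for a workflow document."""
    return "done" if tape_status == "done" and disk_status == "done" else "pending"

# _merge_transfer_status_sketch("done", "pending")  -> "pending"
# _merge_transfer_status_sketch("done", "done")     -> "done"
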