Example 1
    def _queryAndCompareWithDBS(self, pileupDict, defaultArguments, dbsUrl):
        """
        pileupDict is a Python dictionary containing a particular pileup
        configuration. Query DBS for each dataset contained in the input
        defaultArguments (and thus also in pileupDict) and compare the
        values.

        """
        reader = DBS3Reader(dbsUrl)
        phedex = PhEDEx()

        inputArgs = defaultArguments["PileupConfig"]

        self.assertEqual(len(inputArgs), len(pileupDict),
                         "Number of pileup types different.")
        for pileupType in inputArgs:
            m = ("pileup type '%s' not in PileupFetcher-produced pileup "
                 "configuration: '%s'" % (pileupType, pileupDict))
            self.assertTrue(pileupType in pileupDict, m)

        # now query DBS to compare the actual file lists and locations
        # (PhEDEx node names) for each pileup type and dataset.
        # pileupDict was saved to a file; its items are compared here with
        # live DBS results. The structure of pileupDict:
        #    {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
        #                     "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ....}
        for pileupType, datasets in inputArgs.items():
            # this is from the pileup configuration produced by PileupFetcher
            blockDict = pileupDict[pileupType]

            for dataset in datasets:
                dbsFileBlocks = reader.listFileBlocks(dataset=dataset)
                blocksLocation = phedex.getReplicaPhEDExNodesForBlocks(
                    dataset=dataset, complete='y')
                for dbsFileBlockName in dbsFileBlocks:
                    fileList = []
                    pnns = set()
                    for pnn in blocksLocation[dbsFileBlockName]:
                        pnns.add(pnn)
                    # now get list of files in the block
                    dbsFiles = reader.listFilesInBlock(dbsFileBlockName)
                    for dbsFile in dbsFiles:
                        fileList.append(dbsFile["LogicalFileName"])
                    # now compare the sets:
                    m = ("PNNs don't agree for pileup type '%s', "
                         "dataset '%s' in configuration: '%s'" %
                         (pileupType, dataset, pileupDict))
                    self.assertEqual(
                        set(blockDict[dbsFileBlockName]["PhEDExNodeNames"]),
                        pnns, m)
                    m = (
                        "FileList don't agree for pileup type '%s', dataset '%s' "
                        " in configuration: '%s'" %
                        (pileupType, dataset, pileupDict))
                    storedFileList = [
                        item['logical_file_name']
                        for item in blockDict[dbsFileBlockName]["FileList"]
                    ]
                    self.assertItemsEqual(storedFileList, fileList, m)
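
For orientation, a minimal sketch (all names below are made up) of the two structures this helper compares:

    defaultArguments = {
        "PileupConfig": {"mc": ["/HighPileUp/Run2011A-v1/RAW"]}
    }
    pileupDict = {
        "mc": {
            "/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace": {
                "FileList": [{"logical_file_name": "/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/FILE.root"}],
                "PhEDExNodeNames": ["T1_US_FNAL_Disk"]
            }
        }
    }
    # _queryAndCompareWithDBS(pileupDict, defaultArguments, dbsUrl) then checks that
    # FileList and PhEDExNodeNames agree with what DBS and PhEDEx report live.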
Example 2
    def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader):
        """
        Iterate over the components of the pileup configuration input,
        query DBS with dbsReader, and then iterate over the DBS results.

        For each dataset name a list of files and their locations is
        needed. The result is a Python dict of the following form
        (FileList is a list of LFNs):

        {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
                         "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ....}

        This structure preserves knowledge of where the files of a dataset
        are physically located (list of PNNs). DBS only lists sites which
        have all files belonging to a block, but e.g. BlockA of dataset DS1
        may be located at site1 and BlockB only at site2 - it's possible
        that only a subset of the blocks in a dataset will be at a site.

        """
        # only production PhEDEx is connected (this could be moved to the init method)
        phedex = PhEDEx()
        node_filter = set(['UNKNOWN', None])

        resultDict = {}
        # iterate over input pileup types (e.g. "cosmics", "minbias")
        for pileupType in stepHelper.data.pileup.listSections_():
            # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set]
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType),
                               "dataset")
            # each dataset input can generally be a list, iterate over dataset names
            blockDict = {}
            for dataset in datasets:

                blockFileInfo = dbsReader.getFileListByDataset(dataset=dataset,
                                                               detail=True)

                for fileInfo in blockFileInfo:
                    blockDict.setdefault(fileInfo['block_name'], {
                        'FileList': [],
                        'NumberOfEvents': 0,
                        'PhEDExNodeNames': []
                    })
                    blockDict[fileInfo['block_name']]['FileList'].append(
                        {'logical_file_name': fileInfo['logical_file_name']})
                    blockDict[fileInfo['block_name']][
                        'NumberOfEvents'] += fileInfo['event_count']

                blockReplicasInfo = phedex.getReplicaPhEDExNodesForBlocks(
                    dataset=dataset, complete='y')
                for block in blockReplicasInfo:
                    nodes = set(blockReplicasInfo[block]) - node_filter
                    blockDict[block]['PhEDExNodeNames'] = list(nodes)
                    blockDict[block]['FileList'] = sorted(
                        blockDict[block]['FileList'])

            resultDict[pileupType] = blockDict
        return resultDict
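
A concrete illustration of the resultDict returned above (all names and counts are made up), matching the structure in the docstring plus the NumberOfEvents counter accumulated in the loop:

    resultDict = {
        "mc": {
            "/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace": {
                "FileList": [{"logical_file_name": "/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/FILE.root"}],
                "NumberOfEvents": 545,
                "PhEDExNodeNames": ["T1_US_FNAL_Disk"]
            }
        }
    }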
Example 3
    def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader, fakeSites):
        """
        Iterate over the components of the pileup configuration input,
        query DBS with dbsReader, and then iterate over the DBS results.

        For each dataset name a list of files and their locations is
        needed. The result is a Python dict of the following form
        (FileList is a list of LFNs):

        {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
                         "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ....}

        This structure preserves knowledge of where the files of a dataset
        are physically located (list of PNNs). DBS only lists sites which
        have all files belonging to a block, but e.g. BlockA of dataset DS1
        may be located at site1 and BlockB only at site2 - it's possible
        that only a subset of the blocks in a dataset will be at a site.

        """
        # only production PhEDEx is connected (this could be moved to the init method)
        phedex = PhEDEx()
        node_filter = set(['UNKNOWN', None])
        # map the siteWhitelist to a PNN list and add those PNNs to the pileup location list
        fakePNNs = []
        if fakeSites:
            fakePNNs = mapSitetoPNN(fakeSites)

        resultDict = {}
        # iterate over input pileup types (e.g. "cosmics", "minbias")
        for pileupType in stepHelper.data.pileup.listSections_():
            # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set]
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset")
            # each dataset input can generally be a list, iterate over dataset names
            blockDict = {}
            for dataset in datasets:

                blockFileInfo = dbsReader.getFileListByDataset(dataset=dataset, detail=True)

                for fileInfo in blockFileInfo:
                    blockDict.setdefault(fileInfo['block_name'], {'FileList': [],
                                                                  'NumberOfEvents': 0,
                                                                  'PhEDExNodeNames': []})
                    blockDict[fileInfo['block_name']]['FileList'].append(
                        {'logical_file_name': fileInfo['logical_file_name']})
                    blockDict[fileInfo['block_name']]['NumberOfEvents'] += fileInfo['event_count']

                blockReplicasInfo = phedex.getReplicaPhEDExNodesForBlocks(dataset=dataset, complete='y')
                for block in blockReplicasInfo:
                    nodes = set(blockReplicasInfo[block]) - node_filter | set(fakePNNs)
                    blockDict[block]['PhEDExNodeNames'] = list(nodes)
                    blockDict[block]['FileList'] = sorted(blockDict[block]['FileList'])

            resultDict[pileupType] = blockDict
        return resultDict
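
The only change from the previous variant is that PNNs mapped from the site whitelist are unioned into every block's location list. A small self-contained sketch of that set arithmetic, with made-up node names:

    node_filter = set(['UNKNOWN', None])
    fakePNNs = ['T2_CH_CERN']                  # e.g. mapped from the site whitelist
    replicas = ['T1_US_FNAL_Disk', 'UNKNOWN']  # as returned by PhEDEx for a block
    nodes = set(replicas) - node_filter | set(fakePNNs)
    print(sorted(nodes))  # ['T1_US_FNAL_Disk', 'T2_CH_CERN']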
Example 4
    def _queryAndCompareWithDBS(self, pileupDict, defaultArguments, dbsUrl):
        """
        pileupDict is a Python dictionary containing a particular pileup
        configuration. Query DBS for each dataset contained in the input
        defaultArguments (and thus also in pileupDict) and compare the
        values.

        """
        reader = DBS3Reader(dbsUrl)
        phedex = PhEDEx()

        inputArgs = defaultArguments["PileupConfig"]

        self.assertEqual(len(inputArgs), len(pileupDict),
                         "Number of pileup types different.")
        for pileupType in inputArgs:
            m = ("pileup type '%s' not in PileupFetcher-produced pileup "
                 "configuration: '%s'" % (pileupType, pileupDict))
            self.assertTrue(pileupType in pileupDict, m)

        # now query DBS to compare the actual file lists and locations
        # (PhEDEx node names) for each pileup type and dataset.
        # pileupDict was saved to a file; its items are compared here with
        # live DBS results. The structure of pileupDict:
        #    {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
        #                     "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ....}
        for pileupType, datasets in inputArgs.items():
            # this is from the pileup configuration produced by PileupFetcher
            blockDict = pileupDict[pileupType]

            for dataset in datasets:
                dbsFileBlocks = reader.listFileBlocks(dataset=dataset)
                blocksLocation = phedex.getReplicaPhEDExNodesForBlocks(dataset=dataset, complete='y')
                for dbsFileBlockName in dbsFileBlocks:
                    fileList = []
                    pnns = set()
                    for pnn in blocksLocation[dbsFileBlockName]:
                        pnns.add(pnn)
                    # now get list of files in the block
                    dbsFiles = reader.listFilesInBlock(dbsFileBlockName)
                    for dbsFile in dbsFiles:
                        fileList.append(dbsFile["LogicalFileName"])
                    # now compare the sets:
                    m = ("PNNs don't agree for pileup type '%s', "
                         "dataset '%s' in configuration: '%s'" % (pileupType, dataset, pileupDict))
                    self.assertEqual(set(blockDict[dbsFileBlockName]["PhEDExNodeNames"]), pnns, m)
                    m = ("FileList don't agree for pileup type '%s', dataset '%s' "
                         " in configuration: '%s'" % (pileupType, dataset, pileupDict))
                    storedFileList = [item['logical_file_name'] for item in blockDict[dbsFileBlockName]["FileList"]]
                    self.assertItemsEqual(storedFileList, fileList, m)
Example 5
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        Get lumi information either for a block or for a list of LFNs.
        LFNs are queried in chunks of 50 until the DBS API can handle
        larger requests.
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict
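
    # For orientation, a hypothetical sketch of the per-LFN structure built above
    # (the LFN and values are made up; they reuse the run/lumi numbers shown in
    # the listDatasetFileDetails docstring below):
    #
    #   {"/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/FILE.root": [
    #       {"RunNumber": 173658,
    #        "LumiSectionNumber": [8, 12, 9, 14, 19, 109, 105],
    #        "EventCount": 545}]}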

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        Return a list of primary datasets. The full primary dataset name must
        be provided; pattern-based matching is no longer supported.
        If the default '*' is used, all primary datasets are returned.
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary,
                                             data_tier_name=tier,
                                             detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send the result in runDict format; this format is kept for sync with the
        # dbs2 call which returned {run_number: num_lumis}, but the dbs3 call
        # doesn't return the number of lumis, so it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary,
                                           data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [
            x['logical_file_name']
            for x in self.dbs.listFileArray(dataset=datasetPath)
        ]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers[
                'ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [
            tier['data_tier_name'] for tier in dbs.listDataTiers()
        ]

        return
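
    # A hypothetical usage sketch of the class-level datatier cache (the DBS URL
    # below is an assumption, not defined in this module):
    #
    #   tiers = DBS3Reader.listDatatiers("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    #
    # A repeated call within 7200 seconds is served from DBS3Reader._datatiers
    # without contacting DBS again.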

    def listDatasetFileDetails(self,
                               datasetPath,
                               getParents=False,
                               validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone or removed - getting the
        whole dataset at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath,
                                                validFileOnly=validFileOnly,
                                                detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(
                            p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old API did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block,
                                                     validFileOnly=1)
            else:  # dataset case dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset,
                                                     validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset,
                                                                      block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get(
                'file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self,
                          dataset,
                          onlyClosedBlocks=False,
                          blockName=None,
                          locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [
            remapDBS3Keys(block, stringify=True, block_name='Name')
            for block in blocks
        ]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{
                    'Name': x
                } for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [
                x['block_name'] for x in blocks
                if str(x['open_for_writing']) != "1"
            ]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [
            x['block_name'] for x in blocks
            if str(x['open_for_writing']) == "1"
        ]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        We need to clean up the code when DBS2 is completely deprecated;
        querying lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName,
                                           validFileOnly=validFileOnly,
                                           detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName,
                                         validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of each file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid files for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS API can handle large lists.
        # Remove the slicing once the DBS API handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName,
                                          validFileOnly=1,
                                          detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get the location(s) of the given block(s): PhEDEx node names by
        default, or the DBS origin_site_name when dbsOnly=True

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(
                            block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations
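
    # Hypothetical illustration of the two return shapes (block names are made up):
    # a single block name returns a flat list of PNNs, a list of names returns a
    # dict keyed by block:
    #
    #   listFileBlockLocation("/A/B/RAW#111")  ->  ["T1_US_FNAL_Disk"]
    #   listFileBlockLocation(["/A/B/RAW#111", "/A/B/RAW#222"])
    #       ->  {"/A/B/RAW#111": ["T1_US_FNAL_Disk"], "/A/B/RAW#222": []}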

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : [<list of file dictionaries>],
             "IsOpen" : <bool>,
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames":
                self.listFileBlockLocation(fileBlockName, dbsOnly),
                "Files":
                self.listFilesInBlock(fileBlockName),
                "IsOpen":
                self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
                "Files": self.listFilesInBlockWithParents(fileBlockName),
                "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary keyed by block name for the dataset, where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the list of files in that block

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(
                toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not or if the
        block doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the locations (PhEDEx node names, or DBS origin sites when
        dbsOnly=True) hosting at least one block of the given dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.add(blockInfo['origin_site_name'])

            locations.difference_update(
                ['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" %
                                 pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName,
                                               dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" %
                                         (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events
        (among other, less important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset,
                                              validFileOnly=validFileOnly,
                                              detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
Example 6
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """

    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json", dbsUrl=self.dbsURL)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        Get lumi information either for a block or for a list of LFNs.
        LFNs are queried in chunks of 50 until the DBS API can handle
        larger requests.
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        Return a list of primary datasets. The full primary dataset name must
        be provided; pattern-based matching is no longer supported.
        If the default '*' is used, all primary datasets are returned.
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send the result in runDict format; this format is kept for sync with the
        # dbs2 call which returned {run_number: num_lumis}, but the dbs3 call
        # doesn't return the number of lumis, so it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    def listDatatiers(self):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        return [tier['data_tier_name'] for tier in self.dbs.listDataTiers()]

    def listDatasetFileDetails(self, datasetPath, getParents=False, getLumis=True, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone or removed - getting the
        whole dataset at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result
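
    # Note: unlike the earlier DBS3Reader variant above, this method returns an
    # empty dict instead of raising when no summary is found. A hypothetical
    # caller-side guard (the dataset name is made up):
    #
    #   summary = reader.getDBSSummaryInfo(dataset="/HighPileUp/Run2011A-v1/RAW")
    #   if not summary:
    #       print("dataset missing or all files invalid in DBS")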

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.
        We need to clean the code up once dbs2 is completely deprecated;
        querying lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of each file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS API can handle large lists.
        # Remove the slicing once the DBS API does.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails
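
    # Sketch of the parentage mapping built above (hypothetical LFNs): listFileParents()
    # returns rows like
    #   {'logical_file_name': '/store/.../child.root',
    #    'parent_logical_file_name': ['/store/.../parentA.root', '/store/.../parentB.root']}
    # which are inverted into childByParents (parent LFN -> [child LFNs]); the parent files
    # are then fetched in chunks of 50 and re-attached per child as fileInfo['ParentList'].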

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations
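
    # Hedged usage sketch (block names are hypothetical): passing a single block name
    # returns a flat list of node names (PhEDEx nodes, or DBS origin sites with
    # dbsOnly=True); passing a list returns a dict keyed by block name.
    #
    #   pnns = reader.listFileBlockLocation("/Prim/Proc/TIER#1234-abcd")
    #   perBlock = reader.listFileBlockLocation(["/Prim/Proc/TIER#1234-abcd",
    #                                            "/Prim/Proc/TIER#5678-efgh"])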

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        Returns parent files for the given child LFNs when DBS doesn't have a direct
        parent/child relationship in the DB.
        Only use this for finding missing parents.

        :param parentDataset: parent dataset of the child LFNs
        :param childLFNs: list of files in the child dataset
        :return: dict mapping each child LFN to a set of parent files
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
            pFileList = self.dbs.listFiles(dataset=parentDataset, run_num=f['run_num'], lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[f['logical_file_name']].union(pFiles)
        return parentFiles
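
    # Hedged usage sketch (dataset and LFN are hypothetical): for each child LFN the
    # run/lumi pairs are read via listFileLumiArray(), then the parent dataset is queried
    # for files covering the same run/lumis, so the result maps child LFN -> set of
    # candidate parent LFNs even when DBS has no recorded parentage.
    #
    #   parents = reader.getParentFilesGivenParentDataset(
    #       "/Prim/Proc-parent-v1/AOD", ["/store/.../child.root"])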

    def getParentFilesByLumi(self, childLFN):
        """
        Get the parent files' lfns by lumi (this might not be the actual parentage
        relation recorded in DBS, just parentage by lumis).
        Use only for specific lfns, for validation purposes; for the parentage fix use
        findAndInsertMissingParentage.
        :param childLFN: lfn of the child file
        :return: list of dictionaries with the parent files for the given child LFN and
                 each parent dataset, e.g.
                 [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": [alf, baf, ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(parent['parent_dataset'], childLFN)
                result.append({"ParentDataset": parent['parent_dataset'], "ParentFiles": list(parentFiles)})
        return result

    def listParentsByLumi(self, childBlockName, childLFNs=None):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child lfns; if not specified, all the files in the block will be used,
               if specified, dbs validates the child lfns against the childBlockName
        :return: list of child/parent id pairs, i.e. [[1, 2], [3, 4], ...]
        """
        childLFNs = childLFNs or []
        return self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of child/parent file id pairs, i.e. [[1, 2], [3, 4], ...]
                dbs validates the child ids against the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})

    def findAndInsertMissingParentage(self, childBlockName, childLFNs=None, insertFlag=True):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child lfns; if not specified, all the files in the block will be used,
               if specified, dbs validates the child lfns against the childBlockName
        :return: number of file parent pairs inserted
        """
        childLFNs = childLFNs or []
        fileParents = self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)
        childParentsIDPairs = fileParents[0]["child_parent_id_list"]

        if insertFlag:
            self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})
        return len(childParentsIDPairs)
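
    # Hedged usage sketch (block name is hypothetical): with insertFlag=False the call is
    # effectively a dry run, returning how many child/parent id pairs would be inserted
    # without writing anything to DBS.
    #
    #   nPairs = reader.findAndInsertMissingParentage(
    #       "/Prim/Proc/TIER#1234-abcd", insertFlag=False)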

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: child dataset to scan for missing block parentage
        :return: set of child blocks with no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName: name of the child block to check
        :return: list of LFNs in the block that have no parent files
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be fixed
        :return: blocks for which the parentage insert failed, to be retried
        """
        pDatasets = self.listDatasetParents(childDataset)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s", childDataset)
            return {}

        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        for blockName in blocks:
            try:
                numFiles = self.findAndInsertMissingParentage(blockName, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s" % (numFiles, blockName))
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s", blockName)
                failedBlocks.append(blockName)

        return failedBlocks

    def insertMissingParentageForAllFiles(self, childDataset, filterFilesWithParents=True, insertFlag=False):
        """
        :param childDataset: child dataset whose parentage needs to be fixed
        :param filterFilesWithParents: if True, only select files without parents; if False, all the files in the dataset
        :param insertFlag: if True, insert into DBS; if False, just get the list of file parentage without inserting
        :return: blocks for which the parentage insert failed; should be used for retrying
        """
        blocks = [b['block_name'] for b in self.dbs.listBlocks(dataset=childDataset)]
        failedBlocks = []
        print("Handling %d blocks" % len(blocks))
        totalFiles = 0
        for blockName in blocks:
            try:
                if filterFilesWithParents:
                    childLFNs = self.listFilesWithNoParents(blockName)
                    if len(childLFNs) == 0:
                        continue
                else:
                    childLFNs = []

                numFiles = self.findAndInsertMissingParentage(blockName, childLFNs=childLFNs, insertFlag=insertFlag)
                print("%s file parentage added for block %s" % (numFiles, blockName))
                totalFiles += numFiles
            except Exception as e:
                print(traceback.format_exc())
                failedBlocks.append(blockName)
        print("Total pairs: ", totalFiles)
        return failedBlocks
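
    # End-to-end sketch for the two repair helpers above (dataset taken from the comment
    # in fixMissingParentageDatasets): fixMissingParentageDatasets() only touches blocks
    # with no parent block at all, while insertMissingParentageForAllFiles() walks every
    # block and, with filterFilesWithParents=True, restricts the fix to files that
    # currently lack parents. Using insertFlag=False first is a safe dry run.
    #
    #   failed = reader.fixMissingParentageDatasets(
    #       "/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD", insertFlag=False)
    #   # inspect `failed` (blocks to retry) before re-running with insertFlag=True
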
Example n. 7
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict
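
    # Shape sketch of the dict returned above (LFN hypothetical): each file maps to one
    # entry per listFileLumis/listFileLumiArray row, e.g.
    #   lumiDict['/store/.../file.root'] == [
    #       {'RunNumber': 173658, 'LumiSectionNumber': [8, 12, 9, 14]}, ...]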

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full dataset name must be provided;
        pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send runDict format as result; this format is kept for sync with the dbs2 call,
        # which returned {run_number: num_lumis}, but the dbs3 call doesn't return the
        # number of lumis, so it returns {run_number: None}
        # TODO: after DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return
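
    # Caching sketch: listDatatiers() keeps the tier list in the DBS3Reader._datatiers
    # class attribute together with a 'ts' timestamp, only re-querying DBS when the cache
    # is older than 7200 seconds; if the refresh fails, a previously cached list is served.
    # The URL below is just an example DBS instance.
    #
    #   tiers = DBS3Reader.listDatatiers(dbsUrl="https://cmsweb.cern.ch/dbs/prod/global/DBSReader")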

    def listDatasetFileDetails(self, datasetPath, getParents=False, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or removed - getting the
        whole dataset at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get the dataset summary, including # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old api did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:  # dataset case; dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get('file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.
        We need to clean the code up once dbs2 is completely deprecated;
        querying lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of each file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        
        parentsLFNs = childByParents.keys()
        
        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS API can handle large lists.
        # Remove the slicing once the DBS API does.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieve all blocks, lfns and the number of events (among
        other, less important info).
        Returns a list of dicts.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
Example n. 8
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json", dbsUrl=self.dbsURL)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        Return a list of primary datasets. The full primary dataset name must be
        provided; pattern-based matching is no longer supported.
        If no name is provided, all primary datasets are returned.
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purposes
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRun,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRun,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # return the runDict format; it is kept in sync with the dbs2 call,
        # which returned {run_number: num_lumis}, but the dbs3 call does not
        # return the number of lumis, so this returns {run_number: None}
        # TODO: once DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return
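
    # Minimal usage sketch for the datatier cache above; the DBS URL is shown only
    # for illustration. Repeated calls within two hours are served from the
    # class-level DBS3Reader._datatiers cache instead of querying DBS again.
    #
    #   dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    #   tiers = DBS3Reader.listDatatiers(dbsUrl)   # first call queries DBS
    #   tiers = DBS3Reader.listDatatiers(dbsUrl)   # second call hits the cache
    #   if "MINIAODSIM" in tiers:
    #       pass  # the data tier is known to DBS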

    def listDatasetFileDetails(self, datasetPath, getParents=False, getLumis=True, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone or removed - fetching
        the whole dataset at once might be too costly.

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files
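
    # Hedged usage sketch for listDatasetFileDetails(); the dataset name is taken
    # from the docstring example above and the printed keys follow the structure
    # documented there.
    #
    #   reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    #   details = reader.listDatasetFileDetails("/HighPileUp/Run2011A-v1/RAW",
    #                                           getParents=True)
    #   for lfn, info in details.items():
    #       nLumis = sum(len(lumis) for lumis in info['Lumis'].values())
    #       print(lfn, info['NumberOfEvents'], nLumis, info['Parents'])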

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        Same as crossCheck, but the return value is a list of files that
        are *not* known by DBS.

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)
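
    # Sketch of the set arithmetic behind crossCheck/crossCheckMissing, with made-up
    # LFNs: crossCheck keeps the intersection with what DBS knows, crossCheckMissing
    # keeps the LFNs DBS does not know about.
    #
    #   dbsLfns   = {"/store/a.root", "/store/b.root"}    # valid files in DBS
    #   inputLfns = {"/store/b.root", "/store/c.root"}    # files passed in by the caller
    #   known     = dbsLfns & inputLfns                   # -> {"/store/b.root"}
    #   missing   = inputLfns - known                     # -> {"/store/c.root"}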

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get the dataset summary, including the number of files, events, blocks and the total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:
            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis could be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] in order to get the run number,
        so for now it is always True.
        The code needs to be cleaned up once dbs2 is completely deprecated;
        querying lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock, including
        the parents of each file.
        TODO: lumis could be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] in order to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only the valid block for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS API handles that.
        # Remove the slicing once the DBS API does.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get the location of a block: PhEDEx node names by default, or the DBS
        origin_site_name when dbsOnly=True

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations
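
    # Usage sketch for listFileBlockLocation(); the block name and node names are
    # hypothetical. With dbsOnly=False the locations come from PhEDEx replicas,
    # with dbsOnly=True only the DBS origin site is returned. A single block name
    # yields a plain list, a list of block names yields a dict keyed by block.
    #
    #   block = "/Prim/Proc-v1/TIER#abcd-1234"
    #   pnns = reader.listFileBlockLocation(block)               # e.g. ['T1_US_FNAL_Disk']
    #   perBlock = reader.listFileBlockLocation([block], dbsOnly=True)
    #   # -> {"/Prim/Proc-v1/TIER#abcd-1234": ['T1_US_FNAL_Buffer']}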

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result
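
    # Illustrative shape of the getFileBlock() result for a hypothetical block;
    # the per-file dictionaries carry the remapped DBS keys (e.g. LogicalFileName,
    # NumberOfEvents) produced by listFilesInBlock().
    #
    #   {"/Prim/Proc-v1/TIER#abcd-1234": {
    #        "PhEDExNodeNames": ["T2_CH_CERN"],
    #        "Files": [{"LogicalFileName": "/store/...", "NumberOfEvents": 545, ...}],
    #        "IsOpen": False}}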

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        files

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset, where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the locations (DBS origin sites or PhEDEx nodes) hosting at least
        one block of the given dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieves all of its blocks, LFNs and number of events
        (among other ancillary information).
        Returns a list of dicts.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        List the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        Returns the parent files for the given child LFNs when DBS doesn't have a
        direct parent-child relationship in the DB.
        Only use this for finding missing parents.

        :param parentDataset: parent dataset of the child LFNs
        :param childLFNs: a list of files in the child dataset
        :return: dict mapping each child LFN to the set of its parent files
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
            pFileList = self.dbs.listFiles(dataset=parentDataset, run_num=f['run_num'], lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[f['logical_file_name']].union(pFiles)
        return parentFiles
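
    # Hedged sketch of the lumi-based parent lookup above; the parent dataset name
    # comes from the comment further below and the child LFN is hypothetical. The
    # result maps each child LFN to the set of parent LFNs sharing its (run, lumi)s.
    #
    #   parents = reader.getParentFilesGivenParentDataset(
    #       "/SingleMuon/Run2016D-23Sep2016-v1/AOD",
    #       ["/store/data/Run2016D/SingleMuon/MINIAOD/child.root"])
    #   for childLFN, parentLFNs in parents.items():
    #       print(childLFN, "->", sorted(parentLFNs))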

    def getParentFilesByLumi(self, childLFN):
        """
        Get the parent files' LFNs by lumi (this might not be the actual parentage
        relation stored in DBS, just parentage by lumis).
        Use this only for specific LFNs for validation purposes; for the parentage fix,
        use findAndInsertMissingParentage.
        :param childLFN: LFN of the child file
        :return: list of dictionaries with the parent files for the given child LFN and parent dataset, e.g.
        [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": ["alf", "baf", ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(parent['parent_dataset'], childLFN)
                result.append({"ParentDataset": parent['parent_dataset'], "ParentFiles": list(parentFiles)})
        return result

    def listParentsByLumi(self, childBlockName, childLFNs=None):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child LFNs; if not specified, all the files in the block are used.
               If specified, DBS validates the child LFNs against the childBlockName.
        :return: list of lists with child and parent id pairs, e.g. [[1, 2], [3, 4], ...]
        """
        childLFNs = childLFNs or []
        return self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of lists of child and parent file ids, i.e. [[1, 2], [3, 4], ...]
                DBS validates the child ids against the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})

    def findAndInsertMissingParentage(self, childBlockName, childLFNs=None, insertFlag=True):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child LFNs; if not specified, all the files in the block are used.
               If specified, DBS validates the child LFNs against the childBlockName.
        :return: number of file parent pairs found (and inserted, if insertFlag is True)
        """
        childLFNs = childLFNs or []
        fileParents = self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)
        childParentsIDPairs = fileParents[0]["child_parent_id_list"]

        if insertFlag:
            self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})
        return len(childParentsIDPairs)
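
    # Dry-run sketch: with insertFlag=False the method above only reports how many
    # child/parent file-id pairs DBS resolved by lumi for a (hypothetical) block,
    # without writing anything back to DBS.
    #
    #   nPairs = reader.findAndInsertMissingParentage(
    #       "/Child/Dataset-v1/MINIAOD#abcd-1234", insertFlag=False)
    #   print("would insert %d file parentage pairs" % nPairs)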

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: name of the child dataset
        :return: set of child blocks that have no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName: name of the child block
        :return: list of LFNs in the block that have no parent files
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be fixed.
        :return: list of blocks for which the parentage insert failed, to be used for retries
        """
        pDatasets = self.listDatasetParents(childDataset)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s", childDataset)
            return {}

        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        for blockName in blocks:
            try:
                numFiles = self.findAndInsertMissingParentage(blockName, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s" % (numFiles, blockName))
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s", blockName)
                failedBlocks.append(blockName)

        return failedBlocks

    def insertMissingParentageForAllFiles(self, childDataset, filterFilesWithParents=True, insertFlag=False):
        """
        :param childDataset: child dataset whose parentage needs to be fixed.
        :param filterFilesWithParents: if True, only select files without parents; if False, all the files in the dataset
        :param insertFlag: if True, insert into DBS; if False, just get the list of file parentage without inserting
        :return: list of blocks for which the parentage insert failed; should be used for retrying
        """
        blocks = [b['block_name'] for b in self.dbs.listBlocks(dataset=childDataset)]
        failedBlocks = []
        print("Handling %d blocks" % len(blocks))
        totalFiles = 0
        for blockName in blocks:
            try:
                if filterFilesWithParents:
                    childLFNs = self.listFilesWithNoParents(blockName)
                    if len(childLFNs) == 0:
                        continue
                else:
                    childLFNs = []

                numFiles = self.findAndInsertMissingParentage(blockName, childLFNs=childLFNs, insertFlag=insertFlag)
                print("%s file parentage added for block %s" % (numFiles, blockName))
                totalFiles += numFiles
            except Exception as e:
                print(traceback.format_exc())
                failedBlocks.append(blockName)
        print("Total pairs: ", totalFiles)
        return failedBlocks
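

# Hedged end-to-end sketch (not part of the original module) showing how the
# parentage-fix helpers above are typically chained. The DBS URL points to the
# production reader instance, the default dataset name is only an example taken
# from the comment in fixMissingParentageDatasets(), and insertFlag=False keeps
# the whole thing a read-only dry run.
def fixParentageDryRun(childDataset="/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD"):
    reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    # blocks of the child dataset that still lack a parent block in DBS
    orphanBlocks = reader.listBlocksWithNoParents(childDataset)
    print("%d blocks without parents" % len(orphanBlocks))
    # report (but do not insert) the lumi-based parentage for the whole dataset
    failedBlocks = reader.insertMissingParentageForAllFiles(childDataset, insertFlag=False)
    return failedBlocks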
Esempio n. 9
0
class PileupFetcher(FetcherInterface):
    """
    Pull dataset block/SE : LFN list from DBS for the
    pileup datasets required by the steps in the job.

    Save these maps as files in the sandbox

    """
    def __init__(self):
        """
        Prepare module setup
        """
        super(PileupFetcher, self).__init__()
        if usingRucio():
            # Too much work to pass the rucio account name all the way to here
            # just use the production rucio account for resolving pileup location
            self.rucio = Rucio("wma_prod",
                               configDict={'phedexCompatible': False})
        else:
            self.phedex = PhEDEx()  # this will go away eventually

    def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader):
        """
        Method iterates over components of the pileup configuration input
        and queries DBS. Then iterates over results from DBS.

        There needs to be a list of files and their locations for each
        dataset name.
        Use dbsReader.
        The result data structure is a Python dict of the following form
        (FileList is a list of LFNs):

        {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
                         "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ....}

        This structure preserves the knowledge of where particular files of the
        dataset are physically located (list of PNNs). DBS only lists sites which
        have all files belonging to a block, but e.g. BlockA of dataset DS1 may
        be located at site1 and BlockB only at site2 - it's possible that only
        a subset of the blocks in a dataset will be at a site.

        """
        resultDict = {}
        # iterate over input pileup types (e.g. "cosmics", "minbias")
        for pileupType in stepHelper.data.pileup.listSections_():
            # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set]
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType),
                               "dataset")
            # each dataset input can generally be a list, iterate over dataset names
            blockDict = {}
            for dataset in datasets:

                blockFileInfo = dbsReader.getFileListByDataset(dataset=dataset,
                                                               detail=True)

                for fileInfo in blockFileInfo:
                    blockDict.setdefault(fileInfo['block_name'], {
                        'FileList': [],
                        'NumberOfEvents': 0,
                        'PhEDExNodeNames': []
                    })
                    blockDict[fileInfo['block_name']]['FileList'].append(
                        {'logical_file_name': fileInfo['logical_file_name']})
                    blockDict[fileInfo['block_name']][
                        'NumberOfEvents'] += fileInfo['event_count']

                self._getDatasetLocation(dataset, blockDict)

            resultDict[pileupType] = blockDict
        return resultDict
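
    # Illustrative shape of resultDict for a single pileup type with one block
    # (dataset, block and file names are hypothetical), after _getDatasetLocation()
    # has filled in the block locations:
    #
    #   {"mc": {"/MinBias/Summer19-v1/GEN-SIM#abcd-1234": {
    #        "FileList": [{"logical_file_name": "/store/mc/.../pileup.root"}],
    #        "NumberOfEvents": 10000,
    #        "PhEDExNodeNames": ["T1_US_FNAL_Disk"]}}}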

    def _getDatasetLocation(self, dset, blockDict):
        """
        Given a dataset name, query PhEDEx or Rucio and resolve the block location
        :param dset: string with the dataset name
        :param blockDict: dictionary with DBS summary info
        :return: update blockDict in place
        """
        node_filter = set(['UNKNOWN', None])

        if hasattr(self, "rucio"):
            # then it's Rucio!!
            blockReplicasInfo = self.rucio.getReplicaInfoForBlocks(
                dataset=dset)
            for item in blockReplicasInfo:
                block = item['name']
                try:
                    blockDict[block]['PhEDExNodeNames'] = item['replica']
                    blockDict[block]['FileList'] = sorted(
                        blockDict[block]['FileList'])
                except KeyError:
                    logging.warning(
                        "Block '%s' does not have any complete Rucio replica",
                        block)
        else:
            blockReplicasInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                dataset=dset, complete='y')
            for block in blockReplicasInfo:
                nodes = set(blockReplicasInfo[block]) - node_filter
                try:
                    blockDict[block]['PhEDExNodeNames'] = list(nodes)
                    blockDict[block]['FileList'] = sorted(
                        blockDict[block]['FileList'])
                except KeyError:
                    logging.warning(
                        "Block '%s' does not have any complete PhEDEx replica",
                        block)

    def _getCacheFilePath(self, stepHelper):

        fileName = ""
        for pileupType in stepHelper.data.pileup.listSections_():
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType),
                               "dataset")
            fileName += ("_").join(datasets)
        # TODO: the cache is not very effective if the dataset combination differs between workflows
        # there is also a possibility of hash value collisions
        cacheFile = "%s/pileupconf-%s.json" % (self.cacheDirectory(),
                                               hash(fileName))
        return cacheFile
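
    # Sketch of the cache-file naming scheme above (paths and datasets are
    # hypothetical): all pileup dataset names of the step are concatenated and
    # hashed, so two steps with the same dataset combination share one cache file.
    #
    #   fileName = "/Cosmics/CRAFT-v1/RAW" + "_" + "/MinBias/Summer19-v1/GEN-SIM"
    #   cacheFile = "%s/pileupconf-%s.json" % ("/path/to/cacheDir", hash(fileName))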

    def _getStepFilePath(self, stepHelper):
        stepPath = "%s/%s" % (self.workingDirectory(), stepHelper.name())
        fileName = "%s/%s" % (stepPath, "pileupconf.json")

        return fileName

    def _writeFile(self, filePath, jsonPU):

        directory = filePath.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        try:
            with open(filePath, 'w') as f:
                f.write(jsonPU)
        except IOError:
            m = "Could not save pileup JSON configuration file: '%s'" % filePath
            raise RuntimeError(m)

    def _copyFile(self, src, dest):

        directory = dest.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        shutil.copyfile(src, dest)

    def _isCacheExpired(self, cacheFilePath, delta=24):
        """Is the cache expired? At delta hours (default 24) in the future.
        """
        # cache can either be a file name or an already opened file object

        if not os.path.exists(cacheFilePath):
            return True

        delta = datetime.timedelta(hours=delta)
        t = datetime.datetime.now() - delta
        # cache file mtime has been set to cache expiry time
        if os.path.getmtime(cacheFilePath) < time.mktime(t.timetuple()):
            return True

        return False
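
    # Sketch of the expiry rule above: the cache counts as expired when the file's
    # mtime is older than `delta` hours, so the delta=0.5 used in _isCacheValid()
    # below gives a 30 minute lifetime. Variable names mirror the method's own.
    #
    #   cutoff = time.mktime((datetime.datetime.now()
    #                         - datetime.timedelta(hours=delta)).timetuple())
    #   expired = os.path.getmtime(cacheFilePath) < cutoff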

    def _isCacheValid(self, stepHelper):
        """
        Check whether a valid cache file exists.
        TODO: if the cacheDirectory is not inside the Sandbox, it should not be automatically deleted.
              A cache refresh policy could be added here.
        """
        cacheFile = self._getCacheFilePath(stepHelper)

        if not self._isCacheExpired(
                cacheFile, delta=0.5) and os.path.getsize(cacheFile) > 0:
            # if file already exist don't make a new dbs call and overwrite the file.
            # just return
            fileName = self._getStepFilePath(stepHelper)
            if not os.path.isfile(fileName) or os.path.getsize(
                    fileName) != os.path.getsize(cacheFile):
                self._copyFile(cacheFile, fileName)
            return True
        else:
            return False

    def _saveFile(self, stepHelper, jsonPU):

        cacheFile = self._getCacheFilePath(stepHelper)
        self._writeFile(cacheFile, jsonPU)
        fileName = self._getStepFilePath(stepHelper)
        self._copyFile(cacheFile, fileName)

    def createPileupConfigFile(self, helper):
        """
        Stores pileup JSON configuration file in the working
        directory / sandbox.

        """
        if self._isCacheValid(helper):
            # if file already exist don't make a new dbs call and overwrite the file.
            # just return
            return

        encoder = JSONEncoder()
        # this should have been set in CMSSWStepHelper along with
        # the pileup configuration
        url = helper.data.dbsUrl
        dbsReader = DBSReader(url)

        configDict = self._queryDbsAndGetPileupConfig(helper, dbsReader)

        # create JSON and save into a file
        jsonPU = encoder.encode(configDict)
        self._saveFile(helper, jsonPU)

    def __call__(self, wmTask):
        """
        Method is called when WorkQueue creates the sandbox for a job.
        Need to look at the pileup configuration in the spec and query dbs to
        determine the lfns for the files in the datasets and what sites they're
        located at (WQ creates the job sandbox).

        wmTask is instance of WMTask.WMTaskHelper

        """
        for step in wmTask.steps().nodeIterator():
            helper = WMStep.WMStepHelper(step)
            # returns e.g. instance of CMSSWHelper
            # doesn't seem to be necessary ... strangely (some inheritance involved?)
            # typeHelper = helper.getTypeHelper()
            if hasattr(helper.data, "pileup"):
                self.createPileupConfigFile(helper)
Esempio n. 10
0
class WMBSHelperTest(EmulatedUnitTestCase):
    def setUp(self):
        """
        _setUp_

        """
        super(WMBSHelperTest, self).setUp()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setupCouch("wmbshelper_t/jobs", "JobDump")
        self.testInit.setupCouch("wmbshelper_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("config_test", "GroupUser", "ConfigCache")
        os.environ["COUCHDB"] = "wmbshelper_t"
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir",
            "WMCore.ResourceControl"
        ],
                                useDefault=False)

        self.workDir = self.testInit.generateWorkDir()

        self.wmspec = self.createWMSpec()
        self.topLevelTask = getFirstTask(self.wmspec)
        self.inputDataset = self.topLevelTask.inputDataset()
        self.dataset = self.topLevelTask.getInputDatasetPath()
        self.dbs = DBSReader(self.inputDataset.dbsurl)
        self.phedex = PhEDEx()
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=threading.currentThread().logger,
                                     dbinterface=threading.currentThread().dbi)

        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.config = loadConfigurationFile(self.configFile)

        self.config.component_("JobSubmitter")
        self.config.JobSubmitter.submitDir = self.workDir
        self.config.JobSubmitter.submitScript = os.path.join(
            getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh')

        return

    def tearDown(self):
        """
        _tearDown_

        Clear out the database.
        """
        self.testInit.clearDatabase()
        self.testInit.tearDownCouch()
        self.testInit.delWorkDir()
        EmulatorSetup.deleteConfig(self.configFile)
        super(WMBSHelperTest, self).tearDown()

        return

    def setupForKillTest(self, baAPI=None):
        """
        _setupForKillTest_

        Inject a workflow into WMBS that has a processing task, a merge task and
        a cleanup task.  Inject files into the various tasks at various
        processing states (acquired, complete, available...).  Also create jobs
        for each subscription in various states.
        """
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)

        dummyLocationAction = daoFactory(classname="Locations.New")
        changeStateAction = daoFactory(classname="Jobs.ChangeState")
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName='site1', taskType='Processing', \
                                        maxSlots=10000, pendingSlots=10000)

        userDN = 'someDN'
        userAction = daoFactory(classname="Users.New")
        userAction.execute(dn=userDN,
                           group_name='DEFAULT',
                           role_name='DEFAULT')

        inputFileset = Fileset("input")
        inputFileset.create()

        inputFileA = File("lfnA", locations="goodse.cern.ch")
        inputFileB = File("lfnB", locations="goodse.cern.ch")
        inputFileC = File("lfnC", locations="goodse.cern.ch")
        inputFileA.create()
        inputFileB.create()
        inputFileC.create()

        inputFileset.addFile(inputFileA)
        inputFileset.addFile(inputFileB)
        inputFileset.addFile(inputFileC)
        inputFileset.commit()

        unmergedOutputFileset = Fileset("unmerged")
        unmergedOutputFileset.create()

        unmergedFileA = File("ulfnA", locations="goodse.cern.ch")
        unmergedFileB = File("ulfnB", locations="goodse.cern.ch")
        unmergedFileC = File("ulfnC", locations="goodse.cern.ch")
        unmergedFileA.create()
        unmergedFileB.create()
        unmergedFileC.create()

        unmergedOutputFileset.addFile(unmergedFileA)
        unmergedOutputFileset.addFile(unmergedFileB)
        unmergedOutputFileset.addFile(unmergedFileC)
        unmergedOutputFileset.commit()

        mainProcWorkflow = Workflow(spec="spec1",
                                    owner="Steve",
                                    name="Main",
                                    task="Proc")
        mainProcWorkflow.create()
        mainProcMergeWorkflow = Workflow(spec="spec1",
                                         owner="Steve",
                                         name="Main",
                                         task="ProcMerge")
        mainProcMergeWorkflow.create()
        mainCleanupWorkflow = Workflow(spec="spec1",
                                       owner="Steve",
                                       name="Main",
                                       task="Cleanup")
        mainCleanupWorkflow.create()

        self.mainProcSub = Subscription(fileset=inputFileset,
                                        workflow=mainProcWorkflow,
                                        type="Processing")
        self.mainProcSub.create()
        self.mainProcSub.acquireFiles(inputFileA)
        self.mainProcSub.completeFiles(inputFileB)

        procJobGroup = JobGroup(subscription=self.mainProcSub)
        procJobGroup.create()
        self.procJobA = Job(name="ProcJobA")
        self.procJobA["state"] = "new"
        self.procJobA["location"] = "site1"
        self.procJobB = Job(name="ProcJobB")
        self.procJobB["state"] = "executing"
        self.procJobB["location"] = "site1"
        self.procJobC = Job(name="ProcJobC")
        self.procJobC["state"] = "complete"
        self.procJobC["location"] = "site1"
        self.procJobA.create(procJobGroup)
        self.procJobB.create(procJobGroup)
        self.procJobC.create(procJobGroup)

        self.mainMergeSub = Subscription(fileset=unmergedOutputFileset,
                                         workflow=mainProcMergeWorkflow,
                                         type="Merge")
        self.mainMergeSub.create()
        self.mainMergeSub.acquireFiles(unmergedFileA)
        self.mainMergeSub.failFiles(unmergedFileB)

        mergeJobGroup = JobGroup(subscription=self.mainMergeSub)
        mergeJobGroup.create()
        self.mergeJobA = Job(name="MergeJobA")
        self.mergeJobA["state"] = "exhausted"
        self.mergeJobA["location"] = "site1"
        self.mergeJobB = Job(name="MergeJobB")
        self.mergeJobB["state"] = "cleanout"
        self.mergeJobB["location"] = "site1"
        self.mergeJobC = Job(name="MergeJobC")
        self.mergeJobC["state"] = "new"
        self.mergeJobC["location"] = "site1"
        self.mergeJobA.create(mergeJobGroup)
        self.mergeJobB.create(mergeJobGroup)
        self.mergeJobC.create(mergeJobGroup)

        self.mainCleanupSub = Subscription(fileset=unmergedOutputFileset,
                                           workflow=mainCleanupWorkflow,
                                           type="Cleanup")
        self.mainCleanupSub.create()
        self.mainCleanupSub.acquireFiles(unmergedFileA)
        self.mainCleanupSub.completeFiles(unmergedFileB)

        cleanupJobGroup = JobGroup(subscription=self.mainCleanupSub)
        cleanupJobGroup.create()
        self.cleanupJobA = Job(name="CleanupJobA")
        self.cleanupJobA["state"] = "new"
        self.cleanupJobA["location"] = "site1"
        self.cleanupJobB = Job(name="CleanupJobB")
        self.cleanupJobB["state"] = "executing"
        self.cleanupJobB["location"] = "site1"
        self.cleanupJobC = Job(name="CleanupJobC")
        self.cleanupJobC["state"] = "complete"
        self.cleanupJobC["location"] = "site1"
        self.cleanupJobA.create(cleanupJobGroup)
        self.cleanupJobB.create(cleanupJobGroup)
        self.cleanupJobC.create(cleanupJobGroup)

        jobList = [
            self.procJobA, self.procJobB, self.procJobC, self.mergeJobA,
            self.mergeJobB, self.mergeJobC, self.cleanupJobA, self.cleanupJobB,
            self.cleanupJobC
        ]

        changeStateAction.execute(jobList)

        if baAPI:
            for job in jobList:
                job['plugin'] = 'TestPlugin'
                job['userdn'] = userDN
                job['usergroup'] = 'DEFAULT'
                job['userrole'] = 'DEFAULT'
                job['custom']['location'] = 'site1'
            baAPI.createNewJobs(wmbsJobs=jobList)

        # We'll create an unrelated workflow to verify that it isn't affected
        # by the killing code.
        bogusFileset = Fileset("dontkillme")
        bogusFileset.create()

        bogusFileA = File("bogus/lfnA", locations="goodse.cern.ch")
        bogusFileA.create()
        bogusFileset.addFile(bogusFileA)
        bogusFileset.commit()

        bogusWorkflow = Workflow(spec="spec2",
                                 owner="Steve",
                                 name="Bogus",
                                 task="Proc")
        bogusWorkflow.create()
        self.bogusSub = Subscription(fileset=bogusFileset,
                                     workflow=bogusWorkflow,
                                     type="Processing")
        self.bogusSub.create()
        self.bogusSub.acquireFiles(bogusFileA)
        return

    def verifyFileKillStatus(self):
        """
        _verifyFileKillStatus_

        Verify that all files were killed correctly.  The status of files in
        Cleanup and LogCollect subscriptions isn't modified.  Status of
        already completed and failed files is not modified.  Also verify that
        the bogus subscription is untouched.
        """
        failedFiles = self.mainProcSub.filesOfStatus("Failed")
        acquiredFiles = self.mainProcSub.filesOfStatus("Acquired")
        completedFiles = self.mainProcSub.filesOfStatus("Completed")
        availableFiles = self.mainProcSub.filesOfStatus("Available")
        bogusAcquiredFiles = self.bogusSub.filesOfStatus("Acquired")

        self.assertEqual(len(availableFiles), 0, \
                         "Error: There should be no available files.")
        self.assertEqual(len(acquiredFiles), 0, \
                         "Error: There should be no acquired files.")
        self.assertEqual(len(bogusAcquiredFiles), 1, \
                         "Error: There should be one acquired file.")

        self.assertEqual(len(completedFiles), 3, \
                         "Error: There should be three completed files.")
        goldenLFNs = ["lfnA", "lfnB", "lfnC"]
        for completedFile in completedFiles:
            self.assertTrue(completedFile["lfn"] in goldenLFNs, \
                            "Error: Extra completed file.")
            goldenLFNs.remove(completedFile["lfn"])

        self.assertEqual(len(failedFiles), 0, \
                         "Error: There should be no failed files.")

        self.assertEqual(len(goldenLFNs), 0, \
                         "Error: Missing LFN")

        failedFiles = self.mainMergeSub.filesOfStatus("Failed")
        acquiredFiles = self.mainMergeSub.filesOfStatus("Acquired")
        completedFiles = self.mainMergeSub.filesOfStatus("Completed")
        availableFiles = self.mainMergeSub.filesOfStatus("Available")

        self.assertEqual(len(acquiredFiles), 0, \
                         "Error: Merge subscription should have 0 acq files.")
        self.assertEqual(len(availableFiles), 0, \
                         "Error: Merge subscription should have 0 avail files.")

        self.assertEqual(len(failedFiles), 1, \
                         "Error: Merge subscription should have 1 failed file.")
        self.assertEqual(
            list(failedFiles)[0]["lfn"], "ulfnB", "Error: Wrong failed file.")

        self.assertEqual(len(completedFiles), 2, \
                         "Error: Merge subscription should have 2 compl files.")
        goldenLFNs = ["ulfnA", "ulfnC"]
        for completedFile in completedFiles:
            self.assertTrue(completedFile["lfn"] in goldenLFNs, \
                            "Error: Extra complete file.")
            goldenLFNs.remove(completedFile["lfn"])

        self.assertEqual(len(goldenLFNs), 0, \
                         "Error: Missing LFN")

        failedFiles = self.mainCleanupSub.filesOfStatus("Failed")
        acquiredFiles = self.mainCleanupSub.filesOfStatus("Acquired")
        completedFiles = self.mainCleanupSub.filesOfStatus("Completed")
        availableFiles = self.mainCleanupSub.filesOfStatus("Available")

        self.assertEqual(len(failedFiles), 0, \
                         "Error: Cleanup subscription should have 0 fai files.")

        self.assertEqual(len(acquiredFiles), 1, \
                         "Error: There should be only one acquired file.")
        self.assertEqual(list(acquiredFiles)[0]["lfn"], "ulfnA", \
                         "Error: Wrong acquired LFN.")

        self.assertEqual(len(completedFiles), 1, \
                         "Error: There should be only one completed file.")
        self.assertEqual(list(completedFiles)[0]["lfn"], "ulfnB", \
                         "Error: Wrong completed LFN.")

        self.assertEqual(len(availableFiles), 1, \
                         "Error: There should be only one available file.")
        self.assertEqual(list(availableFiles)[0]["lfn"], "ulfnC", \
                         "Error: Wrong available LFN.")

        return

    def verifyJobKillStatus(self):
        """
        _verifyJobKillStatus_

        Verify that jobs are killed correctly.  Jobs belonging to Cleanup and
        LogCollect subscriptions are not killed.  The status of jobs that have
        already finished running is not changed.
        """
        self.procJobA.load()
        self.procJobB.load()
        self.procJobC.load()

        self.assertEqual(self.procJobA["state"], "killed", \
                         "Error: Proc job A should be killed.")
        self.assertEqual(self.procJobB["state"], "killed", \
                         "Error: Proc job B should be killed.")
        self.assertEqual(self.procJobC["state"], "complete", \
                         "Error: Proc job C should be complete.")

        self.mergeJobA.load()
        self.mergeJobB.load()
        self.mergeJobC.load()

        self.assertEqual(self.mergeJobA["state"], "exhausted", \
                         "Error: Merge job A should be exhausted.")
        self.assertEqual(self.mergeJobB["state"], "cleanout", \
                         "Error: Merge job B should be cleanout.")
        self.assertEqual(self.mergeJobC["state"], "killed", \
                         "Error: Merge job C should be killed.")

        self.cleanupJobA.load()
        self.cleanupJobB.load()
        self.cleanupJobC.load()

        self.assertEqual(self.cleanupJobA["state"], "new", \
                         "Error: Cleanup job A should be new.")
        self.assertEqual(self.cleanupJobB["state"], "executing", \
                         "Error: Cleanup job B should be executing.")
        self.assertEqual(self.cleanupJobC["state"], "complete", \
                         "Error: Cleanup job C should be complete.")
        return

    def createTestWMSpec(self):
        """
        _createTestWMSpec_

        Create a WMSpec with processing, merge, cleanup and skim tasks that
        can be used by the subscription creation test.
        """
        testWorkload = WMWorkloadHelper(WMWorkload("TestWorkload"))
        testWorkload.setDashboardActivity("TestReReco")
        testWorkload.setSpecUrl("/path/to/workload")
        testWorkload.setOwnerDetails("sfoulkes", "DMWM", {'dn': 'MyDN'})

        procTask = testWorkload.newTask("ProcessingTask")
        procTask.setTaskType("Processing")
        procTask.setSplittingAlgorithm("FileBased", files_per_job=1)
        procTaskCMSSW = procTask.makeStep("cmsRun1")
        procTaskCMSSW.setStepType("CMSSW")
        procTaskCMSSWHelper = procTaskCMSSW.getTypeHelper()
        procTask.setTaskType("Processing")
        procTask.setSiteWhitelist(["site1"])
        procTask.setSiteBlacklist(["site2"])
        procTask.applyTemplates()

        procTaskCMSSWHelper.addOutputModule("OutputA",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierA",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        mergeTask = procTask.addTask("MergeTask")
        mergeTask.setInputReference(procTaskCMSSW,
                                    outputModule="OutputA",
                                    dataTier='DataTierA')
        mergeTask.setTaskType("Merge")
        mergeTask.setSplittingAlgorithm("WMBSMergeBySize",
                                        min_merge_size=1,
                                        max_merge_size=2,
                                        max_merge_events=3)
        mergeTaskCMSSW = mergeTask.makeStep("cmsRun1")
        mergeTaskCMSSW.setStepType("CMSSW")
        mergeTaskCMSSWHelper = mergeTaskCMSSW.getTypeHelper()
        mergeTask.setTaskType("Merge")
        mergeTask.applyTemplates()

        mergeTaskCMSSWHelper.addOutputModule("Merged",
                                             primaryDataset="bogusPrimary",
                                             processedDataset="bogusProcessed",
                                             dataTier="DataTierA",
                                             lfnBase="bogusUnmerged",
                                             mergedLFNBase="bogusMerged",
                                             filterName=None)

        cleanupTask = procTask.addTask("CleanupTask")
        cleanupTask.setInputReference(procTaskCMSSW,
                                      outputModule="OutputA",
                                      dataTier="DataTierA")
        cleanupTask.setTaskType("Merge")
        cleanupTask.setSplittingAlgorithm("SiblingProcessingBased",
                                          files_per_job=50)
        cleanupTaskCMSSW = cleanupTask.makeStep("cmsRun1")
        cleanupTaskCMSSW.setStepType("CMSSW")
        cleanupTask.setTaskType("Cleanup")
        cleanupTask.applyTemplates()

        skimTask = mergeTask.addTask("SkimTask")
        skimTask.setTaskType("Skim")
        skimTask.setInputReference(mergeTaskCMSSW,
                                   outputModule="Merged",
                                   dataTier="DataTierA")
        skimTask.setSplittingAlgorithm("FileBased",
                                       files_per_job=1,
                                       include_parents=True)
        skimTaskCMSSW = skimTask.makeStep("cmsRun1")
        skimTaskCMSSW.setStepType("CMSSW")
        skimTaskCMSSWHelper = skimTaskCMSSW.getTypeHelper()
        skimTask.setTaskType("Skim")
        skimTask.applyTemplates()

        skimTaskCMSSWHelper.addOutputModule("SkimOutputA",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierA",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        skimTaskCMSSWHelper.addOutputModule("SkimOutputB",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierB",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        return testWorkload

    def setupMCWMSpec(self):
        """Setup MC workflow"""
        self.wmspec = self.createMCWMSpec()
        self.topLevelTask = getFirstTask(self.wmspec)
        self.inputDataset = self.topLevelTask.inputDataset()
        self.dataset = self.topLevelTask.getInputDatasetPath()
        self.dbs = None

        # add sites that would normally be added by operator via resource_control
        locationDAO = self.daoFactory(classname="Locations.New")
        self.pnns = []
        for site in ['T2_XX_SiteA', 'T2_XX_SiteB']:
            locationDAO.execute(siteName=site, pnn=site)
            self.pnns.append(site)

    def createWMSpec(self, name='ReRecoWorkload'):
        factory = ReRecoWorkloadFactory()
        rerecoArgs["ConfigCacheID"] = createConfig(rerecoArgs["CouchDBName"])
        wmspec = factory.factoryWorkloadConstruction(name, rerecoArgs)
        wmspec.setSpecUrl("/path/to/workload")
        wmspec.setSubscriptionInformation(custodialSites=[],
                                          nonCustodialSites=[],
                                          autoApproveSites=[],
                                          priority="Low",
                                          custodialSubType="Move")
        return wmspec

    def createMCWMSpec(self, name='MonteCarloWorkload'):
        mcArgs = TaskChainWorkloadFactory.getTestArguments()
        mcArgs["CouchDBName"] = rerecoArgs["CouchDBName"]
        mcArgs["Task1"]["ConfigCacheID"] = createConfig(mcArgs["CouchDBName"])

        wmspec = taskChainWorkload(name, mcArgs)
        wmspec.setSpecUrl("/path/to/workload")
        getFirstTask(wmspec).addProduction(totalevents=10000)
        return wmspec

    def getDBS(self, wmspec):
        topLevelTask = getFirstTask(wmspec)
        inputDataset = topLevelTask.inputDataset()
        dbs = DBSReader(inputDataset.dbsurl)
        # dbsDict = {self.inputDataset.dbsurl : self.dbs}
        return dbs

    def createWMBSHelperWithTopTask(self,
                                    wmspec,
                                    block,
                                    mask=None,
                                    parentFlag=False,
                                    detail=False,
                                    commonLocation=None):
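        """
        Build a WMBSHelper for the top-level task of the given spec.  When a
        block name is passed, the block is looked up in DBS (optionally with
        parents) and decorated with its PhEDEx node names before the
        subscription and files are created.  With detail=True the helper, the
        subscription and the added files are all returned.
        """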

        topLevelTask = getFirstTask(wmspec)

        wmbs = WMBSHelper(wmspec,
                          topLevelTask.name(),
                          block,
                          mask,
                          cachepath=self.workDir,
                          commonLocation=commonLocation)
        if block:
            blockName = block
            if parentFlag:
                block = self.dbs.getFileBlockWithParents(blockName)
                location = self.phedex.getReplicaPhEDExNodesForBlocks(
                    block=[blockName], complete='y')
                block['PhEDExNodeNames'] = location[blockName]
            else:
                block = self.dbs.getFileBlock(blockName)
                location = self.phedex.getReplicaPhEDExNodesForBlocks(
                    block=[blockName], complete='y')
                block['PhEDExNodeNames'] = location[blockName]
        sub, files = wmbs.createSubscriptionAndAddFiles(block=block)
        if detail:
            return wmbs, sub, files
        else:
            return wmbs

    def testKillWorkflow(self):
        """
        _testKillWorkflow_

        Verify that workflow killing works correctly.
        """
        baAPI = BossAirAPI(config=self.config, insertStates=True)

        # Create nine jobs
        self.setupForKillTest(baAPI=baAPI)
        self.assertEqual(len(baAPI._listRunJobs()), 9)
        killWorkflow("Main", self.config, self.config)

        self.verifyFileKillStatus()
        self.verifyJobKillStatus()
        self.assertEqual(len(baAPI._listRunJobs()), 8)

        return

    def testCreateSubscription(self):
        """
        _testCreateSubscription_

        Verify that the subscription creation code works correctly.
        """
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertSite(siteName='site2',
                                   pnn='goodse2.cern.ch',
                                   ceName='site2',
                                   plugin="TestPlugin")

        testWorkload = self.createTestWMSpec()
        testTopLevelTask = getFirstTask(testWorkload)
        testWMBSHelper = WMBSHelper(testWorkload,
                                    testTopLevelTask.name(),
                                    "SomeBlock",
                                    cachepath=self.workDir)
        testWMBSHelper.createTopLevelFileset()
        testWMBSHelper._createSubscriptionsInWMBS(
            testTopLevelTask, testWMBSHelper.topLevelFileset)

        procWorkflow = Workflow(name="TestWorkload",
                                task="/TestWorkload/ProcessingTask")
        procWorkflow.load()

        self.assertEqual(procWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner: %s" % procWorkflow.owner)
        self.assertEqual(procWorkflow.group, "DMWM",
                         "Error: Wrong group: %s" % procWorkflow.group)
        self.assertEqual(procWorkflow.wfType, "TestReReco",
                         "Error: Wrong type.")
        self.assertEqual(
            procWorkflow.spec,
            os.path.join(self.workDir, procWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(procWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")
        mergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][
            "merged_output_fileset"]
        unmergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][
            "output_fileset"]

        mergedProcOutput.loadData()
        unmergedProcOutput.loadData()
        self.assertEqual(
            mergedProcOutput.name,
            "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedProcOutput.name,
            "/TestWorkload/ProcessingTask/unmerged-OutputADataTierA",
            "Error: Unmerged output fileset is wrong.")

        mergeWorkflow = Workflow(name="TestWorkload",
                                 task="/TestWorkload/ProcessingTask/MergeTask")
        mergeWorkflow.load()

        self.assertEqual(mergeWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            mergeWorkflow.spec,
            os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(mergeWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")

        cleanupWorkflow = Workflow(
            name="TestWorkload",
            task="/TestWorkload/ProcessingTask/CleanupTask")
        cleanupWorkflow.load()

        self.assertEqual(cleanupWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            cleanupWorkflow.spec,
            os.path.join(self.workDir, cleanupWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(cleanupWorkflow.outputMap), 0,
                         "Error: Wrong number of WF outputs.")

        unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0][
            "output_fileset"]
        unmergedMergeOutput.loadData()

        self.assertEqual(
            unmergedMergeOutput.name,
            "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA",
            "Error: Unmerged output fileset is wrong.")

        skimWorkflow = Workflow(
            name="TestWorkload",
            task="/TestWorkload/ProcessingTask/MergeTask/SkimTask")
        skimWorkflow.load()

        self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.")
        self.assertEqual(
            skimWorkflow.spec,
            os.path.join(self.workDir, skimWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(skimWorkflow.outputMap), 2,
                         "Error: Wrong number of WF outputs.")

        mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0][
            "merged_output_fileset"]
        unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][
            0]["output_fileset"]
        mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0][
            "merged_output_fileset"]
        unmergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][
            0]["output_fileset"]

        mergedSkimOutputA.loadData()
        mergedSkimOutputB.loadData()
        unmergedSkimOutputA.loadData()
        unmergedSkimOutputB.loadData()

        self.assertEqual(
            mergedSkimOutputA.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Merged output fileset is wrong: %s" %
            mergedSkimOutputA.name)
        self.assertEqual(
            unmergedSkimOutputA.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Unmerged output fileset is wrong.")
        self.assertEqual(
            mergedSkimOutputB.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedSkimOutputB.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Unmerged output fileset is wrong.")

        topLevelFileset = Fileset(name="TestWorkload-ProcessingTask-SomeBlock")
        topLevelFileset.loadData()

        procSubscription = Subscription(fileset=topLevelFileset,
                                        workflow=procWorkflow)
        procSubscription.loadData()

        self.assertEqual(len(procSubscription.getWhiteBlackList()), 2,
                         "Error: Wrong site white/black list for proc sub.")
        for site in procSubscription.getWhiteBlackList():
            if site["site_name"] == "site1":
                self.assertEqual(site["valid"], 1,
                                 "Error: Site should be white listed.")
            else:
                self.assertEqual(site["valid"], 0,
                                 "Error: Site should be black listed.")

        self.assertEqual(procSubscription["type"], "Processing",
                         "Error: Wrong subscription type.")
        self.assertEqual(procSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")

        mergeSubscription = Subscription(fileset=unmergedProcOutput,
                                         workflow=mergeWorkflow)
        mergeSubscription.loadData()

        self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0,
                         "Error: Wrong white/black list for merge sub.")

        self.assertEqual(mergeSubscription["type"], "Merge",
                         "Error: Wrong subscription type.")
        self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize",
                         "Error: Wrong split algo.")

        skimSubscription = Subscription(fileset=unmergedMergeOutput,
                                        workflow=skimWorkflow)
        skimSubscription.loadData()

        self.assertEqual(skimSubscription["type"], "Skim",
                         "Error: Wrong subscription type.")
        self.assertEqual(skimSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")
        return

    def testTruncatedWFInsertion(self):
        """
        _testTruncatedWFInsertion_

        """
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertSite(siteName='site2',
                                   pnn='goodse2.cern.ch',
                                   ceName='site2',
                                   plugin="TestPlugin")

        testWorkload = self.createTestWMSpec()
        testTopLevelTask = getFirstTask(testWorkload)
        testWMBSHelper = WMBSHelper(testWorkload,
                                    testTopLevelTask.name(),
                                    "SomeBlock",
                                    cachepath=self.workDir)
        testWMBSHelper.createTopLevelFileset()
        testWMBSHelper._createSubscriptionsInWMBS(
            testTopLevelTask, testWMBSHelper.topLevelFileset)

        testWorkload.truncate("ResubmitTestWorkload",
                              "/TestWorkload/ProcessingTask/MergeTask",
                              "someserver", "somedatabase")

        # create  the subscription for multiple top task (MergeTask and CleanupTask for the same block)
        for task in testWorkload.getTopLevelTask():
            testResubmitWMBSHelper = WMBSHelper(testWorkload,
                                                task.name(),
                                                "SomeBlock2",
                                                cachepath=self.workDir)
            testResubmitWMBSHelper.createTopLevelFileset()
            testResubmitWMBSHelper._createSubscriptionsInWMBS(
                task, testResubmitWMBSHelper.topLevelFileset)

        mergeWorkflow = Workflow(name="ResubmitTestWorkload",
                                 task="/ResubmitTestWorkload/MergeTask")
        mergeWorkflow.load()

        self.assertEqual(mergeWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            mergeWorkflow.spec,
            os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(mergeWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")

        unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0][
            "output_fileset"]
        unmergedMergeOutput.loadData()

        self.assertEqual(
            unmergedMergeOutput.name,
            "/ResubmitTestWorkload/MergeTask/merged-MergedDataTierA",
            "Error: Unmerged output fileset is wrong.")

        skimWorkflow = Workflow(
            name="ResubmitTestWorkload",
            task="/ResubmitTestWorkload/MergeTask/SkimTask")
        skimWorkflow.load()

        self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.")
        self.assertEqual(
            skimWorkflow.spec,
            os.path.join(self.workDir, skimWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(skimWorkflow.outputMap), 2,
                         "Error: Wrong number of WF outputs.")

        mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0][
            "merged_output_fileset"]
        unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][
            0]["output_fileset"]
        mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0][
            "merged_output_fileset"]
        unmergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][
            0]["output_fileset"]

        mergedSkimOutputA.loadData()
        mergedSkimOutputB.loadData()
        unmergedSkimOutputA.loadData()
        unmergedSkimOutputB.loadData()

        self.assertEqual(
            mergedSkimOutputA.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Merged output fileset is wrong: %s" %
            mergedSkimOutputA.name)
        self.assertEqual(
            unmergedSkimOutputA.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Unmerged output fileset is wrong.")
        self.assertEqual(
            mergedSkimOutputB.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedSkimOutputB.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Unmerged output fileset is wrong.")

        topLevelFileset = Fileset(
            name="ResubmitTestWorkload-MergeTask-SomeBlock2")
        topLevelFileset.loadData()

        mergeSubscription = Subscription(fileset=topLevelFileset,
                                         workflow=mergeWorkflow)
        mergeSubscription.loadData()

        self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0,
                         "Error: Wrong white/black list for merge sub.")

        self.assertEqual(mergeSubscription["type"], "Merge",
                         "Error: Wrong subscription type.")
        self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize",
                         "Error: Wrong split algo.")

        skimSubscription = Subscription(fileset=unmergedMergeOutput,
                                        workflow=skimWorkflow)
        skimSubscription.loadData()

        self.assertEqual(skimSubscription["type"], "Skim",
                         "Error: Wrong subscription type.")
        self.assertEqual(skimSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")

        return

    def testReReco(self):
        """ReReco workflow"""
        # create workflow
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 5)

    def testReRecoBlackRunRestriction(self):
        """ReReco workflow with Run restrictions"""
        block = self.dataset + "#" + BLOCK2
        self.topLevelTask.setInputRunBlacklist(
            [181183])  # Set run blacklist to the only run in the block
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)

        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 0)

    def testReRecoWhiteRunRestriction(self):
        block = self.dataset + "#" + BLOCK2
        self.topLevelTask.setInputRunWhitelist(
            [181183])  # Set run whitelist to the only run in the block
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 1)

    def testLumiMaskRestrictionsOK(self):
        block = self.dataset + "#" + BLOCK1
        self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = ['181367']
        self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = ['57,80']
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 1)

    def testLumiMaskRestrictionsKO(self):
        block = self.dataset + "#" + BLOCK1
        self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = [
            '123454321'
        ]
        self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = [
            '123,123'
        ]
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 0)

    def testDuplicateFileInsert(self):
        # using default wmspec
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        # check initially inserted files.
        dbsFiles = self.dbs.getFileBlock(block)['Files']
        self.assertEqual(numOfFiles, len(dbsFiles))
        firstFileset = wmbs.topLevelFileset
        wmbsDao = wmbs.daofactory(classname="Files.InFileset")

        numOfFiles = len(wmbsDao.execute(firstFileset.id))
        self.assertEqual(numOfFiles, len(dbsFiles))

        # use the new spec with same inputdataset
        block = self.dataset + "#" + BLOCK1
        wmspec = self.createWMSpec("TestSpec1")
        dbs = self.getDBS(wmspec)
        wmbs = self.createWMBSHelperWithTopTask(wmspec, block)
        # check duplicate insert
        dbsFiles = dbs.getFileBlock(block)
        location = self.phedex.getReplicaPhEDExNodesForBlocks(block=[block],
                                                              complete='y')
        dbsFiles['PhEDExNodeNames'] = location[block]
        numOfFiles = wmbs.addFiles(dbsFiles)
        self.assertEqual(numOfFiles, 0)
        secondFileset = wmbs.topLevelFileset

        wmbsDao = wmbs.daofactory(classname="Files.InFileset")
        numOfFiles = len(wmbsDao.execute(secondFileset.id))
        self.assertEqual(numOfFiles, len(dbsFiles['Files']))

        self.assertNotEqual(firstFileset.id, secondFileset.id)

    def testDuplicateSubscription(self):
        """Can't duplicate subscriptions"""
        siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"]
        # using default wmspec
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        filesetId = wmbs.topLevelFileset.id
        subId = wmbs.topLevelSubscription['id']

        # check initially inserted files.
        dbsFiles = self.dbs.getFileBlock(block)['Files']
        self.assertEqual(numOfFiles, len(dbsFiles))

        # Not clear what's supposed to happen here, 2nd test is completely redundant
        dummyFirstFileset = wmbs.topLevelFileset
        self.assertEqual(numOfFiles, len(dbsFiles))

        # reinsert subscription - shouldn't create anything new
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files))
        self.assertEqual(filesetId, wmbs.topLevelFileset.id)
        self.assertEqual(subId, wmbs.topLevelSubscription['id'])

        # now do a montecarlo workflow
        self.setupMCWMSpec()
        mask = Mask(FirstRun=12,
                    FirstLumi=1234,
                    FirstEvent=12345,
                    LastEvent=999995,
                    LastLumi=12345,
                    LastRun=12)
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        filesetId = wmbs.topLevelFileset.id
        subId = wmbs.topLevelSubscription['id']

        # check initially inserted files.
        # Not clear what's supposed to happen here, 2nd test is completely redundant
        numDbsFiles = 1
        self.assertEqual(numOfFiles, numDbsFiles)
        dummyFirstFileset = wmbs.topLevelFileset
        self.assertEqual(numOfFiles, numDbsFiles)

        # reinsert subscription - shouldn't create anything new
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        wmbs.topLevelFileset.loadData()
        self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files))
        self.assertEqual(filesetId, wmbs.topLevelFileset.id)
        self.assertEqual(subId, wmbs.topLevelSubscription['id'])

    def testParentage(self):
        """
        1. check whether parent files are created in wmbs.
        2. check parent files are associated to child.
        3. When 2 specs with the same input data (one with parent processing, one without it)
           is inserted, if one without parent processing inserted first then the other with
           parent processing insert, it still needs to create parent files although child files
           are duplicate
        """

        # Swap out the dataset for one that has parents
        task = next(self.wmspec.taskIterator())
        oldDS = task.inputDataset()  # keep the old dataset; only its DBS URL is reused
        task.addInputDataset(name="/Cosmics/ComissioningHI-PromptReco-v1/RECO",
                             primary='Cosmics',
                             processed='ComissioningHI-PromptReco-v1',
                             tier='RECO',
                             dbsurl=oldDS.dbsurl)
        block = '/Cosmics/ComissioningHI-PromptReco-v1/RECO' + '#5b89ba9c-0dbf-11e1-9b6c-003048caaace'

        # File creation without parents
        wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec,
                                                             block,
                                                             parentFlag=False,
                                                             detail=True)
        self.assertEqual(8, numFiles)
        wmbs.topLevelFileset.loadData()
        for child in wmbs.topLevelFileset.files:
            self.assertEqual(len(child["parents"]), 0)  # no parents per child

        # File creation with parents
        wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec,
                                                             block,
                                                             parentFlag=True,
                                                             detail=True)
        self.assertEqual(8, numFiles)
        wmbs.topLevelFileset.loadData()
        for child in wmbs.topLevelFileset.files:
            self.assertEqual(len(child["parents"]), 1)  # one parent per child

    def testMCFakeFileInjection(self):
        """Inject fake Monte Carlo files into WMBS"""

        # This test is failing because the name of the couch DB is set to None
        # in BasicProductionWorkload.getProdArgs() but changing it to
        # "reqmgr_config_cache_t" from StdBase test arguments does not fix the
        # situation. testDuplicateSubscription probably has the same issue
        siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"]

        self.setupMCWMSpec()

        mask = Mask(FirstRun=12,
                    FirstLumi=1234,
                    FirstEvent=12345,
                    LastEvent=999995,
                    LastLumi=12345,
                    LastRun=12)

        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        subscription = wmbs.topLevelSubscription
        self.assertEqual(1, subscription.exists())
        fileset = subscription['fileset']
        self.assertEqual(1, fileset.exists())
        fileset.loadData()  # need to refresh from database

        self.assertEqual(len(fileset.files), 1)
        self.assertEqual(len(fileset.parents), 0)
        self.assertFalse(fileset.open)

        firstFile = list(fileset.files)[0]
        self.assertEqual(firstFile['events'], mask['LastEvent'] -
                         mask['FirstEvent'] + 1)  # inclusive range
        self.assertEqual(firstFile['merged'],
                         False)  # merged files get added to dbs
        self.assertEqual(len(firstFile['parents']), 0)
        # firstFile.loadData()
        self.assertEqual(sorted(firstFile['locations']), sorted(self.pnns))
        self.assertEqual(len(firstFile.getParentLFNs()), 0)

        self.assertEqual(len(firstFile.getRuns()), 1)
        run = firstFile.getRuns()[0]
        self.assertEqual(run.run, mask['FirstRun'])
        self.assertEqual(run.lumis[0], mask['FirstLumi'])
        self.assertEqual(run.lumis[-1], mask['LastLumi'])
        self.assertEqual(len(run.lumis),
                         mask['LastLumi'] - mask['FirstLumi'] + 1)
Example n. 11
0
class StartPolicyInterface(PolicyInterface):
    """Interface for start policies"""
    def __init__(self, **args):
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.badWork = []  # list of bad work units (e.g. without any valid files)
        self.pileupData = {}
        self.cric = CRIC()
        if usingRucio():
            self.rucio = Rucio(self.args['rucioAcct'],
                               configDict={'logger': self.logger})
        else:
            self.phedex = PhEDEx()  # this will go away eventually

    def split(self):
        """Apply policy to spec"""
        raise NotImplementedError

    def validate(self):
        """Check params and spec are appropriate for the policy"""
        raise NotImplementedError

    def validateCommon(self):
        """Common validation stuff"""
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Workflow name validation error: %s" % str(ex))
            raise error

        if self.initialTask.siteWhitelist():
            if isinstance(self.initialTask.siteWhitelist(), basestring):
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site whitelist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteWhitelist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteWhitelist()
                ]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site whitelist validation error: %s" % str(ex))
                raise error
        else:
            error = WorkQueueWMSpecError(
                self.wmspec,
                "Site whitelist validation error: Empty site whitelist")
            raise error

        if self.initialTask.siteBlacklist():
            if isinstance(self.initialTask.siteBlacklist(), basestring):
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site blacklist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteBlacklist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteBlacklist()
                ]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site blacklist validation error: %s" % str(ex))
                raise error

        # splitter settings
        if self.args.get('SliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(
                self.wmspec, 'Zero or negative SliceSize parameter')
            raise error
        if self.args.get('SubSliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(
                self.wmspec, 'Zero or negative SubSliceSize parameter')
            raise error

        # check input dataset is valid
        try:
            if self.initialTask.getInputDatasetPath():
                Lexicon.dataset(self.initialTask.getInputDatasetPath())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Dataset validation error: %s" % str(ex))
            raise error

        # if pileup is found, check that they are valid datasets
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            for dbsUrl in pileupDatasets:
                for dataset in pileupDatasets[dbsUrl]:
                    Lexicon.dataset(dataset)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Pileup dataset validation error: %s" % str(ex))
            raise error

    def newQueueElement(self, **args):
        # DBS Url may not be available in the initial task
        # but in the pileup data (MC pileup)
        dbsUrl = self.initialTask.dbsUrl()
        if dbsUrl is None and self.pileupData:
            # Get the first DBS found
            dbsUrl = self.wmspec.listPileupDatasets().keys()[0]

        args.setdefault('Status', 'Available')
        args.setdefault('WMSpec', self.wmspec)
        args.setdefault('Task', self.initialTask)
        args.setdefault('RequestName', self.wmspec.name())
        args.setdefault('TaskName', self.initialTask.name())
        args.setdefault('Dbs', dbsUrl)
        args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
        args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
        args.setdefault('StartPolicy', self.wmspec.startPolicy())
        args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
        args.setdefault('Priority', self.wmspec.priority())
        args.setdefault('PileupData', self.pileupData)
        if not args['Priority']:
            args['Priority'] = 0
        ele = WorkQueueElement(**args)
        for data, sites in ele['Inputs'].items():
            if not sites:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'Input data has no locations "%s"' % data)

        # catch infinite splitting loops
        if len(self.workQueueElements) > self.args.get('maxRequestSize', 1e8):
            raise WorkQueueWMSpecError(
                self.wmspec,
                'Too many elements (%d)' % len(self.workQueueElements))
        self.workQueueElements.append(ele)

    def __call__(self,
                 wmspec,
                 task,
                 data=None,
                 mask=None,
                 team=None,
                 continuous=False):
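        # Entry point used when splitting a spec: store the spec/task, merge in
        # the spec's start policy parameters, validate, resolve pileup dataset
        # locations and finally run the policy-specific split().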
        self.wmspec = wmspec
        # bring in spec specific settings
        self.args.update(self.wmspec.startPolicyParameters())
        self.initialTask = task
        if data:
            self.data = data
        self.mask = mask
        self.validate()
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            if pileupDatasets:
                self.pileupData = self.getDatasetLocations(pileupDatasets)
            self.split()
        # For known exceptions raise custom error that will fail the workflow.
        except dbsClientException as ex:
            # A dbs configuration error implies the spec is invalid
            error = WorkQueueWMSpecError(self.wmspec,
                                         "DBS config error: %s" % str(ex))
            raise error
        except AssertionError as ex:
            # Assertion generally means validation of an input field failed
            error = WorkQueueWMSpecError(self.wmspec,
                                         "Assertion error: %s" % str(ex))
            raise error
        except DBSReaderError as ex:
            # Hacky way of identifying non-existent data, DbsBadRequest chomped by DBSReader
            if 'Invalid parameters' in str(ex):
                data = task.data.input.pythonise_(
                ) if task.data.input else 'None'
                msg = """data: %s, mask: %s, pileup: %s. %s""" % (
                    str(data), str(mask), str(pileupDatasets), str(ex))
                error = WorkQueueNoWorkError(self.wmspec, msg)
                raise error
            raise  # propagate other dbs errors

        # if we have no new elements and we are not adding work to request
        # already running, then raise exception
        if not self.workQueueElements and not continuous:
            data = task.data.input.pythonise_() if task.data.input else 'None'
            msg = "Failed to add work. Input data: %s, mask: %s." % (str(data),
                                                                     str(mask))
            error = WorkQueueNoWorkError(self.wmspec, msg)
            raise error

        return self.workQueueElements, self.rejectedWork, self.badWork

    def dbs(self, dbs_url=None):
        """Get DBSReader"""
        from WMCore.WorkQueue.WorkQueueUtils import get_dbs
        if dbs_url is None:
            dbs_url = self.initialTask.dbsUrl()
        return get_dbs(dbs_url)

    @staticmethod
    def supportsWorkAddition():
        """Indicates if a given policy supports addition of new work"""
        return False

    def getMaskedBlocks(self, task, dbs, datasetPath):
        """
        Get the blocks which pass the lumi mask restrictions. For each block
        return the list of lumis which were ok (given the lumi mask). The data
        structure returned is the following:
        {
            "block1" : {"file1" : LumiList(), "file5" : LumiList(), ...}
            "block2" : {"file2" : LumiList(), "file7" : LumiList(), ...}
        }
        """
        # Get the task mask as a LumiList object to make operations easier
        maskedBlocks = {}
        taskMask = task.getLumiMask()

        # for performance reasons, we first get all the blocknames
        blocks = [
            x['block_name'] for x in dbs.dbs.listBlocks(dataset=datasetPath)
        ]

        for block in blocks:
            fileLumis = dbs.dbs.listFileLumis(block_name=block,
                                              validFileOnly=1)
            for fileLumi in fileLumis:
                lfn = fileLumi['logical_file_name']
                runNumber = str(fileLumi['run_num'])
                lumis = fileLumi['lumi_section_num']
                fileMask = LumiList(runsAndLumis={runNumber: lumis})
                commonMask = taskMask & fileMask
                if commonMask:
                    maskedBlocks.setdefault(block, {})
                    maskedBlocks[block].setdefault(lfn, LumiList())
                    maskedBlocks[block][lfn] += commonMask

        return maskedBlocks

    def modifyPolicyForWorkAddition(self, inboxElement):
        """Set modifiers to the policy based on the inboxElement information so that after a splitting pass
        with this policy strictly new work is returned, the inbox element must have information
        about already existing work"""
        raise NotImplementedError(
            "This can't be called on a base StartPolicyInterface object")

    def newDataAvailable(self, task, inbound):
        """
            Returns True if there is data in the future could be included as an element
            for the inbound parent. However it doesn't guarantee that the new data
            will be included if the inbound element is split (i.e. the new data could be open blocks for the Block policy).
        """
        raise NotImplementedError(
            "This can't be called on a base StartPolicyInterface object")

    def getDatasetLocations(self, datasets):
        """
        Returns a dictionary with the location of the datasets according to Rucio
        The definition of "location" here is a union of all sites holding at least
        part of the dataset (defined by the DATASET grouping).
        :param datasets: dictionary with a list of dataset names (keyed by the DBS URL)
        :return: a dictionary of dataset locations, keyed by the dataset name
        """
        result = {}
        for dbsUrl in datasets:
            for datasetPath in datasets[dbsUrl]:
                if hasattr(self, "rucio"):
                    locations = self.rucio.getDataLockedAndAvailable(
                        name=datasetPath, account=self.args['rucioAcct'])
                else:
                    locations = set()
                    resp = self.phedex.getReplicaPhEDExNodesForBlocks(
                        dataset=[datasetPath], complete='y')
                    for blockSites in resp.values():
                        locations.update(blockSites)
                result[datasetPath] = self.cric.PNNstoPSNs(locations)
        return result

    def blockLocationRucioPhedex(self, blockName):
        """
        Wrapper around the Rucio and PhEDEx systems.
        Fetch the current locations of the given block (when using Rucio,
        locks made on that block are also taken into account).
        :param blockName: string with the block name
        :return: a list of RSEs
        """
        if hasattr(self, "rucio"):
            location = self.rucio.getDataLockedAndAvailable(
                name=blockName, account=self.args['rucioAcct'])
        else:
            location = self.phedex.getReplicaPhEDExNodesForBlocks(
                block=[blockName], complete='y')[blockName]
        return location