def getDBSPublicationInfo(self, outputDatasets):
    """ What has been published
        Get the lumis and number of events in the published output datasets.
    """
    res = {}
    res['outputDatasets'] = {}
    for outputDataset in outputDatasets:
        res['outputDatasets'][outputDataset] = {'lumis': {}, 'numEvents': 0}
        try:
            dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader",
                            cert=self.proxyfilename, key=self.proxyfilename)
            outputDatasetDetails = dbs.listDatasetFileDetails(outputDataset)
        except Exception as ex:
            msg = "Failed to retrieve information from DBS for output dataset %s." % (outputDataset)
            msg += " Exception while contacting DBS: %s" % (str(ex))
            self.logger.exception(msg)
        else:
            outputDatasetLumis = self.compactLumis(outputDatasetDetails)
            outputDatasetLumis = LumiList(runsAndLumis=outputDatasetLumis).getCompactList()
            res['outputDatasets'][outputDataset]['lumis'] = outputDatasetLumis
            for outputFileDetails in outputDatasetDetails.values():
                res['outputDatasets'][outputDataset]['numEvents'] += outputFileDetails['NumberOfEvents']
    return res
def getDBSPublicationInfo(self, outputDatasets):
    """ What has been published
        Get the lumis and number of events in the published output datasets.
    """
    res = {}
    res['outputDatasets'] = {}
    for outputDataset in outputDatasets:
        res['outputDatasets'][outputDataset] = {'lumis': {}, 'numEvents': 0}
        try:
            # We can only publish here with DBS3
            dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader",
                            cfg_dict={"cert": self.proxyfilename,
                                      "key": self.proxyfilename,
                                      "logger": self.logger,
                                      "pycurl": True})
            outputDatasetDetails = dbs.listDatasetFileDetails(outputDataset)
        except Exception as ex:
            msg = "Failed to retrieve information from DBS for output dataset %s." % (outputDataset)
            msg += " Exception while contacting DBS: %s" % (str(ex))
            self.logger.exception(msg)
        else:
            outputDatasetLumis = self.compactLumis(outputDatasetDetails)
            outputDatasetLumis = LumiList(runsAndLumis=outputDatasetLumis).getCompactList()
            res['outputDatasets'][outputDataset]['lumis'] = outputDatasetLumis
            for outputFileDetails in outputDatasetDetails.values():
                res['outputDatasets'][outputDataset]['numEvents'] += outputFileDetails['NumberOfEvents']
    return res
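# Usage sketch for getDBSPublicationInfo() above. This is illustrative only:
# `client` stands for whatever CRAB client object carries the method (an
# assumption), and the caller is expected to pass phys03 USER dataset names.
def summarizePublication(client, outputDatasets):
    """Print a one-line summary per published output dataset (hypothetical helper)."""
    info = client.getDBSPublicationInfo(outputDatasets)
    for name, details in info['outputDatasets'].items():
        print("%s: %d events, %d runs with published lumis"
              % (name, details['numEvents'], len(details['lumis'])))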
class DBSReaderTest(unittest.TestCase):
    def setUp(self):
        """
        _setUp_

        Initialize the PhEDEx API to point at the test server.
        """
        #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
        self.dbs = None
        return

    @attr("integration")
    def testListPrimaryDatasets(self):
        """
        listPrimaryDatasets returns known primary datasets
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listPrimaryDatasets('Jet*')
        self.assertTrue('Jet' in results)
        self.assertTrue('JetMET' in results)
        self.assertTrue('JetMETTau' in results)
        self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist'))
        return

    @attr("integration")
    def testMatchProcessedDatasets(self):
        """
        matchProcessedDatasets returns known processed datasets
        """
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1')
        self.assertEqual(1, len(dataset))
        self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList'])
        self.assertEqual('Run2011A-v1', dataset[0]['Name'])
        self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666'))

    @attr("integration")
    def testlistRuns(self):
        """listRuns returns known runs"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRuns(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(174074 in runs)
        runs = self.dbs.listRuns(dataset=DATASET, block=BLOCK)
        self.assertEqual([173657], runs)

    @attr("integration")
    def testlistRunLumis(self):
        """listRunLumis returns known runs and lumicounts"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRunLumis(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(173692 in runs)
        self.assertEqual(runs[173692], 2782)
        # use listRunLumis (not listRuns): the assertion below expects a {run: lumi count} mapping
        runs = self.dbs.listRunLumis(dataset=DATASET, block=BLOCK)
        self.assertEqual({173657: 94}, runs)

    @attr("integration")
    def testListProcessedDatasets(self):
        """listProcessedDatasets returns known processed datasets"""
        self.dbs = DBSReader(self.endpoint)
        datasets = self.dbs.listProcessedDatasets('Jet', 'RAW')
        self.assertTrue('Run2011A-v1' in datasets)
        self.assertTrue('Run2011B-v1' in datasets)
        self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah'))
        self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW'))

    @attr("integration")
    def testlistDatasetFiles(self):
        """listDatasetFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listDatasetFiles(DATASET)
        self.assertEqual(49, len(files))
        self.assertTrue(FILE in files)

    @attr("integration")
    def testlistDatasetFileDetails(self):
        """testlistDatasetFileDetails returns lumis, events, and parents of a dataset"""
        TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root'
        self.dbs = DBSReader(self.endpoint)
        details = self.dbs.listDatasetFileDetails(DATASET)
        self.assertEqual(len(details), 49)
        self.assertTrue(TESTFILE in details)
        self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545)
        self.assertEqual(details[TESTFILE]['Size'], 286021145)
        self.assertEqual(details[TESTFILE]['BlockName'],
                         '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace')
        self.assertEqual(details[TESTFILE]['Checksums'],
                         {'Checksum': '22218315', 'Adler32': 'a41a1446', 'Md5': 'NOTSET'})
        self.assertTrue(173658 in details[TESTFILE]['Lumis'])
        self.assertEqual(sorted(details[TESTFILE]['Lumis'][173658]),
                         sorted(map(long,
                                    [8, 12, 9, 14, 10, 6, 2, 1, 4, 3, 36, 49, 16, 11, 27, 35, 46, 39, 20, 24,
                                     52, 23, 40, 42, 45, 21, 32, 37, 25, 22, 5, 33, 17, 15, 26, 50, 18, 29, 51, 44,
                                     69, 43, 30, 73, 19, 41, 13, 38, 7, 31, 75, 48, 59, 65, 55, 57, 34, 28, 74, 47,
                                     64, 61, 68, 77, 66, 71, 60, 76, 70, 67, 62, 78, 82, 79, 88, 56, 101, 92, 58, 72,
                                     54, 63, 96, 53, 84, 95, 89, 85, 99, 81, 91, 102, 80, 100, 107, 94, 93, 90, 86, 87,
                                     83, 97, 104, 110, 111, 106, 108, 98, 103, 109, 105])))

    @attr("integration")
    def testGetDBSSummaryInfo(self):
        """getDBSSummaryInfo returns summary of dataset and block"""
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.getDBSSummaryInfo(DATASET)
        self.assertEqual(dataset['path'], DATASET)
        self.assertEqual(dataset['block'], '')
        self.assertEqual(dataset['NumberOfEvents'], '22075')
        self.assertEqual(dataset['NumberOfBlocks'], '46')
        self.assertEqual(dataset['total_size'], '4001680824')
        self.assertEqual(dataset['NumberOfFiles'], '49')
        self.assertEqual(dataset['NumberOfLumis'], '7223')

        block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
        self.assertEqual(block['path'], '')
        self.assertEqual(block['block'], BLOCK)
        self.assertEqual(block['NumberOfEvents'], '377')
        self.assertEqual(block['NumberOfBlocks'], '1')
        self.assertEqual(block['total_size'], '150780132')
        self.assertEqual(block['NumberOfFiles'], '2')
        self.assertEqual(block['NumberOfLumis'], '94')

        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlocksInfo(self):
        """getFileBlocksInfo returns block info, including location lookup"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.getFileBlocksInfo(DATASET)
        block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK)
        self.assertEqual(1, len(block))
        block = block[0]
        self.assertEqual(46, len(blocks))
        self.assertTrue(block['Name'] in [x['Name'] for x in blocks])
        self.assertEqual(BLOCK, block['Name'])
        #self.assertEqual(377, block['NumberOfEvents'])
        self.assertEqual(150780132, block['BlockSize'])
        self.assertEqual(2, block['NumberOfFiles'])
        # possibly fragile but assume block located at least at cern
        sites = [x['Name'] for x in block['StorageElementList'] if x['Name'].find('cern.ch') > -1]
        self.assertTrue(sites)
        # weird error handling - depends on whether block or dataset is missing
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah')
        self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK + 'asas'))

    @attr("integration")
    def testListFileBlocks(self):
        """listFileBlocks returns block names in dataset"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.listFileBlocks(DATASET)
        # block is closed
        block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0]
        self.assertEqual(block, BLOCK)
        self.assertTrue(BLOCK in block)

    @attr("integration")
    def testListOpenFileBlocks(self):
        """listOpenFileBlocks finds open blocks"""
        # hard to find a dataset with open blocks, so don't bother
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.listOpenFileBlocks(DATASET))

    @attr("integration")
    def testBlockExists(self):
        """blockExists returns existence of blocks"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(self.dbs.blockExists(BLOCK))
        self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse'))

    @attr("integration")
    def testListFilesInBlock(self):
        """listFilesInBlock returns files in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah')

    @attr("integration")
    def testListFilesInBlockWithParents(self):
        """listFilesInBlockWithParents gets files with parents for a block"""
        # hope PromptReco doesn't get deleted
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listFilesInBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(files))
        self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60',
                         files[0]['Block']['Name'])
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         files[0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testLfnsInBlock(self):
        """lfnsInBlock returns lfns in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK))
        self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas')

    @attr("integration")
    def testListFileBlockLocation(self):
        """listFileBlockLocation returns block location"""
        WRONG_BLOCK = BLOCK[:-4] + 'abcd'
        BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace'
        DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\
                    'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e'
        DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\
                     'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab'
        self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/')
        # assume one site is cern
        sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('cern.ch') > -1]
        self.assertTrue(sites)
        #This block is only found on DBS
        self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK))
        # doesn't raise on non-existent block
        self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK))
        #test bulk call:
        ## two blocks in phedex
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2])))
        ## one block in phedex one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK])))
        ## one in phedex one in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK])))
        ## two in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2])))
        ## one in DBS and one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK])))

    @attr("integration")
    def testGetFileBlock(self):
        """getFileBlock returns block"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlock(BLOCK)
        self.assertEqual(len(block), 1)
        block = block[BLOCK]
        self.assertEqual(2, len(block['Files']))
        self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlockWithParents(self):
        """getFileBlockWithParents returns block and parents"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(len(block), 1)
        block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60']
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         block['Files'][0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testGetFiles(self):
        """getFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.getFiles(DATASET)
        self.assertEqual(len(files), 46)

    @attr("integration")
    def testListBlockParents(self):
        """listBlockParents returns block parents"""
        self.dbs = DBSReader(self.endpoint)
        parents = self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(parents))
        self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60', parents[0]['Name'])
        sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1]
        self.assertTrue(sites)
        self.assertFalse(self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl'))

    @attr("integration")
    def testBlockIsOpen(self):
        """blockIsOpen checks if a block is open"""
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.blockIsOpen(BLOCK))

    @attr("integration")
    def testBlockToDatasetPath(self):
        """blockToDatasetPath extracts path from block name"""
        self.dbs = DBSReader(self.endpoint)
        self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
        self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
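# The tests above are tagged with nose's @attr("integration") decorator, so a
# plain test run skips them unless the attribute is selected. A minimal sketch
# of selecting them programmatically, assuming the nose package (which provides
# the attrib plugin) is installed; the file name is illustrative.
def runIntegrationTests():
    """Run only the @attr("integration") tests; same as `nosetests -a integration`."""
    import nose
    nose.run(argv=['nosetests', '-a', 'integration', 'DBSReader_t.py'])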
class DBSDataDiscovery(DataDiscovery):
    """Performing the data discovery through CMS DBS service.
    """

    def checkDatasetStatus(self, dataset, kwargs):
        res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*')
        if not res:
            raise TaskWorkerException("Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance))
        if len(res) > 1:
            raise TaskWorkerException("Found more than one dataset while checking in DBS the status of %s" % dataset)
        res = res[0]
        #import pprint
        #self.logger.info("Input dataset details: %s", pprint.pformat(res))
        accessType = res['dataset_access_type']
        if accessType != 'VALID':
            # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739
            msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated."
            if kwargs['task']['tm_nonvalid_input_dataset'] != 'T':
                msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % (dataset, accessType)
                if accessType == 'DEPRECATED':
                    msg += " (%s)" % (msgForDeprecDS)
                msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration."
                msg += " Notice that this will not force CRAB to run over all files in the dataset;"
                msg += " CRAB will still check if there are any valid files in the dataset and run only over those files."
                raise TaskWorkerException(msg)
            msg = "The input dataset %s is not 'VALID' but '%s'." % (dataset, accessType)
            msg += " CRAB will check if there are any valid files in the dataset and run only over those files."
            if accessType == 'DEPRECATED':
                msg += " %s" % (msgForDeprecDS)
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        return

    def keepOnlyDisks(self, locationsMap):
        phedex = PhEDEx()  # TODO use certs from the config!
        # get all the PNNs that are of kind 'Disk'
        try:
            diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
        except HTTPException as ex:
            self.logger.error(ex.headers)
            raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n"+\
                                "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO add the PhEDEx nodes to the message so the user can check themselves
        diskLocationsMap = {}
        for block, locations in locationsMap.iteritems():
            locations[:] = [x for x in locations if x != 'T3_CH_CERN_OpenData']  # ignore OpenData until it is accessible by CRAB
            if set(locations) & diskLocations:
                # at least some locations are disk
                diskLocationsMap[block] = locationsMap[block]
            else:
                # no locations are in the disk list, assume that they are tape
                self.tapeLocations = self.tapeLocations.union(set(locations) - diskLocations)
        locationsMap.clear()  # remove all blocks
        locationsMap.update(diskLocationsMap)  # add only blocks with disk locations

    def checkBlocksSize(self, blocks):
        """ Make sure no single block has more than 100k lumis. See
            https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html
        """
        MAX_LUMIS = 100000
        for block in blocks:
            blockInfo = self.dbs.getDBSSummaryInfo(block=block)
            if blockInfo.get('NumberOfLumis', 0) > MAX_LUMIS:
                msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % (block, MAX_LUMIS)
                msg += "\nCRAB can only split this by ignoring lumi information. You can do this"
                msg += "\nusing the FileBased split algorithm and avoiding any additional request"
                msg += "\nwhich may cause lumi information to be looked up. See CRAB FAQ for more info:"
                msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ"
                raise TaskWorkerException(msg)

    def execute(self, *args, **kwargs):
        """
        This is a convenience wrapper around the executeInternal function
        """
        # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules
        # so use a context manager to set an ad hoc env and restore as soon as
        # executeInternal is over, even if it raises exception
        with self.config.TaskWorker.envForCMSWEB:
            result = self.executeInternal(*args, **kwargs)
        return result

    def executeInternal(self, *args, **kwargs):
        self.logger.info("Data discovery with DBS")  ## to be changed into debug

        dbsurl = self.config.Services.DBSUrl
        if kwargs['task']['tm_dbs_url']:
            dbsurl = kwargs['task']['tm_dbs_url']
        self.dbs = DBSReader(dbsurl)
        self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]
        isUserDataset = self.dbsInstance.split('/')[1] != 'global'
        # where to look up locations in the pre-Rucio world
        PhEDExOrDBS = 'PhEDEx' if not isUserDataset else 'DBS origin site'

        taskName = kwargs['task']['tm_taskname']
        userProxy = kwargs['task']['user_proxy']
        self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)

        inputDataset = kwargs['task']['tm_input_dataset']
        secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)

        self.checkDatasetStatus(inputDataset, kwargs)
        if secondaryDataset:
            self.checkDatasetStatus(secondaryDataset, kwargs)

        try:
            # Get the list of blocks for the locations.
            # The WMCore DBS3 implementation makes one call to DBS for each block
            # when using locations=True, so we use locations=False and look up locations later
            blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
            if secondaryDataset:
                secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
        except DBSReaderError as dbsexc:
            # dataset not found in DBS is a known use case
            if str(dbsexc).find('No matching data') != -1:
                raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
            raise

        ## Create a map for block's locations: for each block get the list of locations.
        ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
        ## locations are found it gets the original locations from DBS. So it should
        ## never be the case at this point that some blocks have no locations.
        ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
        ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
        ##  '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}

        # For now apply Rucio data location only to NANOAOD*
        # in time useRucioForLocations may become a richer expression
        isNano = blocks[0].split("#")[0].split("/")[-1] in ["NANOAOD", "NANOAODSIM"]
        if isNano:
            self.logger.info("NANOAOD* dataset. Will use Rucio for data location")
        useRucioForLocations = isNano
        locationsFoundWithRucio = False

        if not useRucioForLocations:
            self.logger.info("Will not use Rucio for this dataset")
        # if locations should be in Rucio, try it first and fall back to old ways if Rucio calls fail
        # or if they return no locations (possible Rucio teething pain). If Rucio returns a list, trust it.
        if useRucioForLocations:
            locationsMap = {}
            scope = "cms"
            # If the dataset is a USER one, use the Rucio user scope to find it
            # TODO: we need a way to enable users to indicate other users' scopes as source
            if isUserDataset:
                scope = "user.%s" % kwargs['task']['tm_username']
            rucio_config_dict = {
                "phedexCompatible": True,
                "auth_type": "x509",
                "ca_cert": self.config.Services.Rucio_caPath,
                "logger": self.logger,
                "creds": {"client_cert": self.config.TaskWorker.cmscert,
                          "client_key": self.config.TaskWorker.cmskey}
            }
            try:
                self.logger.info("Initializing Rucio client")
                # WMCore is awfully verbose
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    rucioClient = Rucio(self.config.Services.Rucio_account,
                                        hostUrl=self.config.Services.Rucio_host,
                                        authUrl=self.config.Services.Rucio_authUrl,
                                        configDict=rucio_config_dict)
                rucioClient.whoAmI()
                self.logger.info("Looking up data location with Rucio in %s scope.", scope)
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(blocks))
            except Exception as exc:
                msg = "Rucio lookup failed with\n%s" % str(exc)
                # TODO when removing fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                locations = None

            # TODO when removing fall-back to PhEDEx, above code will raise if it fails, therefore
            # the following "if" must be removed and the code shifted left
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    if element['replica']:  # only fill map for blocks which have at least one location
                        locationsMap.update({element['name']: [x['node'] for x in element['replica']]})
                if locationsMap:
                    locationsFoundWithRucio = True
                else:
                    msg = "No locations found with Rucio for this dataset"
                    # since NANO* are not in PhEDEx, this should be a fatal error
                    if isNano:
                        raise TaskWorkerException(msg)
                    else:
                        # note it down and try with PhEDEx
                        self.logger.warn(msg)

        if not locationsFoundWithRucio:
            # fall back to pre-Rucio methods
            try:
                self.logger.info("Looking up data locations using %s", PhEDExOrDBS)
                locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=isUserDataset)
            except Exception as ex:
                raise TaskWorkerException(
                    "The CRAB3 server backend could not get the location of the files from dbs nor phedex nor rucio.\n"+\
                    "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)"+\
                    " and contact the experts if the error persists.\nError reason: %s" % str(ex))
        # only keep blocks which have at least one location
        locationsMap = {key: value for key, value in locationsMap.iteritems() if value}

        if secondaryDataset:
            secondaryLocationsMap = {}
            # see https://github.com/dmwm/CRABServer/issues/6075#issuecomment-641569446
            self.logger.info("Trying data location of secondary blocks with Rucio")
            try:
                locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(secondaryBlocks))
            except Exception as exc:
                locations = None
                secondaryLocationsMap = {}
                self.logger.warn("Rucio lookup failed with: %s", exc)
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    if element['replica']:  # only fill map for blocks which have at least one location
                        secondaryLocationsMap.update({element['name']: [x['node'] for x in element['replica']]})
            if not secondaryLocationsMap:
                msg = "No locations found with Rucio for secondaryDataset."
                # TODO when removing fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                self.logger.info("Trying data location of secondary blocks with PhEDEx")
                try:
                    secondaryLocationsMap = self.dbs.listFileBlockLocation(list(secondaryBlocks), dbsOnly=isUserDataset)
                except Exception as ex:
                    raise TaskWorkerException(
                        "The CRAB3 server backend could not get the location of the secondary dataset files from dbs or phedex or rucio.\n" + \
                        "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" + \
                        " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # only keep blocks which have at least one location
            secondaryLocationsMap = {key: value for key, value in secondaryLocationsMap.iteritems() if value}

        # From now on the code does not depend on whether Rucio or PhEDEx was used
        blocksWithLocation = locationsMap.keys()
        if secondaryDataset:
            secondaryBlocksWithLocation = secondaryLocationsMap.keys()

        self.keepOnlyDisks(locationsMap)
        if not locationsMap:
            msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
            if self.tapeLocations:
                msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.tapeLocations))
                # submit request to DDM
                ddmRequest = None
                ddmServer = self.config.TaskWorker.DDMServer
                try:
                    ddmRequest = blocksRequest(blocksWithLocation, ddmServer, self.config.TaskWorker.cmscert,
                                               self.config.TaskWorker.cmskey, verbose=False)
                except HTTPException as hte:
                    self.logger.exception(hte)
                    msg += "\nThe automatic stage-out failed, please try again later. If the error persists contact the experts and provide this error message:"
                    msg += "\nHTTP Error while contacting the DDM server %s:\n%s" % (ddmServer, str(hte))
                    msg += "\nHTTP Headers are: %s" % hte.headers
                    msg += "\nYou might want to contact your physics group if you need a disk replica."
                    raise TaskWorkerException(msg, retry=True)

                self.logger.info("Contacted %s using %s and %s, got:\n%s", self.config.TaskWorker.DDMServer,
                                 self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, ddmRequest)
                # The query above returns a JSON with a format
                # {"result": "OK", "message": "Copy requested",
                #  "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps",
                #            "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37",
                #            "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
                if ddmRequest["result"] == "OK":
                    # set status to TAPERECALL
                    tapeRecallStatus = 'TAPERECALL'
                    ddmReqId = ddmRequest["data"][0]["request_id"]
                    configreq = {'workflow': taskName,
                                 'taskstatus': tapeRecallStatus,
                                 'ddmreqid': ddmReqId,
                                 'subresource': 'addddmreqid',
                                }
                    try:
                        tapeRecallStatusSet = self.server.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
                    except HTTPException as hte:
                        self.logger.exception(hte)
                        msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (self.config.TaskWorker.restHost, str(hte))
                        msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (tapeRecallStatus, ddmReqId, taskName)
                        msg += "\nHTTP Headers are: %s" % hte.headers
                        raise TaskWorkerException(msg, retry=True)

                    msg += "\nA disk replica has been requested on %s to CMS DDM (request ID: %d)" % (ddmRequest["data"][0]["first_request"], ddmReqId)
                    if tapeRecallStatusSet[2] == "OK":
                        self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus)
                        msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
                        self.uploadWarning(msg, userProxy, taskName)
                        raise TapeDatasetException(msg)
                    else:
                        msg += ", please try again in two days."
                else:
                    msg += "\nThe disk replica request failed with this error:\n %s" % ddmRequest["message"]
            msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
            raise TaskWorkerException(msg)

        # will not need lumi info if user has asked for split by file with no run/lumi mask
        splitAlgo = kwargs['task']['tm_split_algo']
        lumiMask = kwargs['task']['tm_split_args']['lumis']
        runRange = kwargs['task']['tm_split_args']['runs']
        needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
        # secondary dataset access relies on run/lumi info
        if secondaryDataset:
            needLumiInfo = True
        if needLumiInfo:
            # Interested only in blocks with locations; 'blocks' may contain invalid ones and trigger an Exception
            self.checkBlocksSize(blocksWithLocation)
            if secondaryDataset:
                self.checkBlocksSize(secondaryBlocksWithLocation)
        try:
            filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True,
                                                          getLumis=needLumiInfo, validFileOnly=0)
            if secondaryDataset:
                moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False,
                                                              getLumis=needLumiInfo, validFileOnly=0)
                for secfilename, secinfos in moredetails.items():
                    secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])

                self.logger.info("Beginning to match files from secondary dataset")
                for dummyFilename, infos in filedetails.items():
                    infos['Parents'] = []
                    lumis = LumiList(runsAndLumis=infos['Lumis'])
                    for secfilename, secinfos in moredetails.items():
                        if lumis & secinfos['lumiobj']:
                            infos['Parents'].append(secfilename)
                self.logger.info("Done matching files from secondary dataset")
                kwargs['task']['tm_use_parent'] = 1
        except Exception as ex:  # TODO should we catch HttpException instead?
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\
                                "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO add the PhEDEx nodes to the message so the user can check themselves
        if not filedetails:
            raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +\
                                       "Aborting submission. Resubmitting your task will not help.") %\
                                       ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %\
                                       (self.dbsInstance, inputDataset))

        ## Format the output creating the data structures required by WMCore. Filters out invalid files,
        ## files whose block has no location, and figures out the PSN
        result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                                   datasetfiles=filedetails, locations=locationsMap,
                                   tempDir=kwargs['tempDir'])

        if not result.result:
            raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" +
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        self.logger.debug("Got %s files", len(result.result.getFiles()))
        return result
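# The secondary-dataset matching in executeInternal() above pairs each primary
# file with every secondary file whose lumi sections overlap. A standalone
# sketch of that technique, assuming WMCore's LumiList is importable; the file
# names and run/lumi contents below are invented for illustration.
def matchSecondaryFilesExample():
    from WMCore.DataStructs.LumiList import LumiList
    secondary = {'sec_file_1.root': LumiList(runsAndLumis={1: [1, 2, 3]}),
                 'sec_file_2.root': LumiList(runsAndLumis={1: [10, 11]})}
    primary = LumiList(runsAndLumis={1: [2, 5]})
    # a primary file's parents are the secondary files with a non-empty lumi overlap
    parents = [name for name, lumis in secondary.items() if primary & lumis]
    print(parents)  # ['sec_file_1.root']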
class DBSReaderTest(unittest.TestCase):
    def setUp(self):
        """
        _setUp_

        Initialize the PhEDEx API to point at the test server.
        """
        #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
        self.dbs = None
        return

    @attr("integration")
    def testListDatatiers(self):
        """
        listDatatiers returns all datatiers available
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listDatatiers()
        self.assertTrue('RAW' in results)
        self.assertTrue('GEN-SIM-RECO' in results)
        self.assertTrue('GEN-SIM' in results)
        self.assertFalse('RAW-ALAN' in results)
        return

    @attr("integration")
    def testListPrimaryDatasets(self):
        """
        listPrimaryDatasets returns known primary datasets
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listPrimaryDatasets('Jet*')
        self.assertTrue('Jet' in results)
        self.assertTrue('JetMET' in results)
        self.assertTrue('JetMETTau' in results)
        self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist'))
        return

    @attr("integration")
    def testMatchProcessedDatasets(self):
        """
        matchProcessedDatasets returns known processed datasets
        """
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1')
        self.assertEqual(1, len(dataset))
        self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList'])
        self.assertEqual('Run2011A-v1', dataset[0]['Name'])
        self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666'))

    def testlistRuns(self):
        """listRuns returns known runs"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRuns(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(174074 in runs)
        runs = self.dbs.listRuns(block=BLOCK)
        self.assertEqual(1, len(runs))
        self.assertEqual([173657], runs)

    def testlistRunLumis(self):
        """listRunLumis returns known runs and lumicounts (None for DBS3)"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRunLumis(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(173692 in runs)
        self.assertEqual(runs[173692], None)
        runs = self.dbs.listRunLumis(block=BLOCK)
        self.assertEqual(1, len(runs))
        self.assertTrue(173657 in runs)
        self.assertEqual(runs[173657], None)

    @attr("integration")
    def testListProcessedDatasets(self):
        """listProcessedDatasets returns known processed datasets"""
        self.dbs = DBSReader(self.endpoint)
        datasets = self.dbs.listProcessedDatasets('Jet', 'RAW')
        self.assertTrue('Run2011A-v1' in datasets)
        self.assertTrue('Run2011B-v1' in datasets)
        self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah'))
        self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW'))

    def testlistDatasetFiles(self):
        """listDatasetFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listDatasetFiles(DATASET)
        self.assertEqual(49, len(files))
        self.assertTrue(FILE in files)

    def testlistDatasetFileDetails(self):
        """testlistDatasetFileDetails returns lumis, events, and parents of a dataset"""
        TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root'
        self.dbs = DBSReader(self.endpoint)
        details = self.dbs.listDatasetFileDetails(DATASET)
        self.assertEqual(len(details), 49)
        self.assertTrue(TESTFILE in details)
        self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545)
        self.assertEqual(details[TESTFILE]['file_size'], 286021145)
        self.assertEqual(details[TESTFILE]['BlockName'],
                         '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace')
        self.assertEqual(details[TESTFILE]['Md5'], 'NOTSET')
        self.assertEqual(details[TESTFILE]['md5'], 'NOTSET')
        self.assertEqual(details[TESTFILE]['Adler32'], 'a41a1446')
        self.assertEqual(details[TESTFILE]['adler32'], 'a41a1446')
        self.assertEqual(details[TESTFILE]['Checksum'], '22218315')
        self.assertEqual(details[TESTFILE]['check_sum'], '22218315')
        self.assertTrue(173658 in details[TESTFILE]['Lumis'])
        self.assertEqual(sorted(details[TESTFILE]['Lumis'][173658]),
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                          21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
                          41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
                          61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
                          81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
                          101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111])

    def testGetDBSSummaryInfo(self):
        """getDBSSummaryInfo returns summary of dataset and block"""
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.getDBSSummaryInfo(DATASET)
        self.assertEqual(dataset['path'], DATASET)
        self.assertEqual(dataset['block'], '')
        self.assertEqual(dataset['NumberOfEvents'], 22075)
        self.assertEqual(dataset['NumberOfBlocks'], 46)
        self.assertEqual(dataset['FileSize'], 4001680824)
        self.assertEqual(dataset['file_size'], 4001680824)
        self.assertEqual(dataset['NumberOfFiles'], 49)
        self.assertEqual(dataset['NumberOfLumis'], 7223)

        block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
        self.assertEqual(block['path'], '')
        self.assertEqual(block['block'], BLOCK)
        self.assertEqual(block['NumberOfEvents'], 377)
        self.assertEqual(block['NumberOfBlocks'], 1)
        self.assertEqual(block['FileSize'], 150780132)
        self.assertEqual(block['file_size'], 150780132)
        self.assertEqual(block['NumberOfFiles'], 2)
        self.assertEqual(block['NumberOfLumis'], 94)

        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlocksInfo(self):
        """getFileBlocksInfo returns block info, including location lookup"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.getFileBlocksInfo(DATASET)
        block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK)
        self.assertEqual(1, len(block))
        block = block[0]
        self.assertEqual(46, len(blocks))
        self.assertTrue(block['Name'] in [x['Name'] for x in blocks])
        self.assertEqual(BLOCK, block['Name'])
        self.assertEqual(0, block['OpenForWriting'])
        self.assertEqual(150780132, block['BlockSize'])
        self.assertEqual(2, block['NumberOfFiles'])
        # possibly fragile but assume block located at least at cern
        sites = [x['Name'] for x in block['PhEDExNodeList'] if x['Name'].find('CH_CERN') > -1]
        self.assertTrue(sites)
        # weird error handling - depends on whether block or dataset is missing
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET, blockName=BLOCK + 'asas')

    def testListFileBlocks(self):
        """listFileBlocks returns block names in dataset"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.listFileBlocks(DATASET)
        self.assertTrue(BLOCK in blocks)
        # block is closed
        block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0]
        self.assertEqual(block, BLOCK)
        self.assertTrue(BLOCK in block)

    def testListOpenFileBlocks(self):
        """listOpenFileBlocks finds open blocks"""
        # hard to find a dataset with open blocks, so don't bother
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.listOpenFileBlocks(DATASET))

    def testBlockExists(self):
        """blockExists returns existence of blocks"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(self.dbs.blockExists(BLOCK))
        self.assertRaises(DBSReaderError, self.dbs.blockExists, DATASET + '#somethingelse')

    def testListFilesInBlock(self):
        """listFilesInBlock returns files in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah')

    def testListFilesInBlockWithParents(self):
        """listFilesInBlockWithParents gets files with parents for a block"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listFilesInBlockWithParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(4, len(files))
        self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0',
                         files[0]['block_name'])
        self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0',
                         files[0]['BlockName'])
        self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root',
                         files[0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas')

    def testLfnsInBlock(self):
        """lfnsInBlock returns lfns in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in [x['logical_file_name'] for x in self.dbs.lfnsInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas')

    @attr("integration")
    def testListFileBlockLocation(self):
        """listFileBlockLocation returns block location"""
        WRONG_BLOCK = BLOCK[:-4] + 'abcd'
        BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace'
        DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\
                    'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e'
        DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\
                     'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab'
        self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/')
        # assume one site is cern
        sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('CH_CERN') > -1]
        self.assertTrue(sites)
        #This block is only found on DBS
        self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK))
        # doesn't raise on non-existent block
        self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK))
        #test bulk call:
        ## two blocks in phedex
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2])))
        ## one block in phedex one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK])))
        ## one in phedex one in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK])))
        ## two in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2])))
        ## one in DBS and one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK])))

    def testGetFileBlock(self):
        """getFileBlock returns block"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlock(BLOCK)
        self.assertEqual(len(block), 1)
        block = block[BLOCK]
        self.assertEqual(2, len(block['Files']))
        self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas')

    def testGetFileBlockWithParents(self):
        """getFileBlockWithParents returns block and parents"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlockWithParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(len(block), 1)
        block = block['/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0']
        self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root',
                         block['Files'][0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas')

    def testGetFiles(self):
        """getFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.getFiles(DATASET)
        self.assertEqual(len(files), 46)

    def testListBlockParents(self):
        """listBlockParents returns block parents"""
        self.dbs = DBSReader(self.endpoint)
        parents = self.dbs.listBlockParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(1, len(parents))
        self.assertEqual('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0',
                         parents[0]['Name'])
        sites = [x for x in parents[0]['PhEDExNodeList'] if x.find("CH_CERN") > -1]
        self.assertTrue(sites)
        self.assertFalse(self.dbs.listBlockParents(
            '/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0'))

    def testBlockIsOpen(self):
        """blockIsOpen checks if a block is open"""
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.blockIsOpen(BLOCK))

    def testBlockToDatasetPath(self):
        """blockToDatasetPath extracts path from block name"""
        self.dbs = DBSReader(self.endpoint)
        self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
        self.assertRaises(DBSReaderError, self.dbs.blockToDatasetPath, BLOCK + 'asas')
class DBSDataDiscovery(DataDiscovery):
    """Performing the data discovery through CMS DBS service.
    """

    def checkDatasetStatus(self, dataset, kwargs):
        res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*')
        if len(res) > 1:
            raise TaskWorkerException("Found more than one dataset while checking in DBS the status of %s" % dataset)
        if len(res) == 0:
            raise TaskWorkerException("Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance))
        res = res[0]
        self.logger.info("Input dataset details: %s", pprint.pformat(res))
        accessType = res['dataset_access_type']
        if accessType != 'VALID':
            # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739
            msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated."
            if kwargs['task']['tm_nonvalid_input_dataset'] != 'T':
                msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % (dataset, accessType)
                if accessType == 'DEPRECATED':
                    msg += " (%s)" % (msgForDeprecDS)
                msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration."
                msg += " Notice that this will not force CRAB to run over all files in the dataset;"
                msg += " CRAB will still check if there are any valid files in the dataset and run only over those files."
                raise TaskWorkerException(msg)
            msg = "The input dataset %s is not 'VALID' but '%s'." % (dataset, accessType)
            msg += " CRAB will check if there are any valid files in the dataset and run only over those files."
            if accessType == 'DEPRECATED':
                msg += " %s" % (msgForDeprecDS)
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        return

    def keepOnlyDisks(self, locationsMap):
        self.otherLocations = set()
        phedex = PhEDEx()  # TODO use certs from the config!
        # get all the PNNs that are of kind 'Disk'
        try:
            diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk'])
        except HTTPException as ex:
            self.logger.error(ex.headers)
            raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n"+\
                                "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO add the PhEDEx nodes to the message so the user can check themselves
        for block, locations in locationsMap.iteritems():
            locationsMap[block] = set(locations) & diskLocations
            self.otherLocations = self.otherLocations.union(set(locations) - diskLocations)
        # remove any key whose value is an empty set
        for key, value in locationsMap.items():  # won't work in python3!
            if value == set([]):
                locationsMap.pop(key)

    def checkBlocksSize(self, blocks):
        """ Make sure no single block has more than 100k lumis. See
            https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html
        """
        MAX_LUMIS = 100000
        for block in blocks:
            blockInfo = self.dbs.getDBSSummaryInfo(block=block)
            if blockInfo['NumberOfLumis'] > MAX_LUMIS:
                msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % (block, MAX_LUMIS)
                msg += "\nCRAB can only split this by ignoring lumi information. You can do this"
                msg += "\nusing the FileBased split algorithm and avoiding any additional request"
                msg += "\nwhich may cause lumi information to be looked up. See CRAB FAQ for more info:"
                msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ"
                raise TaskWorkerException(msg)

    def execute(self, *args, **kwargs):
        """
        This is a convenience wrapper around the executeInternal function
        """
        # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules
        # so use a context manager to set an ad hoc env and restore as soon as
        # executeInternal is over, even if it raises exception
        with self.config.TaskWorker.envForCMSWEB:
            result = self.executeInternal(*args, **kwargs)
        return result

    def executeInternal(self, *args, **kwargs):
        self.logger.info("Data discovery with DBS")  ## to be changed into debug
        dbsurl = self.config.Services.DBSUrl
        if kwargs['task']['tm_dbs_url']:
            dbsurl = kwargs['task']['tm_dbs_url']
        self.dbs = DBSReader(dbsurl)
        self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]

        taskName = kwargs['task']['tm_taskname']
        self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)

        inputDataset = kwargs['task']['tm_input_dataset']
        secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)

        self.checkDatasetStatus(inputDataset, kwargs)
        if secondaryDataset:
            self.checkDatasetStatus(secondaryDataset, kwargs)

        try:
            # Get the list of blocks for the locations and then call dls.
            # The WMCore DBS3 implementation makes one call to dls for each block
            # with locations=True, so we use locations=False and look up locations later
            blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
            if secondaryDataset:
                secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
        except DBSReaderError as dbsexc:
            # dataset not found in DBS is a known use case
            if str(dbsexc).find('No matching data') != -1:
                raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
            raise

        ## Create a map for block's locations: for each block get the list of locations.
        ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
        ## locations are found it gets the original locations from DBS. So it should
        ## never be the case at this point that some blocks have no locations.
        ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
        ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
        ##  '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}
        try:
            dbsOnly = self.dbsInstance.split('/')[1] != 'global'
            locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=dbsOnly)
        except Exception as ex:  # TODO should we catch HttpException instead?
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs or phedex.\n"+\
                                "This could be a temporary phedex/dbs glitch, please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))

        self.keepOnlyDisks(locationsMap)
        if not locationsMap:
            msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
            if self.otherLocations:
                msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.otherLocations))
                # submit request to DDM
                ddmRequest = blocksRequest(blocks, self.config.TaskWorker.DDMServer,
                                           self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, verbose=False)
                self.logger.info("Contacted %s using %s and %s, got:\n%s", self.config.TaskWorker.DDMServer,
                                 self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, ddmRequest)
                # The query above returns a JSON with a format
                # {"result": "OK", "message": "Copy requested",
                #  "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps",
                #            "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37",
                #            "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
                if ddmRequest["result"] == "OK":
                    msg += "\nA disk replica has been requested on %s" % ddmRequest["data"][0]["first_request"]
                    # set status to TAPERECALL
                    tapeRecallStatus = 'TAPERECALL'
                    ddmReqId = ddmRequest["data"][0]["request_id"]
                    server = HTTPRequests(url=self.config.TaskWorker.resturl,
                                          localcert=kwargs['task']['user_proxy'],
                                          localkey=kwargs['task']['user_proxy'],
                                          verbose=False)
                    configreq = {'workflow': taskName,
                                 'taskstatus': tapeRecallStatus,
                                 'ddmreqid': ddmReqId,
                                 'subresource': 'addddmreqid'}
                    try:
                        tapeRecallStatusSet = server.post(self.config.TaskWorker.restURInoAPI + 'task',
                                                          data=urllib.urlencode(configreq))
                    except HTTPException as hte:
                        msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (self.config.TaskWorker.resturl, str(hte))
                        msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (tapeRecallStatus, ddmReqId, taskName)
                        msg += "\nHTTP Headers are: %s" % hte.headers
                        raise TaskWorkerException(msg, retry=True)
                    if tapeRecallStatusSet[2] == "OK":
                        self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus)
                        msg += " and the task will be submitted as soon as it is completed."
                        self.uploadWarning(msg, kwargs['task']['user_proxy'], taskName)
                        raise TapeDatasetException(msg)
                    else:
                        msg += ", please try again in two days."
            msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
            msg += " You might want to contact your physics group if you need a disk replica."
            raise TaskWorkerException(msg)

        if len(blocks) != len(locationsMap):
            self.logger.warning("The locations of some blocks have not been found: %s",
                                set(blocks) - set(locationsMap))

        # will not need lumi info if user has asked for split by file with no run/lumi mask
        splitAlgo = kwargs['task']['tm_split_algo']
        lumiMask = kwargs['task']['tm_split_args']['lumis']
        runRange = kwargs['task']['tm_split_args']['runs']
        needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
        # secondary dataset access relies on run/lumi info
        if secondaryDataset:
            needLumiInfo = True
        if needLumiInfo:
            self.checkBlocksSize(blocks)
            if secondaryDataset:
                self.checkBlocksSize(secondaryBlocks)
        try:
            filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True,
                                                          getLumis=needLumiInfo, validFileOnly=0)
            if secondaryDataset:
                moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False,
                                                              getLumis=needLumiInfo, validFileOnly=0)
                for secfilename, secinfos in moredetails.items():
                    secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])

                self.logger.info("Beginning to match files from secondary dataset")
                for dummyFilename, infos in filedetails.items():
                    infos['Parents'] = []
                    lumis = LumiList(runsAndLumis=infos['Lumis'])
                    for secfilename, secinfos in moredetails.items():
                        if lumis & secinfos['lumiobj']:
                            infos['Parents'].append(secfilename)
                self.logger.info("Done matching files from secondary dataset")
                kwargs['task']['tm_use_parent'] = 1
        except Exception as ex:  # TODO should we catch HttpException instead?
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\
                                "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO add the PhEDEx nodes to the message so the user can check themselves
        if not filedetails:
            raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n"
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        ## Format the output creating the data structures required by WMCore. Filters out invalid files,
        ## files whose block has no location, and figures out the PSN
        result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                                   datasetfiles=filedetails, locations=locationsMap,
                                   tempDir=kwargs['tempDir'])

        if not result.result:
            raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n"
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        self.logger.debug("Got %s files", len(result.result.getFiles()))
        return result
class DBSDataDiscovery(DataDiscovery): """Performing the data discovery through CMS DBS service. """ # disable pylint warning in next line since they refer to a conflict with the main() # at the bottom of this file which is only used for testing def __init__(self, config, crabserver='', procnum=-1, rucioClient=None): # pylint: disable=redefined-outer-name DataDiscovery.__init__(self, config, crabserver, procnum) self.rucioClient = rucioClient def checkDatasetStatus(self, dataset, kwargs): res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*') if not res: raise TaskWorkerException("Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance)) if len(res) > 1: raise TaskWorkerException("Found more than one dataset while checking in DBS the status of %s" % dataset) res = res[0] #import pprint #self.logger.info("Input dataset details: %s", pprint.pformat(res)) accessType = res['dataset_access_type'] if accessType != 'VALID': # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739 msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated." if kwargs['task']['tm_nonvalid_input_dataset'] != 'T': msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % (dataset, accessType) if accessType == 'DEPRECATED': msg += " (%s)" % (msgForDeprecDS) msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration." msg += " Notice that this will not force CRAB to run over all files in the dataset;" msg += " CRAB will still check if there are any valid files in the dataset and run only over those files." raise TaskWorkerException(msg) msg = "The input dataset %s is not 'VALID' but '%s'." % (dataset, accessType) msg += " CRAB will check if there are any valid files in the dataset and run only over those files." if accessType == 'DEPRECATED': msg += " %s" % (msgForDeprecDS) self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname']) return def keepOnlyDiskRSEs(self, locationsMap): # get all the RucioStorageElements (RSEs) which are of kind 'Disk' # locationsMap is a dictionary {block1:[locations], block2:[locations],...} diskLocationsMap = {} for block, locations in locationsMap.iteritems(): # as of Sept 2020, tape RSEs end with _Tape, go for the quick hack diskRSEs = [rse for rse in locations if 'Tape' not in rse] if 'T3_CH_CERN_OpenData' in diskRSEs: diskRSEs.remove('T3_CH_CERN_OpenData') # ignore OpenData until it is accessible by CRAB if diskRSEs: # at least some locations are disk diskLocationsMap[block] = diskRSEs else: # no locations are disk, assume that they are tape # and keep tally of tape-only locations for this dataset self.tapeLocations = self.tapeLocations.union(set(locations) - set(diskRSEs)) locationsMap.clear() # remove all blocks locationsMap.update(diskLocationsMap) # add only blocks with disk locations def checkBlocksSize(self, blocks): """ Make sure no single block has more than 100k lumis. See https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html """ MAX_LUMIS = 100000 for block in blocks: blockInfo = self.dbs.getDBSSummaryInfo(block=block) if blockInfo.get('NumberOfLumis', 0) > MAX_LUMIS: msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % (block, MAX_LUMIS) msg += "\nCRAB can only split this by ignoring lumi information. You can do this" msg += "\nusing FileBased split algorithm and avoiding any additional request" msg += "\nwhich may cause lumi information to be looked up. See CRAB FAQ for more info:" msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ" raise TaskWorkerException(msg)
def requestTapeRecall(self, blockList=[], system='Dynamo', msgHead=''): # pylint: disable=W0102 """ :param blockList: a list of blocks to recall from Tape to Disk :param system: a string identifying the DDM system to use 'Dynamo' or 'Rucio' or 'None' :param msgHead: a string with the initial part of a message to be used for exceptions :return: nothing: Since data on tape means no submission possible, this function will always raise a TaskWorkerException to stop the action flow. The exception message contains details and an attempt is made to upload it to TaskDB so that crab status can report it """ msg = msgHead if system == 'Rucio': # need to use crab_tape_recall Rucio account to create containers and create rules tapeRecallConfig = copy.copy(self.config) tapeRecallConfig.Services.Rucio_account = 'crab_tape_recall' rucioClient = getNativeRucioClient(tapeRecallConfig, self.logger) # pylint: disable=redefined-outer-name # turn input CMS blocks into Rucio dids in cms scope dids = [{'scope': 'cms', 'name': block} for block in blockList] # prepare container /TapeRecall/taskname/USER in the service scope myScope = 'user.crab_tape_recall' containerName = '/TapeRecall/%s/USER' % self.taskName.replace(':', '.') containerDid = {'scope':myScope, 'name':containerName} self.logger.info("Create Rucio container %s", containerName) try: rucioClient.add_container(myScope, containerName) except DataIdentifierAlreadyExists: self.logger.debug("Container name already exists in Rucio. Keep going") except Exception as ex: msg += "Rucio exception creating container: %s" % (str(ex)) raise TaskWorkerException(msg) try: rucioClient.attach_dids(myScope, containerName, dids) except DuplicateContent: self.logger.debug("Some dids are already in this container. Keep going") except Exception as ex: msg += "Rucio exception adding blocks to container: %s" % (str(ex)) raise TaskWorkerException(msg) self.logger.info("Rucio container %s:%s created with %d blocks", myScope, containerName, len(blockList)) # Compute size of recall request sizeToRecall = 0 for block in blockList: replicas = rucioClient.list_dataset_replicas('cms', block) blockBytes = replicas.next()['bytes'] # pick first replica for each block, they had better all have the same size sizeToRecall += blockBytes TBtoRecall = sizeToRecall // 1e12 if TBtoRecall > 0: self.logger.info("Total size of data to recall : %d TBytes", TBtoRecall) else: self.logger.info("Total size of data to recall : %d GBytes", sizeToRecall/1e9) if TBtoRecall > 30.: grouping = 'DATASET' # Rucio DATASET i.e. CMS block !
self.logger.info("Will scatter blocks on multiple sites") else: grouping = 'ALL' self.logger.info("Will place all blocks at a single site") # create rule RSE_EXPRESSION = 'ddm_quota>0&(tier=1|tier=2)&rse_type=DISK' #RSE_EXPRESSION = 'T3_IT_Trieste' # for testing WEIGHT = 'ddm_quota' #WEIGHT = None # for testing LIFETIME = 14 * 24 * 3600 # 14 days ASK_APPROVAL = False #ASK_APPROVAL = True # for testing ACCOUNT = 'crab_tape_recall' copies = 1 try: ruleId = rucioClient.add_replication_rule(dids=[containerDid], copies=copies, rse_expression=RSE_EXPRESSION, grouping=grouping, weight=WEIGHT, lifetime=LIFETIME, account=ACCOUNT, activity='Analysis Input', comment='Staged from tape for %s' % self.username, ask_approval=ASK_APPROVAL, asynchronous=True, ) except DuplicateRule: # handle "A duplicate rule for this account, did, rse_expression, copies already exists" # which should only happen when testing, since container name is unique like task name, anyhow... self.logger.debug("A duplicate rule for this account, did, rse_expression, copies already exists. Use that") # find the existing rule id; list_did_rules returns an iterator of rule dicts, keep just the ids ruleId = [rule['id'] for rule in rucioClient.list_did_rules(myScope, containerName)] except (InsufficientTargetRSEs, InsufficientAccountLimit, FullStorage) as ex: msg = "Not enough global quota to issue a tape recall request. Rucio exception:\n%s" % str(ex) raise TaskWorkerException(msg) except Exception as ex: msg += "Rucio exception creating rule: %s" % str(ex) raise TaskWorkerException(msg) ruleId = str(ruleId[0]) # from list to singleId and remove unicode msg += "\nA disk replica has been requested to Rucio (rule ID: %s )" % ruleId msg += "\nyou can check progress via either of the following two commands:" msg += "\n rucio rule-info %s" % ruleId msg += "\n rucio list-rules %s:%s" % (myScope, containerName) automaticTapeRecallIsImplemented = True if automaticTapeRecallIsImplemented: tapeRecallStatus = 'TAPERECALL' else: tapeRecallStatus = 'SUBMITFAILED' configreq = {'workflow': self.taskName, 'taskstatus': tapeRecallStatus, 'ddmreqid': ruleId, 'subresource': 'addddmreqid', } try: tapeRecallStatusSet = self.crabserver.post(api='task', data=urllib.urlencode(configreq)) except HTTPException as hte: self.logger.exception(hte) msg = "HTTP Error while contacting the REST Interface %s:\n%s" % ( self.config.TaskWorker.restHost, str(hte)) msg += "\nStoring of %s status and ruleId (%s) failed for task %s" % ( tapeRecallStatus, ruleId, self.taskName) msg += "\nHTTP Headers are: %s" % hte.headers raise TaskWorkerException(msg, retry=True) if tapeRecallStatusSet[2] == "OK": self.logger.info("Status for task %s set to '%s'", self.taskName, tapeRecallStatus) if automaticTapeRecallIsImplemented: msg += "\nThis task will be automatically submitted as soon as the stage-out is completed." self.uploadWarning(msg, self.userproxy, self.taskName) raise TapeDatasetException(msg) # fall through here if we could not set up for automatic submission after recall msg += "\nPlease monitor recall progress via Rucio or DAS and try again once data are on disk." raise TaskWorkerException(msg) if system == 'None': msg += '\nIt is not possible to request a recall from tape.' msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
raise TaskWorkerException(msg) if system == 'Dynamo': raise NotImplementedError def execute(self, *args, **kwargs): """ This is a convenience wrapper around the executeInternal function """ # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules # so use a context manager to set an ad hoc env and restore as soon as # executeInternal is over, even if it raises exception with self.config.TaskWorker.envForCMSWEB: result = self.executeInternal(*args, **kwargs) return result def executeInternal(self, *args, **kwargs): self.logger.info("Data discovery with DBS") ## to be changed into debug if kwargs['task']['tm_dbs_url']: dbsurl = kwargs['task']['tm_dbs_url'] else: dbsurl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' # a sensible default if hasattr(self.config.Services, 'DBSHostName'): hostname = dbsurl.split('//')[1].split('/')[0] dbsurl = dbsurl.replace(hostname, self.config.Services.DBSHostName) self.logger.info("will connect to DBS at URL: %s", dbsurl) self.dbs = DBSReader(dbsurl) self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"] self.taskName = kwargs['task']['tm_taskname'] # pylint: disable=W0201 self.username = kwargs['task']['tm_username'] # pylint: disable=W0201 self.userproxy = kwargs['task']['user_proxy'] # pylint: disable=W0201 self.logger.debug("Data discovery through %s for %s", self.dbs, self.taskName) inputDataset = kwargs['task']['tm_input_dataset'] secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None) # the isUserDataset flag is used to look for data location in DBS instead of Rucio isUserDataset = (self.dbsInstance.split('/')[1] != 'global') and \ (inputDataset.split('/')[-1] == 'USER') self.checkDatasetStatus(inputDataset, kwargs) if secondaryDataset: self.checkDatasetStatus(secondaryDataset, kwargs) try: # Get the list of blocks for the locations. blocks = self.dbs.listFileBlocks(inputDataset) if secondaryDataset: secondaryBlocks = self.dbs.listFileBlocks(secondaryDataset) except DBSReaderError as dbsexc: # dataset not found in DBS is a known use case if 'No matching data' in str(dbsexc): raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl)) raise ## Create a map for block's locations: for each block get the list of locations. ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no ## locations are found it gets the original locations from DBS. So it should ## never be the case at this point that some blocks have no locations.
## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example: ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'], ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']} # remove following line when ready to allow user dataset to have locations tracked in Rucio useRucioForLocations = not isUserDataset # uncomment following line to look in Rucio first for any dataset, and fall back to DBS origin for USER ones # useRucioForLocations = True locationsFoundWithRucio = False if not useRucioForLocations: self.logger.info("Will not use Rucio for this dataset") if useRucioForLocations: scope = "cms" # If the dataset is a USER one, use the Rucio user scope to find it # TODO: we need a way to enable users to indicate other user scopes as source if isUserDataset: scope = "user.%s" % self.username self.logger.info("Looking up data location with Rucio in %s scope.", scope) locationsMap = {} try: for blockName in list(blocks): replicas = set() response = self.rucioClient.list_dataset_replicas(scope, blockName) for item in response: # same as complete='y' used for PhEDEx if item['state'].upper() == 'AVAILABLE': replicas.add(item['rse']) if replicas: # only fill map for blocks which have at least one location locationsMap[blockName] = replicas except Exception as exc: msg = "Rucio lookup failed with\n%s" % str(exc) self.logger.warning(msg) locationsMap = None if locationsMap: locationsFoundWithRucio = True else: msg = "No locations found with Rucio for this dataset" self.logger.warning(msg) if not locationsFoundWithRucio: self.logger.info("No locations found with Rucio for %s", inputDataset) if isUserDataset: self.logger.info("USER dataset. Looking up data locations using origin site in DBS") try: locationsMap = self.dbs.listFileBlockLocation(list(blocks)) except Exception as ex: raise TaskWorkerException( "CRAB server could not get file locations from DBS for a USER dataset.\n"+\ "This could be a temporary DBS glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex) ) else: # datasets other than USER *must* be in Rucio raise TaskWorkerException( "CRAB server could not get file locations from Rucio.\n" + \ "This could be a temporary Rucio glitch, please try to submit a new task (resubmit will not work)" + \ " and contact the experts if the error persists." ) if secondaryDataset: if secondaryDataset.endswith('USER'): self.logger.info("Secondary dataset is USER. Looking up data locations using origin site in DBS")
try: secondaryLocationsMap = self.dbs.listFileBlockLocation(list(secondaryBlocks)) except Exception as ex: raise TaskWorkerException( "CRAB server could not get file locations from DBS for secondary dataset of USER tier.\n"+\ "This could be a temporary DBS glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex) ) else: self.logger.info("Trying data location of secondary dataset blocks with Rucio") secondaryLocationsMap = {} try: for blockName in list(secondaryBlocks): replicas = set() response = self.rucioClient.list_dataset_replicas(scope, blockName) for item in response: # same as complete='y' used for PhEDEx if item['state'].upper() == 'AVAILABLE': replicas.add(item['rse']) if replicas: # only fill map for blocks which have at least one location secondaryLocationsMap[blockName] = replicas except Exception as exc: msg = "Rucio lookup failed with\n%s" % str(exc) self.logger.warning(msg) secondaryLocationsMap = None if not secondaryLocationsMap: msg = "No locations found for secondaryDataset %s." % secondaryDataset raise TaskWorkerException(msg) # From now on the code does not depend on having used Rucio or PhEDEx blocksWithLocation = locationsMap.keys() if secondaryDataset: secondaryBlocksWithLocation = secondaryLocationsMap.keys() # filter out TAPE locations self.keepOnlyDiskRSEs(locationsMap) if not locationsMap: msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset if self.tapeLocations: msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.tapeLocations)) # following function will always raise error and stop flow here, but will first # try to trigger a tape recall and place the task in tapeRecall status msg += "\nWill try to request a disk copy for you. See: https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#crab_submit_fails_with_Task_coul"
self.requestTapeRecall(blockList=blocksWithLocation, system='Rucio', msgHead=msg) # will not need lumi info if user has asked for split by file with no run/lumi mask splitAlgo = kwargs['task']['tm_split_algo'] lumiMask = kwargs['task']['tm_split_args']['lumis'] runRange = kwargs['task']['tm_split_args']['runs'] needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != [] # secondary dataset access relies on run/lumi info if secondaryDataset: needLumiInfo = True if needLumiInfo: self.checkBlocksSize(blocksWithLocation) # Interested only in blocks with locations, 'blocks' may contain invalid ones and trigger an Exception if secondaryDataset: self.checkBlocksSize(secondaryBlocksWithLocation) try: filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True, getLumis=needLumiInfo, validFileOnly=0) if secondaryDataset: moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False, getLumis=needLumiInfo, validFileOnly=0) for secfilename, secinfos in moredetails.items(): secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis']) self.logger.info("Beginning to match files from secondary dataset") for dummyFilename, infos in filedetails.items(): infos['Parents'] = [] lumis = LumiList(runsAndLumis=infos['Lumis']) for secfilename, secinfos in moredetails.items(): if lumis & secinfos['lumiobj']: infos['Parents'].append(secfilename) self.logger.info("Done matching files from secondary dataset") kwargs['task']['tm_use_parent'] = 1 except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\ "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) if not filedetails: raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +\ "Aborting submission. Resubmitting your task will not help.") %\ ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %\ (self.dbsInstance, inputDataset)) ## Format the output creating the data structures required by WMCore. Filters out invalid files, ## files whose block has no location, and figures out the PSN result = self.formatOutput(task=kwargs['task'], requestname=self.taskName, datasetfiles=filedetails, locations=locationsMap, tempDir=kwargs['tempDir']) if not result.result: raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" + "Aborting submission. Resubmitting your task will not help.") % ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") % (self.dbsInstance, inputDataset)) self.logger.debug("Got %s files", len(result.result.getFiles())) return result
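# --- Illustrative sketch (not part of the original source): requestTapeRecall()
# above sizes the recall by summing one replica's byte count per block and picks
# the Rucio rule grouping from a 30 TB threshold. A standalone rerun of that
# arithmetic with invented block sizes; no Rucio client is contacted here.
blockSizes = [25e12, 8e12]  # bytes per block, as read from replica['bytes']
sizeToRecall = sum(blockSizes)
TBtoRecall = sizeToRecall // 1e12
# above 30 TB, scatter blocks over sites (grouping='DATASET', one Rucio dataset,
# i.e. one CMS block, per site); otherwise keep everything together ('ALL')
grouping = 'DATASET' if TBtoRecall > 30. else 'ALL'
print("%s TB -> grouping %s" % (TBtoRecall, grouping))  # 33.0 TB -> grouping DATASET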
def report(self, workflow, userdn, usedbs): """ Computes the report for workflow. If usedbs is set, also query DBS and return information about the input and output datasets """ def _compactLumis(datasetInfo): """ Helper function to convert from runLumis divided per file (result of listDatasetFileDetails) to an aggregated result. """ lumilist = {} for dummyfile, info in datasetInfo.iteritems(): for run, lumis in info['Lumis'].iteritems(): lumilist.setdefault(str(run), []).extend(lumis) return lumilist res = {} self.logger.info("About to compute report of workflow: %s with usedbs=%s. Getting status first." % (workflow, usedbs)) statusRes = self.status(workflow, userdn)[0] #get the information we need from the taskdb / initialize variables row = next(self.api.query(None, None, self.Task.ID_sql, taskname = workflow)) row = self.Task.ID_tuple(*row) inputDataset = row.input_dataset outputDatasets = literal_eval(row.output_dataset.read() if row.output_dataset else 'None') dbsUrl = row.dbs_url #load the lumimask splitArgs = literal_eval(row.split_args.read()) res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis']) self.logger.info("Lumi mask was: %s" % res['lumiMask']) #extract the finished jobs from filemetadata jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']] rows = self.api.query(None, None, self.FileMetaData.GetFromTaskAndType_sql, filetype='EDM,TFILE,POOLIN', taskname=workflow) res['runsAndLumis'] = {} for row in rows: if row[GetFromTaskAndType.PANDAID] in jobids: if str(row[GetFromTaskAndType.PANDAID]) not in res['runsAndLumis']: res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])] = [] res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])].append( { 'parents': row[GetFromTaskAndType.PARENTS].read(), 'runlumi': row[GetFromTaskAndType.RUNLUMI].read(), 'events': row[GetFromTaskAndType.INEVENTS], 'type': row[GetFromTaskAndType.TYPE], 'lfn': row[GetFromTaskAndType.LFN], }) self.logger.info("Got %s edm files for workflow %s" % (len(res['runsAndLumis']), workflow)) if usedbs: if not outputDatasets: raise ExecutionError("Cannot find any information about the output datasets names. You can try to execute 'crab report' with --dbs=no") try: #load the input dataset's lumilist dbs = DBSReader(dbsUrl) inputDetails = dbs.listDatasetFileDetails(inputDataset) res['dbsInLumilist'] = _compactLumis(inputDetails) self.logger.info("Aggregated input lumilist: %s" % res['dbsInLumilist']) #load the output datasets' lumilist res['dbsNumEvents'] = 0 res['dbsNumFiles'] = 0 res['dbsOutLumilist'] = {} dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader") #We can only publish here with DBS3 outLumis = [] for outputDataset in outputDatasets: outputDetails = dbs.listDatasetFileDetails(outputDataset) outLumis.append(_compactLumis(outputDetails)) res['dbsNumEvents'] += sum(x['NumberOfEvents'] for x in outputDetails.values()) res['dbsNumFiles'] += sum(len(x['Parents']) for x in outputDetails.values()) outLumis = LumiList(runsAndLumis = outLumis).compactList for run, lumis in outLumis.iteritems(): res['dbsOutLumilist'][run] = reduce(lambda x1, x2: x1+x2, map(lambda x: range(x[0], x[1]+1), lumis)) self.logger.info("Aggregated output lumilist: %s" % res['dbsOutLumilist']) except Exception as ex: msg = "Failed to contact DBS: %s" % str(ex) self.logger.exception(msg) raise ExecutionError("Exception while contacting DBS. Cannot get the input/output lumi lists. You can try to execute 'crab report' with --dbs=no") yield res
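# --- Illustrative sketch (not part of the original source): a toy run of the
# _compactLumis() helper above, turning per-file run/lumi dicts (as returned by
# listDatasetFileDetails) into one aggregated map. File names and lumis invented.
datasetInfo = {'file_1.root': {'Lumis': {173658: [1, 2, 3]}},
               'file_2.root': {'Lumis': {173658: [3, 4], 173659: [10]}}}
lumilist = {}
for info in datasetInfo.values():
    for run, lumis in info['Lumis'].items():
        lumilist.setdefault(str(run), []).extend(lumis)
print(lumilist)  # {'173658': [1, 2, 3, 3, 4], '173659': [10]}
# duplicates are tolerated here; LumiList(runsAndLumis=...) compacts them later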
class DBSReaderTest(EmulatedUnitTestCase): def setUp(self): """ _setUp_ Initialize the PhEDEx API to point at the test server. """ #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' self.dbs = None super(DBSReaderTest, self).setUp() return def tearDown(self): """ _tearDown_ :return: """ super(DBSReaderTest, self).tearDown() return @attr("integration") def testListDatatiers(self): """ listDatatiers returns all datatiers available """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listDatatiers() self.assertTrue('RAW' in results) self.assertTrue('GEN-SIM-RECO' in results) self.assertTrue('GEN-SIM' in results) self.assertFalse('RAW-ALAN' in results) return @attr("integration") def testListPrimaryDatasets(self): """ listPrimaryDatasets returns known primary datasets """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listPrimaryDatasets('Jet*') self.assertTrue('Jet' in results) self.assertTrue('JetMET' in results) self.assertTrue('JetMETTau' in results) self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist')) return @attr("integration") def testMatchProcessedDatasets(self): """ matchProcessedDatasets returns known processed datasets """ self.dbs = DBSReader(self.endpoint) dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1') self.assertEqual(1, len(dataset)) self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList']) self.assertEqual('Run2011A-v1', dataset[0]['Name']) self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666')) def testlistRuns(self): """listRuns returns known runs""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRuns(dataset=DATASET) self.assertEqual(46, len(runs)) self.assertTrue(174074 in runs) runs = self.dbs.listRuns(block=BLOCK) self.assertEqual(1, len(runs)) self.assertEqual([173657], runs) def testlistRunLumis(self): """listRunLumis returns known runs and lumicounts (None for DBS3)""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRunLumis(dataset=DATASET) self.assertEqual(46, len(runs)) self.assertTrue(173692 in runs) self.assertEqual(runs[173692], None) runs = self.dbs.listRunLumis(block=BLOCK) self.assertEqual(1, len(runs)) self.assertTrue(173657 in runs) self.assertEqual(runs[173657], None) @attr("integration") def testListProcessedDatasets(self): """listProcessedDatasets returns known processed datasets""" self.dbs = DBSReader(self.endpoint) datasets = self.dbs.listProcessedDatasets('Jet', 'RAW') self.assertTrue('Run2011A-v1' in datasets) self.assertTrue('Run2011B-v1' in datasets) self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah')) self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW')) def testlistDatasetFiles(self): """listDatasetFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listDatasetFiles(DATASET) self.assertEqual(49, len(files)) self.assertTrue(FILE in files) def testlistDatasetFileDetails(self): """testlistDatasetFilesDetails returns lumis, events, and parents of a dataset""" TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root' self.dbs = DBSReader(self.endpoint) details = self.dbs.listDatasetFileDetails(DATASET) self.assertEqual(len(details), 49) self.assertTrue(TESTFILE in details) self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545) self.assertEqual(details[TESTFILE]['file_size'], 286021145) self.assertEqual(details[TESTFILE]['BlockName'], 
'/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace') self.assertEqual(details[TESTFILE]['Md5'], 'NOTSET') self.assertEqual(details[TESTFILE]['md5'], 'NOTSET') self.assertEqual(details[TESTFILE]['Adler32'], 'a41a1446') self.assertEqual(details[TESTFILE]['adler32'], 'a41a1446') self.assertEqual(details[TESTFILE]['Checksum'], '22218315') self.assertEqual(details[TESTFILE]['check_sum'], '22218315') self.assertTrue(173658 in details[TESTFILE]['Lumis']) self.assertEqual(sorted(details[TESTFILE]['Lumis'][173658]), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]) def testGetDBSSummaryInfo(self): """getDBSSummaryInfo returns summary of dataset and block""" self.dbs = DBSReader(self.endpoint) dataset = self.dbs.getDBSSummaryInfo(DATASET) self.assertEqual(dataset['path'], DATASET) self.assertEqual(dataset['block'], '') self.assertEqual(dataset['NumberOfEvents'], 22075) self.assertEqual(dataset['NumberOfBlocks'], 46) self.assertEqual(dataset['FileSize'], 4001680824) self.assertEqual(dataset['file_size'], 4001680824) self.assertEqual(dataset['NumberOfFiles'], 49) self.assertEqual(dataset['NumberOfLumis'], 7223) block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK) self.assertEqual(block['path'], '') self.assertEqual(block['block'], BLOCK) self.assertEqual(block['NumberOfEvents'], 377) self.assertEqual(block['NumberOfBlocks'], 1) self.assertEqual(block['FileSize'], 150780132) self.assertEqual(block['file_size'], 150780132) self.assertEqual(block['NumberOfFiles'], 2) self.assertEqual(block['NumberOfLumis'], 94) self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas') @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block['Name'] in [x['Name'] for x in blocks]) self.assertEqual(BLOCK, block['Name']) self.assertEqual(0, block['OpenForWriting']) self.assertEqual(150780132, block['BlockSize']) self.assertEqual(2, block['NumberOfFiles']) # possibly fragile but assume block located at least at cern sites = [x['Name'] for x in block['PhEDExNodeList'] if x['Name'].find('CH_CERN') > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET, blockName=BLOCK + 'asas') def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.listFileBlocks(DATASET) self.assertTrue(BLOCK in blocks) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to 
find a dataset with open blocks, so don't bother self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) def testBlockExists(self): """blockExists returns existence of blocks""" self.dbs = DBSReader(self.endpoint) self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.blockExists, DATASET + '#somethingelse') def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah') def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listFilesInBlockWithParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(4, len(files)) self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0', files[0]['block_name']) self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0', files[0]['BlockName']) self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root', files[0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas') def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['logical_file_name'] for x in self.dbs.lfnsInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas') @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" WRONG_BLOCK = BLOCK[:-4]+'abcd' BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace' DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e' DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab' self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/') # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('CH_CERN') > -1] self.assertTrue(sites) #This block is only found on DBS self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK)) # doesn't raise on non-existant block self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK)) #test bulk call: ## two blocks in phedex self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2]))) ## one block in phedex one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK]))) ## one in phedex one in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK]))) ## two in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2]))) ## one in DBS and one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK]))) def testGetFileBlock(self): """getFileBlock returns block""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlock(BLOCK) self.assertEqual(len(block), 1) block = block[BLOCK] self.assertEqual(2, len(block['Files'])) self.assertRaises(DBSReaderError, 
self.dbs.getFileBlock, BLOCK + 'asas') def testGetFileBlockWithParents(self): """getFileBlockWithParents returns block and parents""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlockWithParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(len(block), 1) block = block['/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0'] self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root', block['Files'][0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas') def testGetFiles(self): """getFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.getFiles(DATASET) self.assertEqual(len(files), 46) def testListBlockParents(self): """listBlockParents returns block parents""" self.dbs = DBSReader(self.endpoint) parents = self.dbs.listBlockParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(1, len(parents)) self.assertEqual('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0', parents[0]['Name']) sites = [x for x in parents[0]['PhEDExNodeList'] if x.find("CH_CERN") > -1] self.assertTrue(sites) self.assertFalse(self.dbs.listBlockParents('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0')) def testBlockIsOpen(self): """blockIsOpen checks if a block is open""" self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.blockIsOpen(BLOCK)) def testBlockToDatasetPath(self): """blockToDatasetPath extracts path from block name""" self.dbs = DBSReader(self.endpoint) self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET) self.assertRaises(DBSReaderError, self.dbs.blockToDatasetPath, BLOCK + 'asas')
class DBSDataDiscovery(DataDiscovery): """Performing the data discovery through CMS DBS service. """ def checkDatasetStatus(self, dataset, kwargs): res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*') if len(res) > 1: raise TaskWorkerException( "Found more than one dataset while checking in DBS the status of %s" % dataset) if len(res) == 0: raise TaskWorkerException( "Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance)) res = res[0] self.logger.info("Input dataset details: %s" % pprint.pformat(res)) accessType = res['dataset_access_type'] if accessType != 'VALID': #as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739 msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated." if kwargs['task']['tm_nonvalid_input_dataset'] != 'T': msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % ( dataset, accessType) if accessType == 'DEPRECATED': msg += " (%s)" % (msgForDeprecDS) msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration." msg += " Notice that this will not force CRAB to run over all files in the dataset;" msg += " CRAB will still check if there are any valid files in the dataset and run only over those files." raise TaskWorkerException(msg) msg = "The input dataset %s is not 'VALID' but '%s'." % ( dataset, accessType) msg += " CRAB will check if there are any valid files in the dataset and run only over those files." if accessType == 'DEPRECATED': msg += " %s" % (msgForDeprecDS) self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname']) return def keepOnlyDisks(self, locationsMap): self.otherLocations = set() phedex = PhEDEx() #TODO use certs from the config! #get all the PNN that are of kind disk try: diskLocations = set([ pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk' ]) except HTTPException as ex: self.logger.error(ex.headers) raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n"+\ "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO add the PhEDEx nodes so the user can check themselves for block, locations in locationsMap.iteritems(): locationsMap[block] = set(locations) & diskLocations self.otherLocations = self.otherLocations.union( set(locations) - diskLocations) #remove any key with value that has set([]) for key, value in locationsMap.items(): #won't work in python3! if value == set([]): locationsMap.pop(key) def checkBlocksSize(self, blocks): """ Make sure no single block has more than 100k lumis. See https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html """ MAX_LUMIS = 100000 for block in blocks: blockInfo = self.dbs.getDBSSummaryInfo(block=block) if blockInfo['NumberOfLumis'] > MAX_LUMIS: msg = "Block %s contains more than %s lumis and cannot be processed for splitting. " % ( block, MAX_LUMIS) msg += "For memory/time constraints, big blocks are not allowed. Use another dataset as input."
raise TaskWorkerException(msg) def execute(self, *args, **kwargs): self.logger.info( "Data discovery with DBS") ## to be changed into debug old_cert_val = os.getenv("X509_USER_CERT") old_key_val = os.getenv("X509_USER_KEY") try: os.environ['X509_USER_CERT'] = self.config.TaskWorker.cmscert os.environ['X509_USER_KEY'] = self.config.TaskWorker.cmskey # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules dbsurl = self.config.Services.DBSUrl if kwargs['task']['tm_dbs_url']: dbsurl = kwargs['task']['tm_dbs_url'] self.dbs = DBSReader(dbsurl) self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"] finally: if old_cert_val is not None: os.environ['X509_USER_CERT'] = old_cert_val else: del os.environ['X509_USER_CERT'] if old_key_val is not None: os.environ['X509_USER_KEY'] = old_key_val else: del os.environ['X509_USER_KEY'] self.logger.debug("Data discovery through %s for %s" % (self.dbs, kwargs['task']['tm_taskname'])) self.checkDatasetStatus(kwargs['task']['tm_input_dataset'], kwargs) try: # Get the list of blocks for the locations and then call dls. # The WMCore DBS3 implementation makes one call to dls for each block # with locations = True so we are using locations=False and looking up location later blocks = [ x['Name'] for x in self.dbs.getFileBlocksInfo( kwargs['task']['tm_input_dataset'], locations=False) ] except DBSReaderError as dbsexc: #dataset not found in DBS is a known use case if 'No matching data' in str(dbsexc): raise TaskWorkerException( "The CRAB3 server backend could not find dataset %s in this DBS instance: %s" % (kwargs['task']['tm_input_dataset'], dbsurl)) raise self.checkBlocksSize(blocks) ## Create a map for block's locations: for each block get the list of locations. ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no ## locations are found it gets the original locations from DBS. So it should ## never be the case at this point that some blocks have no locations. ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example: ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'], ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']} try: dbsOnly = self.dbsInstance.split('/')[1] != 'global' locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=dbsOnly) except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs or phedex.\n"+\ "This could be a temporary phedex/dbs glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) self.keepOnlyDisks(locationsMap) if not locationsMap: msg = "Task could not be submitted because there is no DISK replica for dataset %s ." % ( kwargs['task']['tm_input_dataset']) msg += " Please, check DAS, https://cmsweb.cern.ch/das, and make sure the dataset is accessible on DISK" msg += " You might want to contact your physics group if you need a disk replica." if self.otherLocations: msg += "\nN.B.: your dataset is stored at %s, but those are TAPE locations." % ','.join(sorted(self.otherLocations))
raise TaskWorkerException(msg) if len(blocks) != len(locationsMap): self.logger.warning( "The locations of some blocks have not been found: %s" % (set(blocks) - set(locationsMap))) try: filedetails = self.dbs.listDatasetFileDetails( kwargs['task']['tm_input_dataset'], getParents=True, validFileOnly=0) secondary = kwargs['task'].get('tm_secondary_input_dataset', None) if secondary: moredetails = self.dbs.listDatasetFileDetails(secondary, getParents=False, validFileOnly=0) for secfilename, secinfos in moredetails.items(): secinfos['lumiobj'] = LumiList( runsAndLumis=secinfos['Lumis']) self.logger.info( "Beginning to match files from secondary dataset") for dummyFilename, infos in filedetails.items(): infos['Parents'] = [] lumis = LumiList(runsAndLumis=infos['Lumis']) for secfilename, secinfos in moredetails.items(): if (lumis & secinfos['lumiobj']): infos['Parents'].append(secfilename) self.logger.info("Done matching files from secondary dataset") kwargs['task']['tm_use_parent'] = 1 except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\ "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO add the PhEDEx nodes so the user can check themselves if not filedetails: raise TaskWorkerException(( "Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, kwargs['task']['tm_input_dataset'])) ## Format the output creating the data structures required by WMCore. Filters out invalid files, ## files whose block has no location, and figures out the PSN result = self.formatOutput(task=kwargs['task'], requestname=kwargs['task']['tm_taskname'], datasetfiles=filedetails, locations=locationsMap, tempDir=kwargs['tempDir']) if not result.result: raise TaskWorkerException(( "Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, kwargs['task']['tm_input_dataset'])) self.logger.debug("Got %s files" % len(result.result.getFiles())) return result
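# --- Illustrative sketch (not part of the original source): keepOnlyDisks() above
# intersects each block's locations with the set of disk PNNs and drops blocks
# left without a disk replica. A standalone rerun with invented site names.
diskLocations = {'T2_US_Wisconsin', 'T1_IT_CNAF_Disk'}
locationsMap = {'blockA': ['T1_IT_CNAF_Disk', 'T1_IT_CNAF_MSS'],
                'blockB': ['T1_IT_CNAF_MSS']}
otherLocations = set()
for block, locations in list(locationsMap.items()):
    onDisk = set(locations) & diskLocations
    otherLocations |= set(locations) - diskLocations
    if onDisk:
        locationsMap[block] = onDisk
    else:
        del locationsMap[block]  # no disk replica: reported as TAPE-only in the error message
print(locationsMap)    # {'blockA': {'T1_IT_CNAF_Disk'}}
print(otherLocations)  # {'T1_IT_CNAF_MSS'}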
class DBSReaderTest(unittest.TestCase): def setUp(self): """ _setUp_ Initialize the PhEDEx API to point at the test server. """ #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' self.dbs = None return @attr("integration") def testListPrimaryDatasets(self): """ listPrimaryDatasets returns known primary datasets """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listPrimaryDatasets('Jet*') self.assertTrue('Jet' in results) self.assertTrue('JetMET' in results) self.assertTrue('JetMETTau' in results) self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist')) return @attr("integration") def testMatchProcessedDatasets(self): """ matchProcessedDatasets returns known processed datasets """ self.dbs = DBSReader(self.endpoint) dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1') self.assertEqual(1, len(dataset)) self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList']) self.assertEqual('Run2011A-v1', dataset[0]['Name']) self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666')) @attr("integration") def testlistRuns(self): """listRuns returns known runs""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRuns(dataset = DATASET) self.assertEqual(46, len(runs)) self.assertTrue(174074 in runs) runs = self.dbs.listRuns(dataset = DATASET, block = BLOCK) self.assertEqual([173657], runs) @attr("integration") def testlistRunLumis(self): """listRunLumis returns known runs and lumicounts""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRunLumis(dataset = DATASET) self.assertEqual(46, len(runs)) self.assertTrue(173692 in runs) self.assertEqual(runs[173692], 2782) runs = self.dbs.listRuns(dataset = DATASET, block = BLOCK) self.assertEqual({173657 : 94}, runs) @attr("integration") def testListProcessedDatasets(self): """listProcessedDatasets returns known processed datasets""" self.dbs = DBSReader(self.endpoint) datasets = self.dbs.listProcessedDatasets('Jet', 'RAW') self.assertTrue('Run2011A-v1' in datasets) self.assertTrue('Run2011B-v1' in datasets) self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah')) self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW')) @attr("integration") def testlistDatasetFiles(self): """listDatasetFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listDatasetFiles(DATASET) self.assertEqual(49, len(files)) self.assertTrue(FILE in files) @attr("integration") def testlistDatasetFileDetails(self): """testlistDatasetFilesDetails returns lumis, events, and parents of a dataset""" TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root' for endpoint in [self.endpoint, 'test/python/WMCore_t/Services_t/DBS_t/DBSReader_t.py:']: self.dbs = DBSReader(endpoint) details = self.dbs.listDatasetFileDetails(DATASET) self.assertEqual(len(details), 49) self.assertTrue(TESTFILE in details) self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545) self.assertEqual(details[TESTFILE]['Size'], 286021145) self.assertEqual(details[TESTFILE]['BlockName'], '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace') self.assertEqual(details[TESTFILE]['Checksums'], {'Checksum': '22218315', 'Adler32': 'a41a1446', 'Md5': 'NOTSET'} ) self.assertTrue( 173658 in details[TESTFILE]['Lumis']) self.assertEqual( sorted(details[TESTFILE]['Lumis'][173658]), sorted( map( long, [8, 12, 9, 14, 10, 6, 2, 1, 4, 3, 36, 49, 16, 11, 27, 35, 46, 39, 
20, 24, 52, 23, 40, 42, 45, 21, 32, 37, \ 25, 22, 5, 33, 17, 15, 26, 50, 18, 29, 51, 44, 69, 43, 30, 73, 19, 41, 13, 38, 7, 31, 75, 48, 59, 65, 55, \ 57, 34, 28, 74, 47, 64, 61, 68, 77, 66, 71, 60, 76, 70, 67, 62, 78, 82, 79, 88, 56, 101, 92, 58, 72, 54, \ 63, 96, 53, 84, 95, 89, 85, 99, 81, 91, 102, 80, 100, 107, 94, 93, 90, 86, 87, 83, 97, 104, 110, 111, 106,\ 108, 98, 103, 109, 105])) ) @attr("integration") def testGetDBSSummaryInfo(self): """getDBSSummaryInfo returns summary of dataset and block""" self.dbs = DBSReader(self.endpoint) dataset = self.dbs.getDBSSummaryInfo(DATASET) self.assertEqual(dataset['path'], DATASET) self.assertEqual(dataset['block'], '') self.assertEqual(dataset['NumberOfEvents'], '22075') self.assertEqual(dataset['NumberOfBlocks'], '46') self.assertEqual(dataset['total_size'], '4001680824') self.assertEqual(dataset['NumberOfFiles'], '49') self.assertEqual(dataset['NumberOfLumis'], '7223') block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK) self.assertEqual(block['path'], '') self.assertEqual(block['block'], BLOCK) self.assertEqual(block['NumberOfEvents'], '377') self.assertEqual(block['NumberOfBlocks'], '1') self.assertEqual(block['total_size'], '150780132') self.assertEqual(block['NumberOfFiles'], '2') self.assertEqual(block['NumberOfLumis'], '94') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas') @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block['Name'] in [x['Name'] for x in blocks]) self.assertEqual(BLOCK, block['Name']) #self.assertEqual(377, block['NumberOfEvents']) self.assertEqual(150780132, block['BlockSize']) self.assertEqual(2, block['NumberOfFiles']) # possibly fragile but assume block located at least at cern sites = [x['Name'] for x in block['StorageElementList'] if x['Name'].find('cern.ch') > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah') self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK + 'asas')) @attr("integration") def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.listFileBlocks(DATASET) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName = BLOCK, onlyClosedBlocks = True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) @attr("integration") def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to find a dataset with open blocks, so don't bother self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) @attr("integration") def testBlockExists(self): """blockExists returns existence of blocks""" self.dbs = DBSReader(self.endpoint) self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse')) @attr("integration") def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)]) 
self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah') @attr("integration") def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" # hope PromptReco doesn't get deleted self.dbs = DBSReader(self.endpoint) files = self.dbs.listFilesInBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(files)) self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60', files[0]['Block']['Name']) self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', files[0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas') @attr("integration") def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas') @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" WRONG_BLOCK = BLOCK[:-4]+'abcd' BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace' DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e' DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab' self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/') # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('cern.ch') > -1] self.assertTrue(sites) #This block is only found on DBS self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK)) # doesn't raise on non-existant block self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK)) #test bulk call: ## two blocks in phedex self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2]))) ## one block in phedex one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK]))) ## one in phedex one in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK]))) ## two in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2]))) ## one in DBS and one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK]))) @attr("integration") def testGetFileBlock(self): """getFileBlock returns block""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlock(BLOCK) self.assertEqual(len(block), 1) block = block[BLOCK] self.assertEqual(2, len(block['Files'])) self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas') @attr("integration") def testGetFileBlockWithParents(self): """getFileBlockWithParents returns block and parents""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(len(block), 1) block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60'] self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', block['Files'][0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas') 
@attr("integration") def testGetFiles(self): """getFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.getFiles(DATASET) self.assertEqual(len(files), 46) @attr("integration") def testListBlockParents(self): """listBlockParents returns block parents""" self.dbs = DBSReader(self.endpoint) parents = self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(parents)) self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60', parents[0]['Name']) sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1] self.assertTrue(sites) self.assertFalse(self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl')) @attr("integration") def testBlockIsOpen(self): """blockIsOpen checks if a block is open""" self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.blockIsOpen(BLOCK)) @attr("integration") def testBlockToDatasetPath(self): """blockToDatasetPath extracts path from block name""" self.dbs = DBSReader(self.endpoint) self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET) self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
def getDBSSummary(requestInfo): print "Loading DBS full information for %s... %s" % ( requestInfo['InputDataset'], time.strftime('%H:%M:%S')) reader = DBSReader( 'http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet') files = reader.listDatasetFileDetails( datasetPath=requestInfo['InputDataset']) inputRunAndLumis = {} for lfn in files.keys(): fileInfo = files[lfn] runAndLumis = fileInfo['Lumis'] block = fileInfo['BlockName'] if requestInfo['BlockWhitelist'] and block not in requestInfo[ 'BlockWhitelist']: del files[lfn] continue if requestInfo['BlockBlacklist'] and block in requestInfo[ 'BlockBlacklist']: del files[lfn] continue for run in runAndLumis.keys(): if requestInfo['RunWhitelist'] and run not in requestInfo[ 'RunWhitelist']: del runAndLumis[run] continue if requestInfo['RunBlacklist'] and run not in requestInfo[ 'RunBlacklist']: del runAndLumis[run] continue if run not in inputRunAndLumis: inputRunAndLumis[run] = set() inputRunAndLumis[run].update(set(runAndLumis[run])) if not runAndLumis: del files[lfn] continue outputRunAndLumis = {} for outputDataset in requestInfo['OutputDatasets']: print "Loading DBS full information for %s... %s" % ( outputDataset, time.strftime('%H:%M:%S')) outputRunAndLumis[outputDataset] = {} outputFiles = reader.listDatasetFileDetails(datasetPath=outputDataset) for lfn in outputFiles: fileInfo = outputFiles[lfn] runAndLumis = fileInfo['Lumis'] for run in runAndLumis: if run not in outputRunAndLumis[outputDataset]: outputRunAndLumis[outputDataset][run] = set() outputRunAndLumis[outputDataset][run].update( set(runAndLumis[run])) differences = {} for outputDataset in outputRunAndLumis: print "Analyzing differences in %s... %s" % (outputDataset, time.strftime('%H:%M:%S')) differences[outputDataset] = {} for run in inputRunAndLumis: diff = inputRunAndLumis[run] - outputRunAndLumis[ outputDataset].get(run, set()) if diff: differences[outputDataset][run] = diff jsonizedMissingLumis = {} for run in differences[outputDataset]: interval = [] jsonizedMissingLumis[run] = [] for lumi in sorted(differences[outputDataset][run]): if not interval: interval = [lumi, lumi] elif lumi == interval[1] + 1: interval[1] = lumi else: jsonizedMissingLumis[run].append(interval) interval = [lumi, lumi] if interval: jsonizedMissingLumis[run].append(interval) try: if not jsonizedMissingLumis: continue outFileName = 'MissingLumis_%s.json' % outputDataset.replace( '/', '_') outFileHandle = open(outFileName, 'w') json.dump(jsonizedMissingLumis, outFileHandle) outFileHandle.close() except: print "Error writing to %s" % outFileName
def getDBSSummary(requestInfo): print "Loading DBS full information for %s... %s" % (requestInfo['InputDataset'], time.strftime('%H:%M:%S')) reader = DBSReader('http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet') files = reader.listDatasetFileDetails(datasetPath = requestInfo['InputDataset']) inputRunAndLumis = {} for lfn in files.keys(): fileInfo = files[lfn] runAndLumis = fileInfo['Lumis'] block = fileInfo['BlockName'] if requestInfo['BlockWhitelist'] and block not in requestInfo['BlockWhitelist']: del files[lfn] continue if requestInfo['BlockBlacklist'] and block in requestInfo['BlockBlacklist']: del files[lfn] continue for run in runAndLumis.keys(): if requestInfo['RunWhitelist'] and run not in requestInfo['RunWhitelist']: del runAndLumis[run] continue if requestInfo['RunBlacklist'] and run not in requestInfo['RunBlacklist']: del runAndLumis[run] continue if run not in inputRunAndLumis: inputRunAndLumis[run] = set() inputRunAndLumis[run].update(set(runAndLumis[run])) if not runAndLumis: del files[lfn] continue outputRunAndLumis = {} for outputDataset in requestInfo['OutputDatasets']: print "Loading DBS full information for %s... %s" % (outputDataset, time.strftime('%H:%M:%S')) outputRunAndLumis[outputDataset] = {} outputFiles = reader.listDatasetFileDetails(datasetPath = outputDataset) for lfn in outputFiles: fileInfo = outputFiles[lfn] runAndLumis = fileInfo['Lumis'] for run in runAndLumis: if run not in outputRunAndLumis[outputDataset]: outputRunAndLumis[outputDataset][run] = set() outputRunAndLumis[outputDataset][run].update(set(runAndLumis[run])) differences = {} for outputDataset in outputRunAndLumis: print "Analyzing differences in %s... %s" % (outputDataset, time.strftime('%H:%M:%S')) differences[outputDataset] = {} for run in inputRunAndLumis: diff = inputRunAndLumis[run] - outputRunAndLumis[outputDataset].get(run, set()) if diff: differences[outputDataset][run] = diff jsonizedMissingLumis = {} for run in differences[outputDataset]: interval = [] jsonizedMissingLumis[run] = [] for lumi in sorted(differences[outputDataset][run]): if not interval: interval = [lumi,lumi] elif lumi == interval[1] + 1: interval[1] = lumi else: jsonizedMissingLumis[run].append(interval) interval = [lumi,lumi] if interval: jsonizedMissingLumis[run].append(interval) try: if not jsonizedMissingLumis: continue outFileName = 'MissingLumis_%s.json' % outputDataset.replace('/', '_') outFileHandle = open(outFileName , 'w') json.dump(jsonizedMissingLumis, outFileHandle) outFileHandle.close() except: print "Error writing to %s" % outFileName
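# --- Illustrative sketch (not part of the original source): the interval packing
# used above to build MissingLumis_*.json, extracted as a standalone helper. It
# merges consecutive lumi numbers into [first, last] ranges; inputs are invented.
def toIntervals(lumis):
    intervals = []
    interval = []
    for lumi in sorted(lumis):
        if not interval:
            interval = [lumi, lumi]
        elif lumi == interval[1] + 1:
            interval[1] = lumi  # extend the current run of consecutive lumis
        else:
            intervals.append(interval)
            interval = [lumi, lumi]
    if interval:
        intervals.append(interval)
    return intervals

print(toIntervals({1, 2, 3, 7, 8, 10}))  # [[1, 3], [7, 8], [10, 10]]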
def report(self, workflow, userdn, usedbs): """ Computes the report for workflow. If usedbs is used also query DBS and return information about the input and output datasets """ def _compactLumis(datasetInfo): """ Help function that allow to convert from runLumis divided per file (result of listDatasetFileDetails) to an aggregated result. """ lumilist = {} for file, info in datasetInfo.iteritems(): for run, lumis in info['Lumis'].iteritems(): lumilist.setdefault(str(run), []).extend(lumis) return lumilist res = {} self.logger.info("About to compute report of workflow: %s with usedbs=%s. Getting status first." % (workflow,usedbs)) statusRes = self.status(workflow, userdn)[0] #get the information we need from the taskdb/initilize variables row = self.api.query(None, None, self.Task.ID_sql, taskname = workflow).next() row = self.Task.ID_tuple(*row) inputDataset = row.input_dataset outputDatasets = literal_eval(row.output_dataset.read() if row.output_dataset else 'None') dbsUrl = row.dbs_url #load the lumimask splitArgs = literal_eval(row.split_args.read()) res['lumiMask'] = buildLumiMask(splitArgs['runs'], splitArgs['lumis']) self.logger.info("Lumi mask was: %s" % res['lumiMask']) #extract the finished jobs from filemetadata jobids = [x[1] for x in statusRes['jobList'] if x[0] in ['finished']] rows = self.api.query(None, None, self.FileMetaData.GetFromTaskAndType_sql, filetype='EDM,TFILE,POOLIN', taskname=workflow) res['runsAndLumis'] = {} for row in rows: if row[GetFromTaskAndType.PANDAID] in jobids: if str(row[GetFromTaskAndType.PANDAID]) not in res['runsAndLumis']: res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])] = [] res['runsAndLumis'][str(row[GetFromTaskAndType.PANDAID])].append( { 'parents' : row[GetFromTaskAndType.PARENTS].read(), 'runlumi' : row[GetFromTaskAndType.RUNLUMI].read(), 'events' : row[GetFromTaskAndType.INEVENTS], 'type' : row[GetFromTaskAndType.TYPE], }) self.logger.info("Got %s edm files for workflow %s" % (len(res['runsAndLumis']), workflow)) if usedbs: if not outputDatasets: raise ExecutionError("Cannot find any information about the output datasets names. You can try to execute 'crab report' with --dbs=no") try: #load the input dataset's lumilist dbs = DBSReader(dbsUrl) inputDetails = dbs.listDatasetFileDetails(inputDataset) res['dbsInLumilist'] = _compactLumis(inputDetails) self.logger.info("Aggregated input lumilist: %s" % res['dbsInLumilist']) #load the output datasets' lumilist res['dbsNumEvents'] = 0 res['dbsNumFiles'] = 0 res['dbsOutLumilist'] = {} dbs = DBSReader("https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader") #We can only publish here with DBS3 outLumis = [] for outputDataset in outputDatasets: outputDetails = dbs.listDatasetFileDetails(outputDataset) outLumis.append(_compactLumis(outputDetails)) res['dbsNumEvents'] += sum(x['NumberOfEvents'] for x in outputDetails.values()) res['dbsNumFiles'] += sum(len(x['Parents']) for x in outputDetails.values()) outLumis = LumiList(runsAndLumis = outLumis).compactList for run,lumis in outLumis.iteritems(): res['dbsOutLumilist'][run] = reduce(lambda x1,x2: x1+x2, map(lambda x: range(x[0], x[1]+1), lumis)) self.logger.info("Aggregated output lumilist: %s" % res['dbsOutLumilist']) except Exception, ex: msg = "Failed to contact DBS: %s" % str(ex) self.logger.exception(msg) raise ExecutionError("Exception while contacting DBS. Cannot get the input/output lumi lists. You can try to execute 'crab report' with --dbs=no")
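# --- Illustrative sketch (not part of the original source): the inverse of the
# packing above, mirroring how report() expands LumiList's compact [first, last]
# ranges back into explicit lumi numbers for res['dbsOutLumilist']. Values invented.
outLumis = {'173658': [[1, 3], [7, 8]]}
dbsOutLumilist = {}
for run, lumiRanges in outLumis.items():
    expanded = []
    for start, end in lumiRanges:
        expanded.extend(range(start, end + 1))
    dbsOutLumilist[run] = expanded
print(dbsOutLumilist)  # {'173658': [1, 2, 3, 7, 8]}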