Example #1
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        self.enabled = config.RucioInjector.enabled
        # dataset rule creation has a larger polling cycle
        self.pollRules = config.RucioInjector.pollIntervalRules
        self.lastRulesExecTime = 0
        self.createBlockRules = config.RucioInjector.createBlockRules
        self.skipRulesForTiers = config.RucioInjector.skipRulesForTiers
        self.listTiersToInject = config.RucioInjector.listTiersToInject

        # setup caches for containers and blocks (the container cache lives 3x longer, ~6 days now)
        self.containersCache = MemoryCache(
            config.RucioInjector.cacheExpiration * 3, set())
        self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration,
                                       set())

        self.scope = getattr(config.RucioInjector, "scope", "cms")
        self.rucioAcct = config.RucioInjector.rucioAccount
        self.rucio = Rucio(acct=self.rucioAcct,
                           hostUrl=config.RucioInjector.rucioUrl,
                           authUrl=config.RucioInjector.rucioAuthUrl,
                           configDict={'logger': self.logger})

        # metadata dictionary information to be added to block/container rules
        # cannot be a python dictionary, but a JSON string instead
        self.metaData = json.dumps(
            dict(agentHost=config.Agent.hostName,
                 userAgent=config.Agent.agentName))

        self.testRSEs = config.RucioInjector.RSEPostfix
        self.filesToRecover = []

        logging.info(
            "Component configured to only inject data for data tiers: %s",
            self.listTiersToInject)
        logging.info(
            "Component configured to skip container rule creation for data tiers: %s",
            self.skipRulesForTiers)
        logging.info("Component configured to create block rules: %s",
                     self.createBlockRules)
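
For context, here is a minimal sketch of the configuration section this constructor reads, written with the standard WMCore Configuration API. The attribute names are taken from the code above; every value is an illustrative assumption, not a production setting.

from WMCore.Configuration import Configuration

config = Configuration()
config.section_("Agent")
config.Agent.hostName = "vocms0xxx.cern.ch"  # hypothetical agent host
config.Agent.agentName = "WMAgent"
config.component_("RucioInjector")
config.RucioInjector.enabled = True
config.RucioInjector.pollIntervalRules = 600  # seconds between rule-creation cycles
config.RucioInjector.createBlockRules = True
config.RucioInjector.skipRulesForTiers = ["RAW"]  # assumed example tier
config.RucioInjector.listTiersToInject = ["AOD", "MINIAOD"]  # assumed example tiers
config.RucioInjector.cacheExpiration = 2 * 24 * 60 * 60  # blocks: 2 days; containers get 3x
config.RucioInjector.rucioAccount = "wma_prod"  # hypothetical account name
config.RucioInjector.rucioUrl = "http://cms-rucio.cern.ch"
config.RucioInjector.rucioAuthUrl = "https://cms-rucio-auth.cern.ch"
config.RucioInjector.RSEPostfix = False
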
Example #2
    def _getDatasetLocation(self, dset, blockDict):
        """
        Given a dataset name, query PhEDEx or Rucio and resolve the block location
        :param dset: string with the dataset name
        :param blockDict: dictionary with DBS summary info
        :return: update blockDict in place
        """
        # initialize Rucio here to avoid this authentication on T0-WMAgent
        self.rucio = Rucio(self.rucioAcct)
        blockReplicas = self.rucio.getPileupLockedAndAvailable(
            dset, account=self.rucioAcct)
        for blockName, blockLocation in viewitems(blockReplicas):
            try:
                blockDict[blockName]['PhEDExNodeNames'] = list(blockLocation)
            except KeyError:
                logging.warning("Block '%s' present in Rucio but not in DBS",
                                blockName)
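
To make the in-place update concrete, here is a small self-contained sketch of the same merge logic with fake data and no Rucio calls; all block and site names are illustrative.

blockDict = {"/Prim/Proc-v1/RAW#block1": {"FileList": [], "PhEDExNodeNames": []}}
blockReplicas = {"/Prim/Proc-v1/RAW#block1": {"T1_US_FNAL_Disk", "T2_CH_CERN"},
                 "/Prim/Proc-v1/RAW#block2": {"T2_DE_DESY"}}  # known to Rucio, missing in DBS
for blockName, blockLocation in blockReplicas.items():
    try:
        blockDict[blockName]['PhEDExNodeNames'] = list(blockLocation)
    except KeyError:
        print("Block '%s' present in Rucio but not in DBS" % blockName)
# blockDict now carries the resolved locations for block1; block2 was only warned about
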
Example #3
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        # dataset rule creation has a larger polling cycle
        self.pollRules = config.RucioInjector.pollIntervalRules
        self.lastRulesExecTime = 0
        self.createBlockRules = config.RucioInjector.createBlockRules
        self.containerDiskRuleParams = config.RucioInjector.containerDiskRuleParams
        self.containerDiskRuleRSEExpr = config.RucioInjector.containerDiskRuleRSEExpr
        if config.RucioInjector.metaDIDProject not in RUCIO_VALID_PROJECT:
            msg = "Component configured with an invalid 'project' DID: %s"
            raise RucioInjectorException(msg % config.RucioInjector.metaDIDProject)
        self.metaDIDProject = dict(project=config.RucioInjector.metaDIDProject)

        # setup caches for containers and blocks (the container cache lives 3x longer, ~6 days now)
        self.containersCache = MemoryCache(config.RucioInjector.cacheExpiration * 3, set())
        self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration, set())

        self.scope = getattr(config.RucioInjector, "scope", "cms")
        self.rucioAcct = config.RucioInjector.rucioAccount
        self.rucio = Rucio(acct=self.rucioAcct,
                           hostUrl=config.RucioInjector.rucioUrl,
                           authUrl=config.RucioInjector.rucioAuthUrl,
                           configDict={'logger': self.logger})

        # metadata dictionary information to be added to block/container rules
        # cannot be a python dictionary, but a JSON string instead
        self.metaData = json.dumps(dict(agentHost=config.Agent.hostName,
                                        userAgent=config.Agent.agentName))

        self.testRSEs = config.RucioInjector.RSEPostfix
        self.filesToRecover = []

        # output data placement has a different behaviour between T0 and Production agents
        if hasattr(config, "Tier0Feeder"):
            logging.info("RucioInjector running on a T0 WMAgent")
            self.isT0agent = True
        else:
            self.isT0agent = False

        logging.info("Component configured to create block rules: %s", self.createBlockRules)
Example #4
    def setUp(self):
        """
        Setup for unit tests
        """
        super(RucioTest, self).setUp()

        self.myRucio = Rucio(self.acct,
                             hostUrl=self.defaultArgs['host'],
                             authUrl=self.defaultArgs['auth_host'],
                             configDict=self.defaultArgs)

        self.client = testClient(rucio_host=self.defaultArgs['host'],
                                 auth_host=self.defaultArgs['auth_host'],
                                 account=self.acct,
                                 ca_cert=self.defaultArgs['ca_cert'],
                                 auth_type=self.defaultArgs['auth_type'],
                                 creds=self.defaultArgs['creds'],
                                 timeout=self.defaultArgs['timeout'])
Example #5
    def testGetReplicaInfoForBlocksRucio(self):
        """
        Test `getReplicaInfoForBlocks` method, this time without the
        PhEDEx-compatible output format
        """
        theseArgs = self.defaultArgs.copy()
        theseArgs['phedexCompatible'] = False
        myRucio = Rucio(self.acct,
                        hostUrl=theseArgs['host'],
                        authUrl=theseArgs['auth_host'],
                        configDict=theseArgs)

        res = myRucio.getReplicaInfoForBlocks(dataset=DSET)
        self.assertTrue(isinstance(res, list))
        self.assertTrue(len(res) >= 1)  # at this very moment, there are 11 replicas
        blocks = [item['name'] for item in res]
        self.assertTrue(BLOCK in blocks)
        for item in res:
            self.assertTrue(len(item['replica']) > 0)
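
Inferred from the assertions above, the PhEDEx-incompatible response is a plain list of block records. A sketch of the expected shape, with the field names taken from the test and all values illustrative:

res = [
    {"name": "/Prim/Proc-v1/TIER#block-uuid",  # block DID name
     "replica": ["T1_US_FNAL_Disk", "T2_CH_CERN"]},  # RSEs holding a replica
    # one entry per block in the container
]
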
Example #6
def getFromRucio(dataset, logger):
    """
    Use the WMCore Rucio object to fetch all the blocks and files
    for a given container.
    Returns a dictionary keyed by block name, with the number of files as value.
    """
    rucio = Rucio(acct=RUCIO_ACCT,
                  hostUrl=RUCIO_HOST,
                  authUrl=RUCIO_AUTH,
                  configDict={
                      'logger': logger,
                      'phedexCompatible': False
                  })

    result = dict()
    for block in rucio.getBlocksInContainer(dataset):
        data = rucio.getDID(block)
        result.setdefault(block, data['length'])
    return result
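
A minimal sketch of how getFromRucio might be driven, assuming the module-level constants RUCIO_ACCT, RUCIO_HOST and RUCIO_AUTH are defined elsewhere in the script; the container name is hypothetical.

import logging

logger = logging.getLogger(__name__)
blockInfo = getFromRucio("/Primary/Processed-v1/MINIAODSIM", logger)
for blockName, numFiles in blockInfo.items():
    logger.info("Block %s has %s files", blockName, numFiles)
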
Example #7
    def setUp(self):
        """
        _setUp_

        """
        super(WMBSHelperTest, self).setUp()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setupCouch("wmbshelper_t/jobs", "JobDump")
        self.testInit.setupCouch("wmbshelper_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("config_test", "GroupUser", "ConfigCache")
        os.environ["COUCHDB"] = "wmbshelper_t"
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir",
            "WMCore.ResourceControl"
        ],
                                useDefault=False)

        self.workDir = self.testInit.generateWorkDir()

        self.wmspec = self.createWMSpec()
        self.topLevelTask = getFirstTask(self.wmspec)
        self.inputDataset = self.topLevelTask.inputDataset()
        self.dataset = self.topLevelTask.getInputDatasetPath()
        self.dbs = DBSReader(self.inputDataset.dbsurl)
        self.rucioAcct = "wmcore_transferor"
        self.rucio = Rucio(self.rucioAcct)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=threading.currentThread().logger,
                                     dbinterface=threading.currentThread().dbi)

        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.config = loadConfigurationFile(self.configFile)

        self.config.component_("JobSubmitter")
        self.config.JobSubmitter.submitDir = self.workDir
        self.config.JobSubmitter.submitScript = os.path.join(
            getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh')

        return
Example #8
    def __init__(self, **args):
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.badWork = []  # list of bad work units (e.g. without any valid files)
        self.pileupData = {}
        self.cric = CRIC()
        if usingRucio():
            self.rucio = Rucio(self.args['rucioAcct'],
                               configDict={'logger': self.logger})
        else:
            self.phedex = PhEDEx()  # this will go away eventually
Example #9
    def _queryAndCompareWithDBS(self, pileupDict, pileupConfig, dbsUrl):
        """
        pileupDict is a Python dictionary containing particular pileup
        configuration information. Query DBS on given dataset contained
        now in both input pileupConfig as well as in the pileupDict
        and compare values.
        """
        self.assertItemsEqual(list(pileupDict), list(pileupConfig))
        reader = DBS3Reader(dbsUrl)
        rucioObj = Rucio(self.rucioAcct)

        # now query DBS and compare the blocks and files from DBS
        # against those returned by the PileupFetcher
        for pileupType, datasets in viewitems(pileupConfig):
            # this is from the pileup configuration produced by PileupFetcher
            blockDict = pileupDict[pileupType]

            for dataset in datasets:
                dbsBlocks = reader.listFileBlocks(dataset=dataset)
                rucioBlocksLocation = rucioObj.getPileupLockedAndAvailable(dataset,
                                                                           account=self.rucioAcct)

                # first, validate the number of blocks and their names
                self.assertItemsEqual(list(blockDict), dbsBlocks)
                self.assertItemsEqual(list(blockDict), list(rucioBlocksLocation))
                # now validate the block location between Rucio and PileupFetcher
                for block, blockLocation in viewitems(blockDict):
                    self.assertItemsEqual(blockLocation['PhEDExNodeNames'], rucioBlocksLocation[block])

                    # finally, validate the files
                    fileList = []
                    # now get list of files in the block
                    dbsFiles = reader.listFilesInBlock(block)
                    for dbsFile in dbsFiles:
                        fileList.append(dbsFile["LogicalFileName"])
                    self.assertItemsEqual(blockDict[block]["FileList"], fileList)
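
Note that assertItemsEqual, used throughout these tests, only exists in Python 2's unittest; Python 3 renamed it to assertCountEqual. A common compatibility shim, shown here as a sketch:

import unittest

if not hasattr(unittest.TestCase, "assertItemsEqual"):
    # Python 3: alias the renamed method so Python 2 era tests keep working
    unittest.TestCase.assertItemsEqual = unittest.TestCase.assertCountEqual
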
Example #10
    def __init__(self, **args):
        # We need to pop this object instance from args because otherwise
        # the super class blows up when doing a deepcopy(args)
        self.rucio = args.pop("rucioObject", None)
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.badWork = []  # list of bad work units (e.g. without any valid files)
        self.pileupData = {}
        self.cric = CRIC()
        # FIXME: for the moment, it will always use the default value
        self.rucioAcct = self.args.get("rucioAcct", "wmcore_transferor")
        if not self.rucio:
            self.rucio = Rucio(self.rucioAcct,
                               configDict={'logger': self.logger})
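
The rucioObject pop above enables dependency injection for testing. A sketch of how a unit test might pass a stub instead of a real client; SomePolicy stands in for whichever PolicyInterface subclass defines this __init__ and is hypothetical.

class FakeRucio(object):
    """Stub standing in for the WMCore Rucio client in unit tests."""
    def getBlocksInContainer(self, container):
        return ["%s#fake-block" % container]

# SomePolicy is a hypothetical policy subclass; the stub is popped from
# args before PolicyInterface sees (and deepcopies) them
policy = SomePolicy(rucioAcct="wmcore_transferor", rucioObject=FakeRucio())
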
Example #11
    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param msConfig: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
            skipPhEDEx: boolean to skip PhEDEx initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False),
                                  kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'],
                                  logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0},
                                       logger=self.logger)

        self.phedex = None
        self.rucio = None
        if self.msConfig.get('useRucio',
                             False) and not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={
                                   "logger": self.logger,
                                   "user_agent": "wmcore-microservices"
                               })
        elif not kwargs.get("skipPhEDEx", False):
            # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
            dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
Example #12
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        self.rucio = Rucio(kwargs.get("rucioAccount", "wmcore_transferor"),
                           configDict=dict(logger=self.logger))
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))

        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")

        # set the thread name before creating the log db.
        # only set it when it is not already set
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)

        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
Example #13
    def testConfig(self):
        """
        Test service attributes and the override mechanism
        """
        for key in self.defaultArgs:
            self.assertEqual(getattr(self.myRucio.cli, key), self.defaultArgs[key])
        self.assertTrue(getattr(self.myRucio.cli, "user_agent").startswith("wmcore-client/"))
        self.assertTrue(getattr(self.client, "user_agent").startswith("rucio-clients/"))

        newParams = {"host": 'http://cms-rucio-int.cern.ch',
                     "auth_host": 'https://cms-rucio-auth-int.cern.ch',
                     "auth_type": "x509", "account": self.acct,
                     "ca_cert": False, "timeout": 5, "phedexCompatible": False}
        newKeys = list(newParams.keys())
        newKeys.remove("phedexCompatible")

        rucio = Rucio(newParams['account'], hostUrl=newParams['host'],
                      authUrl=newParams['auth_host'], configDict=newParams)

        self.assertEqual(getattr(rucio, "phedexCompat"), False)
        for key in newKeys:
            self.assertEqual(getattr(rucio.cli, key), newParams[key])
Example #14
class MSMonitor(MSCore):
    """
    MSMonitor class provides the whole logic behind
    the transferor monitoring module.
    """

    def __init__(self, msConfig, logger=None):
        super(MSMonitor, self).__init__(msConfig, logger=logger)
        # update interval is used to check records in CouchDB and update them
        # after this interval, default 6h
        self.updateInterval = self.msConfig.get('updateInterval', 6 * 60 * 60)
        self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                           hostUrl=self.msConfig['rucioUrl'],
                           authUrl=self.msConfig['rucioAuthUrl'],
                           configDict={"logger": self.logger, "user_agent": "WMCore-MSMonitor"})

    def updateCaches(self):
        """
        Fetch some data required for the monitoring logic, e.g.:
         * all campaign configuration
         * all transfer records from backend DB
        :return: True if all of them succeeded, else False
        """
        campaigns = self.reqmgrAux.getCampaignConfig("ALL_DOCS")
        transferRecords = self.reqmgrAux.getTransferInfo('ALL_DOCS')
        cdict = {}
        if not campaigns:
            self.logger.warning("Failed to fetch campaign configurations")
        if not transferRecords:
            self.logger.warning("Failed to fetch transfer records")
        else:
            for camp in campaigns:
                cdict[camp['CampaignName']] = camp
        return cdict, transferRecords

    def filterTransferDocs(self, requests, transferDocs):
        """
        Given a list of requests in the `staging` status and all the
        transfer documents; select the transfer documents that:
         * match against a workflow in requests
         * haven't been updated over the last updateInterval seconds
        :param requests: list of workflow names
        :param transferDocs: list of transfer documents
        :return: a filtered out list of transfer documents
        """
        now = time.time()
        newTransferDocs = []
        self.logger.info("Matching %d requests to %d transfer documents...",
                         len(requests), len(transferDocs))
        for record in transferDocs:
            if record['workflowName'] in requests:
                if now - record['lastUpdate'] > self.updateInterval:
                    newTransferDocs.append(record)
        msg = "Only %d transfer documents passed the status and timestamp filter."
        self.logger.info(msg, len(newTransferDocs))
        return newTransferDocs

    def execute(self, reqStatus):
        """
        Executes the MS monitoring logic, see
        https://github.com/dmwm/WMCore/wiki/ReqMgr2-MicroService-Monitor

        :param reqStatus: request status to process
        :return: a summary of the activity of the last cycle
        """
        summary = dict(MONITOR_REPORT)
        try:
            # get requests from ReqMgr2 data-service for given status
            # here with detail=False we get back list of records
            requests = self.reqmgr2.getRequestByStatus([reqStatus], detail=False)
            self.logger.info('  retrieved %s requests in status: %s', len(requests), reqStatus)

            campaigns, transferRecords = self.updateCaches()
            self.updateReportDict(summary, "total_num_campaigns", len(campaigns))
            self.updateReportDict(summary, "total_num_transfers", len(transferRecords))
            if not campaigns or not transferRecords:
                # then wait until the next cycle
                msg = "Failed to fetch data from one of the data sources. Retrying again in the next cycle"
                self.logger.error(msg)
                self.updateReportDict(summary, "error", msg)
                return summary
            transferRecords = self.filterTransferDocs(requests, transferRecords)
            self.updateReportDict(summary, "filtered_transfer_docs", len(transferRecords))
        except Exception as ex:  # general error
            msg = 'Unknown exception bootstrapping the MSMonitor thread. Error: %s' % str(ex)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
            return summary

        try:
            # keep track of request and their new statuses
            skippedWorkflows = self.getTransferInfo(transferRecords)
            requestsToStage = self.getCompletedWorkflows(transferRecords, campaigns)
            failedDocs = self.updateTransferDocs(transferRecords, skippedWorkflows)
            self.updateReportDict(summary, "success_transfer_doc_update",
                                  len(transferRecords) - len(failedDocs) - len(skippedWorkflows))
            self.updateReportDict(summary, "failed_transfer_doc_update", len(failedDocs))
            # finally, update statuses for requests
            for reqName in requestsToStage:
                if reqName in failedDocs:
                    msg = "Can't proceed with status transition for %s, because" % reqName
                    msg += "the transfer document failed to get updated"
                    self.logger.warning(msg)
                    continue
                self.change(reqName, 'staged', self.__class__.__name__)
            self.updateReportDict(summary, "request_status_updated",
                                  summary['success_transfer_doc_update'] - summary['failed_transfer_doc_update'])
            msg = "%s processed %d transfer records, where " % (self.__class__.__name__, len(transferRecords))
            msg += "%d completed their data transfers, " % len(requestsToStage)
            msg += "%d failed to contact the DM system and were skipped in this cycle and " % len(skippedWorkflows)
            msg += "%d failed to get their transfer documents updated in CouchDB." % len(failedDocs)
            self.logger.info(msg)
        except Exception as ex:
            msg = "Unknown exception processing the transfer records. Error: %s" % str(ex)
            self.logger.exception(msg)
            self.updateReportDict(summary, "error", msg)
        return summary

    def getTransferInfo(self, transferRecords):
        """
        Contact the data management tool in order to get a status
        update for the transfer request.
        :param transferRecords: list of transfer records
        :return skippedWorkflows: a list of workflow names for which a call to the data
        management system did not succeed
        """
        # FIXME: create concurrent rucio calls using multi_getdata
        skippedWorkflows = []
        tstamp = int(time.time())
        for doc in transferRecords:
            self.logger.debug("Checking transfers for: %s", doc['workflowName'])
            if not doc['transfers']:
                # nothing to be done, simply update the document last timestamp
                doc['lastUpdate'] = tstamp
                continue

            try:
                for rec in doc['transfers']:
                    # obtain new transfer ids and completion for given dataset
                    completion = self._getRucioTransferstatus(rec['transferIDs'])
                    rec['completion'].append(round(completion, 3))
                doc['lastUpdate'] = tstamp
            except Exception as exc:
                msg = "Unknown exception checking workflow %s. Error: %s"
                self.logger.exception(msg, doc['workflowName'], str(exc))
                skippedWorkflows.append(doc['workflowName'])
        return skippedWorkflows

    def _getRucioTransferstatus(self, rulesList):
        """
        Given a list of Rucio rules ID - for a given input data - check the
        overall transfer status from Rucio
        :param rulesList: list of rules ID
        :return: the overall transfers percent completion

        The Rucio getRule API returns data in the form of:
            {u'account': u'transfer_ops',
             u'grouping': u'ALL',
             u'id': u'40cbe787a42b4f6e991611f6fac3bb11',
             u'locked': True,
             u'locks_ok_cnt': 8,
             u'locks_replicating_cnt': 0,
             u'locks_stuck_cnt': 0,
             u'meta': None,
             etc etc
        NOTE: completion in Rucio is different from PhEDEx: PhEDEx gives a
        percentage value, while Rucio gives a ratio (0 - 1).
        """
        completion = []
        for ruleID in rulesList:
            # if we query by dataset and the subscription was at block level,
            # we get an empty response. So always wildcard the block parameter
            data = self.rucio.getRule(ruleID)
            if not data:
                msg = "Failed to retrieve rule information from Rucio for rule ID: {}".format(ruleID)
                raise RuntimeError(msg)

            if data['state'] == "OK":
                lockCompletion = 100.0
            else:
                totalLocks = data['locks_ok_cnt'] + data['locks_replicating_cnt'] + data['locks_stuck_cnt']
                try:
                    lockCompletion = (data['locks_ok_cnt'] / totalLocks) * 100
                except ZeroDivisionError:
                    self.logger.warning("Rule does not have any lock counts yet. Rule data: %s", data)
                    lockCompletion = 0
            completion.append(lockCompletion)
            self.logger.info("Rule ID: %s has a completion rate of: %s%%", ruleID, lockCompletion)
            self.logger.debug("Rule ID: %s, DID: %s, state: %s, grouping: %s, rse_expression: %s",
                              ruleID, data['name'], data['state'], data['grouping'], data['rse_expression'])
        if not completion:
            return 0
        return sum(completion) / len(completion)


    def getCompletedWorkflows(self, transfers, campaigns):
        """
        Parse the transfer documents, compare against the campaign settings
        and decide whether the workflow is completed or not.
        :param transfers: list of transfers records
        :param campaigns: dictionary of campaigns
        :return: a list of workflow names whose transfers are completed
        """
        completedWfs = []
        for record in transfers:
            reqName = record['workflowName']
            if not record['transfers']:
                self.logger.info("%s OK, no input data transfers, move it on.", reqName)
                completedWfs.append(reqName)
                continue
            # check completion of all transfers
            statuses = []
            for transfer in record['transfers']:
                cdict = campaigns[transfer['campaignName']]
                # compare against the last completion number, which is from the last cycle execution
                if transfer['completion'][-1] >= cdict['PartialCopy'] * 100:
                    status = 1
                else:
                    status = 0
                statuses.append(status)
            if all(statuses):
                self.logger.info("%s OK, all transfers completed or above threshold, move it on.", reqName)
                completedWfs.append(reqName)
        return completedWfs

    def updateTransferDocs(self, docs, workflowsToSkip):
        """
        Given a list of transfer documents, update all of them in
        ReqMgrAux database.
        :param docs: list of transfer docs
        :param workflowsToSkip: list of workflow names that should not be updated in CouchDB
        :return: a list of request names that failed to be updated
        """
        failedWfs = []
        for rec in docs:
            if rec['workflowName'] in workflowsToSkip:
                self.logger.warning("Not updating transfer record in CouchDB for: %s", rec['workflowName'])
                continue
            if not self.reqmgrAux.updateTransferInfo(rec['workflowName'], rec):
                # then it failed to update the doc, ReqMgrAux client is logging it already
                failedWfs.append(rec['workflowName'])
        return failedWfs
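
To make the completion arithmetic in _getRucioTransferstatus concrete, here is a worked example using the lock counts from the docstring sample (8 OK, 0 replicating, 0 stuck):

locks_ok_cnt, locks_replicating_cnt, locks_stuck_cnt = 8, 0, 0
totalLocks = locks_ok_cnt + locks_replicating_cnt + locks_stuck_cnt
lockCompletion = (float(locks_ok_cnt) / totalLocks) * 100  # 100.0 percent
# a rule with 6 OK and 2 stuck locks would instead yield (6.0 / 8) * 100 = 75.0
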
Example #15
def loggerSetup(logLevel=logging.INFO):
    """
    Return a logger which writes everything to stdout.
    """
    logger = logging.getLogger(__name__)
    outHandler = logging.StreamHandler(sys.stdout)
    outHandler.setFormatter(logging.Formatter("%(asctime)s:%(levelname)s:%(module)s: %(message)s"))
    outHandler.setLevel(logLevel)
    logger.addHandler(outHandler)
    logger.setLevel(logLevel)
    return logger


if __name__ == '__main__':
    args = parseArgs()
    logger = loggerSetup()

    rucio = Rucio(acct=RUCIO_ACCT, hostUrl=RUCIO_URL, authUrl=RUCIO_AUTH_URL,
                  configDict={"logger": logger, "user_agent": "amaltaro/makeRucioRules"})
    rule = {'copies': 1,
            'activity': 'Production Input',
            'lifetime': None,
            'account': RUCIO_ACCT,
            'grouping': "ALL",
            'comment': 'WMCore MSTransferor input data placement'}
    logger.info("\nCreating rule for DID: %s, with RSE: %s and other attrs: %s",
                args.container, args.rse, rule)
    resp = rucio.createReplicationRule(args.container, args.rse, **rule)
    logger.info("Response: %s", resp)
Example #16
class PileupFetcher(FetcherInterface):
    """
    Pull dataset block/SE : LFN list from DBS for the
    pileup datasets required by the steps in the job.

    Save these maps as files in the sandbox

    """
    def __init__(self):
        """
        Prepare module setup
        """
        super(PileupFetcher, self).__init__()
        # FIXME: find a way to pass the Rucio account name to this fetcher module
        self.rucioAcct = "wmcore_transferor"
        self.rucio = Rucio(self.rucioAcct)

    def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader):
        """
        Method iterates over components of the pileup configuration input
        and queries DBS for valid files in the dataset, plus some extra
        information about each file.

        Information is organized at block level, listing all its files,
        number of events in the block, and its data location (to be resolved
        by a different method using either PhEDEx or Rucio), such as:

        {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": [], "NumberOfEvents": 123},
                         "BlockB": {"FileList": [], "PhEDExNodeName": []}, ....}
        """
        resultDict = {}
        # iterate over input pileup types (e.g. "cosmics", "minbias")
        for pileupType in stepHelper.data.pileup.listSections_():
            # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set]
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset")
            # each dataset input can generally be a list, iterate over dataset names
            blockDict = {}
            for dataset in datasets:

                for fileInfo in dbsReader.getFileListByDataset(dataset=dataset, detail=True):
                    blockDict.setdefault(fileInfo['block_name'], {'FileList': [],
                                                                  'NumberOfEvents': 0,
                                                                  'PhEDExNodeNames': []})
                    blockDict[fileInfo['block_name']]['FileList'].append(fileInfo['logical_file_name'])
                    blockDict[fileInfo['block_name']]['NumberOfEvents'] += fileInfo['event_count']

                self._getDatasetLocation(dataset, blockDict)

            resultDict[pileupType] = blockDict
        return resultDict

    def _getDatasetLocation(self, dset, blockDict):
        """
        Given a dataset name, query PhEDEx or Rucio and resolve the block location
        :param dset: string with the dataset name
        :param blockDict: dictionary with DBS summary info
        :return: update blockDict in place
        """
        blockReplicas = self.rucio.getPileupLockedAndAvailable(dset, account=self.rucioAcct)
        for blockName, blockLocation in viewitems(blockReplicas):
            try:
                blockDict[blockName]['PhEDExNodeNames'] = list(blockLocation)
            except KeyError:
                logging.warning("Block '%s' present in Rucio but not in DBS", blockName)

    def _getCacheFilePath(self, stepHelper):

        fileName = ""
        for pileupType in stepHelper.data.pileup.listSections_():
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType), "dataset")
            fileName += ("_").join(datasets)
        # TODO: the cache is not very effective if the dataset combination differs between workflows
        # there is also a possibility of hash value collision
        cacheFile = "%s/pileupconf-%s.json" % (self.cacheDirectory(), hash(fileName))
        return cacheFile

    def _getStepFilePath(self, stepHelper):
        stepPath = "%s/%s" % (self.workingDirectory(), stepHelper.name())
        fileName = "%s/%s" % (stepPath, "pileupconf.json")

        return fileName

    def _writeFile(self, filePath, jsonPU):

        directory = filePath.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        try:
            with open(filePath, 'w') as f:
                f.write(jsonPU)
        except IOError:
            m = "Could not save pileup JSON configuration file: '%s'" % filePath
            raise RuntimeError(m)

    def _copyFile(self, src, dest):

        directory = dest.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        shutil.copyfile(src, dest)

    def _isCacheExpired(self, cacheFilePath, delta=24):
        """Is the cache expired? At delta hours (default 24) in the future.
        """
        # cache can either be a file name or an already opened file object

        if not os.path.exists(cacheFilePath):
            return True

        delta = datetime.timedelta(hours=delta)
        t = datetime.datetime.now() - delta
        # cache file mtime has been set to cache expiry time
        if os.path.getmtime(cacheFilePath) < time.mktime(t.timetuple()):
            return True

        return False

    def _isCacheValid(self, stepHelper):
        """
        Check whether the cache exists
        TODO: if the cacheDirectory is not inside the Sandbox, it should not be automatically deleted.
              We can add a cache refresh policy here
        """
        cacheFile = self._getCacheFilePath(stepHelper)

        if not self._isCacheExpired(cacheFile, delta=0.5) and os.path.getsize(cacheFile) > 0:
            # if the file already exists, don't make a new DBS call and overwrite it.
            # just return
            fileName = self._getStepFilePath(stepHelper)
            if not os.path.isfile(fileName) or os.path.getsize(fileName) != os.path.getsize(cacheFile):
                self._copyFile(cacheFile, fileName)
            return True
        else:
            return False

    def _saveFile(self, stepHelper, jsonPU):

        cacheFile = self._getCacheFilePath(stepHelper)
        self._writeFile(cacheFile, jsonPU)
        fileName = self._getStepFilePath(stepHelper)
        self._copyFile(cacheFile, fileName)

    def createPileupConfigFile(self, helper):
        """
        Stores pileup JSON configuration file in the working
        directory / sandbox.

        """
        if self._isCacheValid(helper):
            # if the file already exists, don't make a new DBS call and overwrite it.
            # just return
            return

        encoder = JSONEncoder()
        # this should have been set in CMSSWStepHelper along with
        # the pileup configuration
        url = helper.data.dbsUrl
        dbsReader = DBSReader(url)

        configDict = self._queryDbsAndGetPileupConfig(helper, dbsReader)

        # create JSON and save into a file
        jsonPU = encoder.encode(configDict)
        self._saveFile(helper, jsonPU)

    def __call__(self, wmTask):
        """
        Method is called when WorkQueue creates the sandbox for a job.
        Need to look at the pileup configuration in the spec and query dbs to
        determine the lfns for the files in the datasets and what sites they're
        located at (WQ creates the job sandbox).

        wmTask is instance of WMTask.WMTaskHelper

        """
        for step in wmTask.steps().nodeIterator():
            helper = WMStep.WMStepHelper(step)
            # returns e.g. instance of CMSSWHelper
            # doesn't seem to be necessary ... strangely (some inheritance involved?)
            # typeHelper = helper.getTypeHelper()
            if hasattr(helper.data, "pileup"):
                self.createPileupConfigFile(helper)
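
Putting the _queryDbsAndGetPileupConfig docstring together with _getDatasetLocation, the pileupconf.json payload written to the sandbox looks roughly like the sketch below; all names and numbers are illustrative.

{
    "cosmics": {
        "/Prim/Proc-v1/TIER#block-a": {
            "FileList": ["/store/data/run1/file1.root"],
            "NumberOfEvents": 12345,
            "PhEDExNodeNames": ["T1_US_FNAL_Disk"]
        }
    },
    "minbias": {}
}
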
Example #17
class RucioTest(EmulatedUnitTestCase):
    """
    Unit tests for Rucio Service module
    """
    def __init__(self, methodName='runTest'):
        # TODO figure out what's going on with CRIC mock
        super(RucioTest, self).__init__(methodName=methodName, mockCRIC=False)

        self.acct = "wmagent_testing"

        # HACK: do not verify the SSL certificate because docker images
        # do not contain the CA certificate bundle
        # Relying on the config file in the jenkins infrastructure is a PITA
        # so let's make sure to pass all the necessary arguments
        self.creds = {
            "client_cert": os.getenv("X509_USER_CERT", "Unknown"),
            "client_key": os.getenv("X509_USER_KEY", "Unknown")
        }

        self.defaultArgs = {
            "host": 'http://cms-rucio-dev.cern.ch',
            "auth_host": 'https://cms-rucio-auth-dev.cern.ch',
            "auth_type": "x509",
            "account": self.acct,
            "ca_cert": False,
            "timeout": 30,
            "request_retries": 3,
            "creds": self.creds
        }

    def setUp(self):
        """
        Setup for unit tests
        """
        super(RucioTest, self).setUp()

        self.myRucio = Rucio(self.acct,
                             hostUrl=self.defaultArgs['host'],
                             authUrl=self.defaultArgs['auth_host'],
                             configDict=self.defaultArgs)

        self.client = testClient(rucio_host=self.defaultArgs['host'],
                                 auth_host=self.defaultArgs['auth_host'],
                                 account=self.acct,
                                 ca_cert=self.defaultArgs['ca_cert'],
                                 auth_type=self.defaultArgs['auth_type'],
                                 creds=self.defaultArgs['creds'],
                                 timeout=self.defaultArgs['timeout'])

    def tearDown(self):
        """
        Nothing to be done for this case
        """
        pass

    def testConfig(self):
        """
        Test service attributes and the override mechanism
        """
        for key in self.defaultArgs:
            self.assertEqual(getattr(self.myRucio.cli, key),
                             self.defaultArgs[key])
        self.assertTrue(
            getattr(self.myRucio.cli,
                    "user_agent").startswith("wmcore-client/"))
        self.assertTrue(
            getattr(self.client, "user_agent").startswith("rucio-clients/"))

        newParams = {
            "host": 'http://cms-rucio-dev.cern.ch',
            "auth_host": 'https://cms-rucio-auth-dev.cern.ch',
            "auth_type": "x509",
            "account": self.acct,
            "ca_cert": False,
            "timeout": 5,
            "phedexCompatible": False
        }
        newKeys = list(newParams.keys())
        newKeys.remove("phedexCompatible")

        rucio = Rucio(newParams['account'],
                      hostUrl=newParams['host'],
                      authUrl=newParams['auth_host'],
                      configDict=newParams)

        self.assertEqual(getattr(rucio, "phedexCompat"), False)
        for key in newKeys:
            self.assertEqual(getattr(rucio.cli, key), newParams[key])

    def testGetAccount(self):
        """
        Test whether we can fetch data about a specific rucio account
        """
        res = self.client.get_account(self.acct)
        res2 = self.myRucio.getAccount(self.acct)
        self.assertEqual(res['account'], self.acct)
        self.assertEqual(res['status'], "ACTIVE")
        self.assertEqual(res['account_type'], "USER")
        self.assertTrue({"status", "account",
                         "account_type"}.issubset(set(res2.keys())))
        self.assertTrue({self.acct, "ACTIVE",
                         "USER"}.issubset(set(res2.values())))

    # @attr('integration')
    def testWhoAmI(self):
        """
        Test user mapping information from the request headers
        """
        res = dict(self.client.whoami())
        res2 = dict(self.myRucio.whoAmI())
        self.assertTrue({"status", "account"}.issubset(set(res.keys())))
        self.assertTrue(set(res.keys()) == set(res2.keys()))

    def testPing(self):
        """
        Tests server ping
        """
        res = self.client.ping()
        res2 = self.myRucio.pingServer()
        self.assertTrue("version" in res)
        self.assertItemsEqual(res, res2)

    def testGetBlocksInContainer(self):
        """
        Test `getBlocksInContainer` method, the ability to retrieve blocks
        inside a container.
        """
        # test a CMS dataset that does not exist
        res = self.myRucio.getBlocksInContainer("Alan")
        self.assertEqual(res, [])

        # provide a CMS block instead of a dataset
        res = self.myRucio.getBlocksInContainer(BLOCK)
        self.assertEqual(res, [])

        # finally provide a real CMS dataset
        res = self.myRucio.getBlocksInContainer(DSET)
        self.assertTrue(len(res) >= len([BLOCK]))
        self.assertIn(BLOCK, res)

    def testGetReplicaInfoForBlocks(self):
        """
        Test `getReplicaInfoForBlocks` method, the ability to retrieve replica
        locations provided a dataset or block. Same output as PhEDEx.
        """
        res = self.myRucio.getReplicaInfoForBlocks(block=BLOCK)
        self.assertEqual(len(res['phedex']['block']), 1)
        block = res['phedex']['block'].pop()
        self.assertEqual(block['name'], BLOCK)
        replicas = [item['node'] for item in block['replica']]
        self.assertTrue(len(replicas) > 0)

        # same test, but providing a dataset as input (which has 4 blocks)
        res = self.myRucio.getReplicaInfoForBlocks(dataset=DSET)
        self.assertTrue(len(res['phedex']['block']) >=
                        1)  # at this very moment, there are 11 replicas
        blocks = [item['name'] for item in res['phedex']['block']]
        self.assertTrue(BLOCK in blocks)
        for item in res['phedex']['block']:
            self.assertTrue(len(item['replica']) > 0)

    def testGetReplicaInfoForBlocksRucio(self):
        """
        Test `getReplicaInfoForBlocks` method, this time without the
        PhEDEx-compatible output format
        """
        theseArgs = self.defaultArgs.copy()
        theseArgs['phedexCompatible'] = False
        myRucio = Rucio(self.acct,
                        hostUrl=theseArgs['host'],
                        authUrl=theseArgs['auth_host'],
                        configDict=theseArgs)

        res = myRucio.getReplicaInfoForBlocks(dataset=DSET)
        self.assertTrue(isinstance(res, list))
        self.assertTrue(
            len(res) >= 1)  # at this very moment, there are 11 replicas
        blocks = [item['name'] for item in res]
        self.assertTrue(BLOCK in blocks)
        for item in res:
            self.assertTrue(len(item['replica']) > 0)

    def testGetPFN(self):
        """
        Test `getPFN` method
        """
        self.assertRaises(NotImplementedError, self.myRucio.getPFN)
Example #18
class RucioTest(EmulatedUnitTestCase):
    """
    Unit tests for Rucio Service module
    """
    def __init__(self, methodName='runTest'):
        # TODO figure out what's going on with CRIC mock
        super(RucioTest, self).__init__(methodName=methodName, mockCRIC=False)

        self.acct = "wma_test"

        # HACK: do not verify the SSL certificate because docker images
        # do not contain the CA certificate bundle
        # Relying on the config file in the jenkins infrastructure is a PITA
        # so let's make sure to pass all the necessary arguments
        self.creds = {
            "client_cert": os.getenv("X509_USER_CERT", "Unknown"),
            "client_key": os.getenv("X509_USER_KEY", "Unknown")
        }

        self.defaultArgs = {
            "host": 'http://cms-rucio-int.cern.ch',
            "auth_host": 'https://cms-rucio-auth-int.cern.ch',
            "auth_type": "x509",
            "account": self.acct,
            "ca_cert": False,
            "timeout": 30,
            "request_retries": 3,
            "creds": self.creds
        }

    def setUp(self):
        """
        Setup for unit tests
        """
        super(RucioTest, self).setUp()

        self.myRucio = Rucio(self.acct,
                             hostUrl=self.defaultArgs['host'],
                             authUrl=self.defaultArgs['auth_host'],
                             configDict=self.defaultArgs)

        self.client = testClient(rucio_host=self.defaultArgs['host'],
                                 auth_host=self.defaultArgs['auth_host'],
                                 account=self.acct,
                                 ca_cert=self.defaultArgs['ca_cert'],
                                 auth_type=self.defaultArgs['auth_type'],
                                 creds=self.defaultArgs['creds'],
                                 timeout=self.defaultArgs['timeout'])

    def tearDown(self):
        """
        Nothing to be done for this case
        """
        pass

    def testConfig(self):
        """
        Test service attributes and the override mechanism
        """
        for key in self.defaultArgs:
            self.assertEqual(getattr(self.myRucio.cli, key),
                             self.defaultArgs[key])
        self.assertTrue(
            getattr(self.myRucio.cli,
                    "user_agent").startswith("wmcore-client/"))
        self.assertTrue(
            getattr(self.client, "user_agent").startswith("rucio-clients/"))

        newParams = {
            "host": 'http://cms-rucio-int.cern.ch',
            "auth_host": 'https://cms-rucio-auth-int.cern.ch',
            "auth_type": "x509",
            "account": self.acct,
            "ca_cert": False,
            "timeout": 5,
            "phedexCompatible": False
        }
        newKeys = list(newParams.keys())
        newKeys.remove("phedexCompatible")

        rucio = Rucio(newParams['account'],
                      hostUrl=newParams['host'],
                      authUrl=newParams['auth_host'],
                      configDict=newParams)

        self.assertEqual(getattr(rucio, "phedexCompat"), False)
        for key in newKeys:
            self.assertEqual(getattr(rucio.cli, key), newParams[key])

    def testGetAccount(self):
        """
        Test whether we can fetch data about a specific rucio account
        """
        res = self.client.get_account(self.acct)
        res2 = self.myRucio.getAccount(self.acct)
        self.assertEqual(res['account'], self.acct)
        self.assertEqual(res['status'], "ACTIVE")
        self.assertEqual(res['account_type'], "USER")
        self.assertTrue({"status", "account",
                         "account_type"}.issubset(set(res2.keys())))
        self.assertTrue({self.acct, "ACTIVE",
                         "USER"}.issubset(set(res2.values())))

    def testGetAccountUsage(self):
        """
        Test whether we can fetch data about a specific rucio account
        """
        res = list(self.client.get_account_usage(self.acct))
        res2 = self.myRucio.getAccountUsage(self.acct)
        # I have manually created a rule for this account, so it will be there...
        self.assertEqual(res, res2)

        # now test against an account that either does not exist or that we cannot access
        res = self.myRucio.getAccountUsage("admin")
        self.assertIsNone(res)

    # @attr('integration')
    def testWhoAmI(self):
        """
        Test user mapping information from the request headers
        """
        res = dict(self.client.whoami())
        res2 = dict(self.myRucio.whoAmI())
        self.assertTrue({"status", "account"}.issubset(set(res.keys())))
        self.assertTrue(set(res.keys()) == set(res2.keys()))

    def testPing(self):
        """
        Tests server ping
        """
        res = self.client.ping()
        res2 = self.myRucio.pingServer()
        self.assertTrue("version" in res)
        self.assertItemsEqual(res, res2)

    def testGetBlocksInContainer(self):
        """
        Test `getBlocksInContainer` method, the ability to retrieve blocks
        inside a container.
        """
        # test a CMS dataset that does not exist
        res = self.myRucio.getBlocksInContainer("Alan")
        self.assertEqual(res, [])

        # provide a CMS block instead of a dataset
        res = self.myRucio.getBlocksInContainer(BLOCK)
        self.assertEqual(res, [])

        # finally provide a real CMS dataset
        res = self.myRucio.getBlocksInContainer(DSET)
        self.assertTrue(len(res) >= len([BLOCK]))
        self.assertIn(BLOCK, res)

    def testGetReplicaInfoForBlocks(self):
        """
        Test `getReplicaInfoForBlocks` method, the ability to retrieve replica
        locations provided a dataset or block. Same output as PhEDEx.
        """
        res = self.myRucio.getReplicaInfoForBlocks(block=BLOCK)
        self.assertEqual(len(res['phedex']['block']), 1)
        block = res['phedex']['block'].pop()
        self.assertEqual(block['name'], BLOCK)
        replicas = [item['node'] for item in block['replica']]
        self.assertTrue(len(replicas) > 0)

        # same test, but providing a dataset as input (which has 4 blocks)
        res = self.myRucio.getReplicaInfoForBlocks(dataset=DSET)
        self.assertTrue(len(res['phedex']['block']) >=
                        1)  # at this very moment, there are 11 replicas
        blocks = [item['name'] for item in res['phedex']['block']]
        self.assertTrue(BLOCK in blocks)
        for item in res['phedex']['block']:
            self.assertTrue(len(item['replica']) > 0)

    def testGetReplicaInfoForBlocksRucio(self):
        """
        Test `getReplicaInfoForBlocks` method, this time without the
        PhEDEx-compatible output format
        """
        theseArgs = self.defaultArgs.copy()
        theseArgs['phedexCompatible'] = False
        myRucio = Rucio(self.acct,
                        hostUrl=theseArgs['host'],
                        authUrl=theseArgs['auth_host'],
                        configDict=theseArgs)

        res = myRucio.getReplicaInfoForBlocks(dataset=DSET)
        self.assertTrue(isinstance(res, list))
        self.assertTrue(
            len(res) >= 1)  # at this very moment, there are 11 replicas
        blocks = [item['name'] for item in res]
        self.assertTrue(BLOCK in blocks)
        for item in res:
            self.assertTrue(len(item['replica']) > 0)

    def testGetPFN(self):
        """
        Test `getPFN` method
        """
        self.assertRaises(NotImplementedError, self.myRucio.getPFN)

    def testListContent(self):
        """
        Test `listContent` method, to list content of a given DID
        """
        # listing blocks for a dataset
        res = self.myRucio.listContent(DSET)
        self.assertTrue(len(res) > 10)
        self.assertEqual(res[0]["type"], "DATASET")

        # listing files for a block
        res = self.myRucio.listContent(BLOCK)
        self.assertTrue(len(res) > 10)
        self.assertEqual(res[0]["type"], "FILE")

        res = self.myRucio.listContent("/Primary/ProcStr-v1/tier")
        self.assertItemsEqual(res, [])

    def testListDataRules(self):
        """
        Test `listDataRules` method
        """
        res = self.myRucio.listDataRules(DSET)
        self.assertItemsEqual(res, [])

    def testGetRule(self):
        """
        Test `getRule` method
        """
        # Badly formatted rule id, raises/catches a general exception
        res = self.myRucio.getRule("blah")
        self.assertItemsEqual(res, {})

        # Properly formatted rule, but nonexistent id
        res = self.myRucio.getRule("1d6ea1d916d5492e81b1bb30ed4aebc0")
        self.assertItemsEqual(res, {})

        # Properly formatted rule, rule manually created
        res = self.myRucio.getRule("1d6ea1d916d5492e81b1bb30ed4aebc1")
        self.assertTrue(res)

    def testMetaDataValidation(self):
        """
        Test the `validateMetaData` validation function
        """
        for thisProj in RUCIO_VALID_PROJECT:
            response = validateMetaData("any_DID_name", dict(project=thisProj),
                                        self.myRucio.logger)
            self.assertTrue(response)

        # test with no "project" meta data at all
        response = validateMetaData("any_DID_name", dict(),
                                    self.myRucio.logger)
        self.assertTrue(response)

        # now an invalid "project" meta data
        response = validateMetaData("any_DID_name", dict(project="mistake"),
                                    self.myRucio.logger)
        self.assertFalse(response)
Example #19
class RucioInjectorPoller(BaseWorkerThread):
    """
    _RucioInjectorPoller_

    Poll the DBSBuffer database and inject files as they are created.

    The logic of this component is:
      * create a rucio container (or reuse a pre-existent one)
      * create a CMS block (or reuse a pre-existent one), block gets automatically attached
      * create file/replicas, which get automatically attached to its block as well
      * now create a CMS block rule to protect this data
      * if the block has been inserted into DBS, close the block in Rucio

    In addition to that, it has logic for rucio container subscription (rule creation),
    and block rule removal. Those follow a different polling cycle though.
    """
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        self.enabled = config.RucioInjector.enabled
        # dataset rule creation has a larger polling cycle
        self.pollRules = config.RucioInjector.pollIntervalRules
        self.lastRulesExecTime = 0
        self.createBlockRules = config.RucioInjector.createBlockRules
        self.skipRulesForTiers = config.RucioInjector.skipRulesForTiers
        self.listTiersToInject = config.RucioInjector.listTiersToInject

        # setup caches for containers and blocks (the container cache lives 3x longer, ~6 days now)
        self.containersCache = MemoryCache(
            config.RucioInjector.cacheExpiration * 3, set())
        self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration,
                                       set())

        self.scope = getattr(config.RucioInjector, "scope", "cms")
        self.rucioAcct = config.RucioInjector.rucioAccount
        self.rucio = Rucio(acct=self.rucioAcct,
                           hostUrl=config.RucioInjector.rucioUrl,
                           authUrl=config.RucioInjector.rucioAuthUrl,
                           configDict={'logger': self.logger})

        # metadata dictionary information to be added to block/container rules
        # cannot be a python dictionary, but a JSON string instead
        self.metaData = json.dumps(
            dict(agentHost=config.Agent.hostName,
                 userAgent=config.Agent.agentName))

        self.testRSEs = config.RucioInjector.RSEPostfix
        self.filesToRecover = []

        logging.info(
            "Component configured to only inject data for data tiers: %s",
            self.listTiersToInject)
        logging.info(
            "Component configured to skip container rule creation for data tiers: %s",
            self.skipRulesForTiers)
        logging.info("Component configured to create block rules: %s",
                     self.createBlockRules)

    def setup(self, parameters):
        """
        _setup_

        Create DAO Factory and setup some DAO.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package="WMComponent.RucioInjector.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)

        self.getUninjected = daofactory(classname="GetUninjectedFiles")
        self.getMigrated = daofactory(classname="GetMigratedBlocks")

        self.getUnsubscribedBlocks = daofactory(
            classname="GetUnsubscribedBlocks")
        self.setBlockRules = daofactory(classname="SetBlocksRule")

        self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks")
        self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted")
        self.getUnsubscribedDsets = daofactory(
            classname="GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname="MarkDatasetSubscribed")

        daofactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=self.logger,
                                dbinterface=myThread.dbi)
        self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus")
        self.setBlockClosed = daofactory(classname="SetBlockClosed")

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for uninjected files and inject them into Rucio.
        """
        if not self.enabled:
            logging.info(
                "RucioInjector component is disabled in the configuration, exiting."
            )
            return

        logging.info("Running Rucio injector poller algorithm...")

        try:
            # files that failed to get their status updated in dbsbuffer
            self._updateLFNState(self.filesToRecover, recovery=True)

            # get dbsbuffer_file.in_phedex = 0
            uninjectedFiles = self.getUninjected.execute()

            # while we commission Rucio within WM, not all datatiers are supposed
            # to be injected by this component. Remove any data that we are not
            # meant to process!
            uninjectedFiles = filterDataByTier(uninjectedFiles,
                                               self.listTiersToInject)

            # create containers in rucio  (and update local cache)
            containersAdded = self.insertContainers(uninjectedFiles)
            if self.containersCache.isCacheExpired():
                self.containersCache.setCache(containersAdded)
            else:
                self.containersCache.addItemToCache(containersAdded)

            # create blocks. Only update the cache once a rule gets created...
            blocksAdded = self.insertBlocks(uninjectedFiles)
            if self.blocksCache.isCacheExpired():
                self.blocksCache.setCache(blocksAdded)
            else:
                self.blocksCache.addItemToCache(blocksAdded)

            # create file replicas
            self.insertReplicas(uninjectedFiles)

            # now close blocks already uploaded to DBS
            self.closeBlocks()

            # rule creation and block deletion follow their own, larger polling cycle
            if self.lastRulesExecTime + self.pollRules <= int(time.time()):
                # remember when rules were last evaluated, otherwise the longer
                # polling cycle would never take effect
                self.lastRulesExecTime = int(time.time())
                self.insertContainerRules()
                self.insertBlockRules()
                self.deleteBlocks()
        except Exception as ex:
            msg = "Caught unexpected exception in RucioInjector. Details:\n%s" % str(
                ex)
            logging.exception(msg)
            raise RucioInjectorException(msg)

        return

    def insertContainers(self, uninjectedData):
        """
        This method will insert containers into Rucio, provided they are not found in
        the local cache.
        :param uninjectedData: the same data structure as returned by the uninjected files query
        :return: set of containers successfully inserted into Rucio
        """
        logging.info("Preparing to insert containers into Rucio...")
        newContainers = set()
        for location in uninjectedData:
            for container in uninjectedData[location]:
                # same container can be at multiple locations
                if container not in self.containersCache and container not in newContainers:
                    if self.rucio.createContainer(container):
                        logging.info("Container %s inserted into Rucio",
                                     container)
                        newContainers.add(container)
                    else:
                        logging.error("Failed to create container: %s",
                                      container)
        logging.info("Successfully inserted %d containers into Rucio",
                     len(newContainers))
        return newContainers

    def insertBlocks(self, uninjectedData):
        """
        This method will insert blocks into Rucio and attach them to their corresponding
        containers. When attaching a block, we also need to provide the RSE where it
        will be available.
        :param uninjectedData: the same data structure as returned by the uninjected files query
        :return: a set of blocks successfully inserted into Rucio
        """
        logging.info("Preparing to insert blocks into Rucio...")
        newBlocks = set()
        for location in uninjectedData:
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    if block not in self.blocksCache:
                        if self.rucio.createBlock(block, rse=rseName):
                            logging.info("Block %s inserted into Rucio", block)
                            newBlocks.add(block)
                        else:
                            logging.error("Failed to create block: %s", block)
        logging.info("Successfully inserted %d blocks into Rucio", len(newBlocks))
        return newBlocks

    # TODO: this will likely go away once the phedex to rucio migration is over
    def _isBlockTierAllowed(self, blockName):
        """
        Performs a couple of checks on the block datatier, such as:
          * is the datatier supposed to be injected by this component
          * is the datatier supposed to get rules created by this component
        :return: True if the component can proceed with this block, False otherwise
        """
        endBlock = blockName.rsplit('/', 1)[1]
        endTier = endBlock.split('#')[0]
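        # e.g. /Primary/Processed/TIER#block-uuid -> datatier "TIER"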
        if endTier not in self.listTiersToInject:
            return False
        if endTier in self.skipRulesForTiers:
            return False
        return True

    def insertBlockRules(self):
        """
        Creates a simple replication rule for every single block that
        is under production in a given site/RSE.
        Also persist the rule ID in the database.
        """
        if not self.createBlockRules:
            return

        logging.info("Preparing to create block rules in Rucio...")

        unsubBlocks = self.getUnsubscribedBlocks.execute()

        for item in unsubBlocks:
            if not self._isBlockTierAllowed(item['blockname']):
                logging.debug(
                    "Component configured to skip block rule for: %s",
                    item['blockname'])
                continue
            rseName = "%s_Test" % item['pnn'] if self.testRSEs else item['pnn']
            # DATASET = replicates all files in the same block to the same RSE
            resp = self.rucio.createReplicationRule(
                item['blockname'],
                rseExpression="rse=%s" % rseName,
                account=self.rucioAcct,
                grouping="DATASET",
                comment="WMAgent production site",
                meta=self.metaData)
            if resp:
                msg = "Block rule created for block: %s, at: %s, with rule id: %s"
                logging.info(msg, item['blockname'], item['pnn'], resp[0])
                binds = {'RULE_ID': resp[0], 'BLOCKNAME': item['blockname']}
                self.setBlockRules.execute(binds)
            else:
                logging.error("Failed to create rule for block: %s at %s",
                              item['blockname'], rseName)
        return

    def insertReplicas(self, uninjectedData):
        """
        Inserts replicas into Rucio and attaches them to their specific block.
        If the insertion succeeds, also switches their database state to injected.

        :param uninjectedData: dictionary keyed by location, then container, then block
        """
        # FIXME: I think we need a different data struct from the database
        # this method is very expensive O(n^4)
        logging.info("Preparing to insert replicas into Rucio...")

        for location in uninjectedData.keys():
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    injectData = []
                    listLfns = []
                    for fileInfo in uninjectedData[location][container][block][
                            'files']:
                        listLfns.append(fileInfo['lfn'])
                        injectData.append(
                            dict(name=fileInfo['lfn'],
                                 scope=self.scope,
                                 bytes=fileInfo['size'],
                                 state="A",
                                 adler32=fileInfo['checksum']['adler32']))
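                    # replica state "A" marks each file as immediately AVAILABLE in Rucio;
                    # createReplicas also attaches the files to their block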

                    if self.rucio.createReplicas(rse=rseName,
                                                 files=injectData,
                                                 block=block):
                        logging.info(
                            "Successfully inserted %d files on block %s",
                            len(listLfns), block)
                        self._updateLFNState(listLfns)
        return

    def _updateLFNState(self, listLfns, recovery=False):
        """
        Given a list of LFNs, update their state in dbsbuffer table.
        :param listLfns: list of LFNs
        :param recovery: True if we are recovering previously injected files
        :return: nothing
        """
        if not listLfns:
            return
        try:
            self.setStatus.execute(listLfns, 1)
        except Exception as ex:
            # save them to retry the injection in the next cycle
            self.filesToRecover.extend(listLfns)
            if 'Deadlock found' in str(ex) or 'deadlock detected' in str(ex):
                logging.error(
                    "Deadlock during file status update. Retrying again in the next cycle."
                )
            else:
                msg = "Failed to update file status in the database, reason: %s" % str(
                    ex)
                logging.error(msg)
                raise RucioInjectorException(msg)
        else:
            if recovery:
                self.filesToRecover = []

    def closeBlocks(self):
        """
        Close any blocks that have been migrated to global DBS
        """
        logging.info("Starting closeBlocks method")

        # in short, dbsbuffer_file.in_phedex = 1 AND dbsbuffer_block.status = 'InDBS'
        migratedBlocks = self.getMigrated.execute()
        ### FIXME the data format returned by this DAO
        for location in migratedBlocks:
            for container in migratedBlocks[location]:
                if not self._isContainerTierAllowed(container,
                                                    checkRulesList=False):
                    continue
                for block in migratedBlocks[location][container]:
                    if self.rucio.closeBlockContainer(block):
                        self.setBlockClosed.execute(block)
                    else:
                        logging.error(
                            "Failed to close block: %s. Will retry again later.",
                            block)

    def deleteBlocks(self):
        """
        _deleteBlocks_
        Find deletable blocks, then decide whether to delete based on:
          * Is there an active subscription for the dataset or block?
            If yes => set deleted=2. If no => next check.
          * Has the transfer to all destinations finished?
            If yes => request block deletion, approve the request, set deleted=1.
            If no => do nothing (check again next cycle).
        """
        # FIXME: figure out the proper logic for rule block deletion
        logging.info("Starting deleteBlocks method --> IMPLEMENT-ME!!!")

    # TODO: this will likely go away once the phedex to rucio migration is over
    def _isContainerTierAllowed(self, containerName, checkRulesList=True):
        """
        It compares the container datatier name to check whether the component
        should inject data for it or not.
        In addition to that, it can also evaluate whether it's allowed to create
        rules for such datatier or not.
        :param containerName: string with the name of the container
        :param checkRulesList: boolean to check or not against the list of tiers
          to be skipped in the rule creation
        :return: True if the component can proceed with this container, False otherwise
        """
        endTier = containerName.rsplit('/', 1)[1]
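        # the datatier is the last path segment, e.g. /Primary/Processed/TIER -> "TIER"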
        if endTier not in self.listTiersToInject:
            return False
        if checkRulesList and endTier in self.skipRulesForTiers:
            return False
        return True

    def insertContainerRules(self):
        """
        _insertContainerRules_
        Poll the database for datasets meant to be subscribed and create
        a container level rule to replicate all files to a given RSE
        """
        logging.info("Starting insertContainerRules method")

        # FIXME also adapt the format returned by this DAO
        # Check for completely unsubscribed datasets
        # in short, files in phedex, file status in "GLOBAL" or "InDBS", and subscribed=0
        unsubscribedDatasets = self.getUnsubscribedDsets.execute()

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Create the subscription objects and add them to the list
        # The list takes care of the sorting internally
        for subInfo in unsubscribedDatasets:
            rse = subInfo['site']
            container = subInfo['path']
            if not self._isContainerTierAllowed(container):
                logging.debug(
                    "Component configured to skip container rule for: %s",
                    container)
                continue
            logging.info("Creating container rule for %s against RSE %s",
                         container, rse)

            rseName = "%s_Test" % rse if self.testRSEs else rse
            # ALL = replicates all files to the same RSE
            resp = self.rucio.createReplicationRule(
                container,
                rseExpression="rse=%s" % rseName,
                account=self.rucioAcct,
                grouping="ALL",
                comment="WMAgent automatic container rule",
                meta=self.metaData)
            if resp:
                logging.info("Container rule created for %s under rule id: %s",
                             container, resp)
                subscriptionsMade.append(subInfo['id'])
            else:
                logging.error("Failed to create rule for container: %s", container)

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade)

        return
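
The poller above leans on MemoryCache to avoid re-creating containers and blocks that were already injected: a full refresh when the cache expires, an incremental extension otherwise. As a rough illustration of that contract, here is a minimal time-based sketch exposing the three calls used in algorithm() plus membership tests; it is an assumption-laden stand-in, not the actual WMCore MemoryCache implementation:

import time


class SimpleMemoryCache(object):
    """Illustrative time-based cache mirroring the calls used above."""

    def __init__(self, expiration, initialData):
        self.expiration = expiration  # lifetime in seconds
        self.data = initialData  # e.g. an empty set()
        self.lastUpdate = time.time()

    def isCacheExpired(self):
        # True once the last full refresh is older than the expiration window
        return time.time() - self.lastUpdate > self.expiration

    def setCache(self, newData):
        # full refresh: replace the content and reset the clock
        self.data = newData
        self.lastUpdate = time.time()

    def addItemToCache(self, items):
        # incremental update: extend the content without resetting the expiry
        self.data.update(items)

    def __contains__(self, item):
        # supports the "container not in self.containersCache" checks above
        return item in self.data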
Example #20
class PileupFetcher(FetcherInterface):
    """
    Pull dataset block/SE : LFN list from DBS for the
    pileup datasets required by the steps in the job.

    Save these maps as files in the sandbox

    """
    def __init__(self):
        """
        Prepare module setup
        """
        super(PileupFetcher, self).__init__()
        if usingRucio():
            # Too much work to pass the rucio account name all the way to here
            # just use the production rucio account for resolving pileup location
            self.rucio = Rucio("wma_prod",
                               configDict={'phedexCompatible': False})
        else:
            self.phedex = PhEDEx()  # this will go away eventually

    def _queryDbsAndGetPileupConfig(self, stepHelper, dbsReader):
        """
        Method iterates over components of the pileup configuration input
        and queries DBS. Then iterates over results from DBS.

        There needs to be a list of files and their locations for each
        dataset name.
        Uses dbsReader.
        The result data structure is a Python dict of the following form,
        where FileList is a list of LFNs:

        {"pileupTypeA": {"BlockA": {"FileList": [], "PhEDExNodeNames": []},
                         "BlockB": {"FileList": [], "PhEDExNodeNames": []}, ...}}

        This structure preserves knowledge of where particular files of a dataset
        are physically located (as a list of PNNs). DBS only lists sites which
        have all files belonging to blocks, but e.g. BlockA of dataset DS1 may
        be located at site1 and BlockB only at site2 - it's possible that only
        a subset of the blocks in a dataset will be at a site.

        """
        resultDict = {}
        # iterate over input pileup types (e.g. "cosmics", "minbias")
        for pileupType in stepHelper.data.pileup.listSections_():
            # the format here is: step.data.pileup.cosmics.dataset = [/some/data/set]
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType),
                               "dataset")
            # each dataset input can generally be a list, iterate over dataset names
            blockDict = {}
            for dataset in datasets:

                blockFileInfo = dbsReader.getFileListByDataset(dataset=dataset,
                                                               detail=True)

                for fileInfo in blockFileInfo:
                    blockDict.setdefault(fileInfo['block_name'], {
                        'FileList': [],
                        'NumberOfEvents': 0,
                        'PhEDExNodeNames': []
                    })
                    blockDict[fileInfo['block_name']]['FileList'].append(
                        {'logical_file_name': fileInfo['logical_file_name']})
                    blockDict[fileInfo['block_name']][
                        'NumberOfEvents'] += fileInfo['event_count']

                self._getDatasetLocation(dataset, blockDict)

            resultDict[pileupType] = blockDict
        return resultDict

    def _getDatasetLocation(self, dset, blockDict):
        """
        Given a dataset name, query PhEDEx or Rucio and resolve the block location
        :param dset: string with the dataset name
        :param blockDict: dictionary with DBS summary info
        :return: update blockDict in place
        """
        node_filter = set(['UNKNOWN', None])

        if hasattr(self, "rucio"):
            # then it's Rucio!!
            blockReplicasInfo = self.rucio.getReplicaInfoForBlocks(
                dataset=dset)
            for item in blockReplicasInfo:
                block = item['name']
                try:
                    blockDict[block]['PhEDExNodeNames'] = item['replica']
                    blockDict[block]['FileList'] = sorted(
                        blockDict[block]['FileList'])
                except KeyError:
                    logging.warning(
                        "Block '%s' does not have any complete Rucio replica",
                        block)
        else:
            blockReplicasInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                dataset=dset, complete='y')
            for block in blockReplicasInfo:
                nodes = set(blockReplicasInfo[block]) - node_filter
                try:
                    blockDict[block]['PhEDExNodeNames'] = list(nodes)
                    blockDict[block]['FileList'] = sorted(
                        blockDict[block]['FileList'])
                except KeyError:
                    logging.warning(
                        "Block '%s' does not have any complete PhEDEx replica",
                        block)

    def _getCacheFilePath(self, stepHelper):

        fileName = ""
        for pileupType in stepHelper.data.pileup.listSections_():
            datasets = getattr(getattr(stepHelper.data.pileup, pileupType),
                               "dataset")
            fileName += ("_").join(datasets)
        # TODO: the cache is not very effective if the dataset combination differs between workflows
        # there is also a possibility of a hash value collision
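        # NOTE: Python 3 salts the built-in hash() for strings per process
        # (PYTHONHASHSEED), so this cache file name is only stable within a
        # single process unless the hash seed is fixed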
        cacheFile = "%s/pileupconf-%s.json" % (self.cacheDirectory(),
                                               hash(fileName))
        return cacheFile

    def _getStepFilePath(self, stepHelper):
        stepPath = "%s/%s" % (self.workingDirectory(), stepHelper.name())
        fileName = "%s/%s" % (stepPath, "pileupconf.json")

        return fileName

    def _writeFile(self, filePath, jsonPU):

        directory = filePath.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        try:
            with open(filePath, 'w') as f:
                f.write(jsonPU)
        except IOError:
            m = "Could not save pileup JSON configuration file: '%s'" % filePath
            raise RuntimeError(m)

    def _copyFile(self, src, dest):

        directory = dest.rsplit('/', 1)[0]

        if not os.path.exists(directory):
            os.mkdir(directory)
        shutil.copyfile(src, dest)

    def _isCacheExpired(self, cacheFilePath, delta=24):
        """Is the cache expired? The cache is considered expired once it is
        older than `delta` hours (default 24).
        """
        # cache can either be a file name or an already opened file object

        if not os.path.exists(cacheFilePath):
            return True

        delta = datetime.timedelta(hours=delta)
        t = datetime.datetime.now() - delta
        # the cache counts as expired once its mtime is older than `delta` hours
        if os.path.getmtime(cacheFilePath) < time.mktime(t.timetuple()):
            return True

        return False

    def _isCacheValid(self, stepHelper):
        """
        Check whether the cache exists and is still valid.
        TODO: if the cacheDirectory is not inside the Sandbox, it should not be automatically deleted.
              We can add a cache refresh policy here.
        """
        cacheFile = self._getCacheFilePath(stepHelper)

        if not self._isCacheExpired(
                cacheFile, delta=0.5) and os.path.getsize(cacheFile) > 0:
            # if file already exist don't make a new dbs call and overwrite the file.
            # just return
            fileName = self._getStepFilePath(stepHelper)
            if not os.path.isfile(fileName) or os.path.getsize(
                    fileName) != os.path.getsize(cacheFile):
                self._copyFile(cacheFile, fileName)
            return True
        else:
            return False

    def _saveFile(self, stepHelper, jsonPU):

        cacheFile = self._getCacheFilePath(stepHelper)
        self._writeFile(cacheFile, jsonPU)
        fileName = self._getStepFilePath(stepHelper)
        self._copyFile(cacheFile, fileName)

    def createPileupConfigFile(self, helper):
        """
        Stores pileup JSON configuration file in the working
        directory / sandbox.

        """
        if self._isCacheValid(helper):
            # if file already exist don't make a new dbs call and overwrite the file.
            # just return
            return

        encoder = JSONEncoder()
        # this should have been set in CMSSWStepHelper along with
        # the pileup configuration
        url = helper.data.dbsUrl
        dbsReader = DBSReader(url)

        configDict = self._queryDbsAndGetPileupConfig(helper, dbsReader)

        # create JSON and save into a file
        jsonPU = encoder.encode(configDict)
        self._saveFile(helper, jsonPU)

    def __call__(self, wmTask):
        """
        Method is called when WorkQueue creates the sandbox for a job.
        Need to look at the pileup configuration in the spec and query dbs to
        determine the lfns for the files in the datasets and what sites they're
        located at (WQ creates the job sandbox).

        wmTask is instance of WMTask.WMTaskHelper

        """
        for step in wmTask.steps().nodeIterator():
            helper = WMStep.WMStepHelper(step)
            # returns e.g. instance of CMSSWHelper
            # doesn't seem to be necessary ... strangely (some inheritance involved?)
            # typeHelper = helper.getTypeHelper()
            if hasattr(helper.data, "pileup"):
                self.createPileupConfigFile(helper)
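
For reference, a hedged sketch of consuming the pileupconf.json written by this fetcher, assuming only the dictionary layout documented in _queryDbsAndGetPileupConfig (pileup type -> block -> FileList / NumberOfEvents / PhEDExNodeNames); the helper name and the printed summary are illustrative:

import json


def summarizePileupConfig(path):
    """Print per-block file counts, event counts and locations."""
    with open(path) as fd:
        config = json.load(fd)
    for pileupType, blocks in config.items():
        for blockName, info in blocks.items():
            print("%s | %s: %d files, %d events, located at %s" %
                  (pileupType, blockName, len(info['FileList']),
                   info['NumberOfEvents'], ", ".join(info['PhEDExNodeNames'])))

# e.g. summarizePileupConfig("pileupconf.json")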
Example #21
    def executeInternal(self, *args, **kwargs):

        self.logger.info(
            "Data discovery with DBS")  ## to be changed into debug

        dbsurl = self.config.Services.DBSUrl
        if kwargs['task']['tm_dbs_url']:
            dbsurl = kwargs['task']['tm_dbs_url']
        self.dbs = DBSReader(dbsurl)
        self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]
        isUserDataset = self.dbsInstance.split('/')[1] != 'global'
        # where to look locations in pre-Rucio world
        PhEDExOrDBS = 'PhEDEx' if not isUserDataset else 'DBS origin site'

        taskName = kwargs['task']['tm_taskname']
        userProxy = kwargs['task']['user_proxy']
        self.logger.debug("Data discovery through %s for %s", self.dbs,
                          taskName)

        inputDataset = kwargs['task']['tm_input_dataset']
        secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset',
                                              None)

        self.checkDatasetStatus(inputDataset, kwargs)
        if secondaryDataset:
            self.checkDatasetStatus(secondaryDataset, kwargs)

        try:
            # Get the list of blocks for the locations.
            # The WMCore DBS3 implementation makes one call to DBS for each block
            # when using locations=True so we are using locations=False and looking up location later
            blocks = [
                x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset,
                                                              locations=False)
            ]
            if secondaryDataset:
                secondaryBlocks = [
                    x['Name']
                    for x in self.dbs.getFileBlocksInfo(secondaryDataset,
                                                        locations=False)
                ]
        except DBSReaderError as dbsexc:
            # dataset not found in DBS is a known use case
            if 'No matching data' in str(dbsexc):
                raise TaskWorkerException(
                    "CRAB could not find dataset %s in this DBS instance: %s" %
                    (inputDataset, dbsurl))
            raise
        ## Create a map for block's locations: for each block get the list of locations.
        ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
        ## locations are found it gets the original locations from DBS. So it should
        ## never be the case at this point that some blocks have no locations.
        ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
        ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
        ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}

        # For now apply Rucio data location only to NANOAOD*
        # in time useRucioForLocations may become a more rich expression
        isNano = blocks[0].split("#")[0].split("/")[-1] in [
            "NANOAOD", "NANOAODSIM"
        ]
        if isNano:
            self.logger.info(
                "NANOAOD* dataset. Will use Rucio for data location")
        useRucioForLocations = isNano
        locationsFoundWithRucio = False

        if not useRucioForLocations:
            self.logger.info("Will not use Rucio for this dataset")
        # if locations should be in Rucio, try it first and fall back to old ways if Rucio calls fail
        # or if they return no locations (possible Rucio teething pain). If Rucio returns a list, trust it.
        if useRucioForLocations:
            locationsMap = {}
            scope = "cms"
            # If the dataset is a USER one, use the Rucio user scope to find it
            # TODO: we need a way to enable users to indicate others user scopes as source
            if isUserDataset:
                scope = "user.%s" % kwargs['task']['tm_username']
            rucio_config_dict = {
                "phedexCompatible": True,
                "auth_type": "x509",
                "ca_cert": self.config.Services.Rucio_caPath,
                "logger": self.logger,
                "creds": {
                    "client_cert": self.config.TaskWorker.cmscert,
                    "client_key": self.config.TaskWorker.cmskey
                }
            }
            try:
                self.logger.info("Initializing Rucio client")
                # WMCore is awfully verbose
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    rucioClient = Rucio(
                        self.config.Services.Rucio_account,
                        hostUrl=self.config.Services.Rucio_host,
                        authUrl=self.config.Services.Rucio_authUrl,
                        configDict=rucio_config_dict)
                rucioClient.whoAmI()
                self.logger.info(
                    "Looking up data location with Rucio in %s scope.", scope)
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    locations = rucioClient.getReplicaInfoForBlocks(
                        scope=scope, block=list(blocks))
            except Exception as exc:
                msg = "Rucio lookup failed with\n%s" % str(exc)
                # TODO when removing fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                locations = None

            # TODO when removing fall-back to PhEDEx, above code will raise if it fails, therefore
            # the following "if" must be removed and the code shifted left
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    # only fill the map for blocks which have at least one location
                    if element['replica']:
                        locationsMap.update({
                            element['name']:
                            [x['node'] for x in element['replica']]
                        })
                if locationsMap:
                    locationsFoundWithRucio = True
                else:
                    msg = "No locations found with Rucio for this dataset"
                    # since NANO* are not in PhEDEx, this should be a fatal error
                    if isNano:
                        raise TaskWorkerException(msg)
                    else:
                        # note it down and try with PhEDEx
                        self.logger.warn(msg)

        if not locationsFoundWithRucio:  # fall back to pre-Rucio methods
            try:
                self.logger.info("Looking up data locations using %s",
                                 PhEDExOrDBS)
                locationsMap = self.dbs.listFileBlockLocation(
                    list(blocks), dbsOnly=isUserDataset)
            except Exception as ex:
                raise TaskWorkerException(
                    "The CRAB3 server backend could not get the location of the files from dbs nor phedex nor rucio.\n"+\
                    "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)"+\
                    " and contact the experts if the error persists.\nError reason: %s" % str(ex)
                    )
            # only fill map for blocks which have at least one location
            locationsMap = {
                key: value
                for key, value in locationsMap.items() if value
            }

        if secondaryDataset:
            secondaryLocationsMap = {}
            # see https://github.com/dmwm/CRABServer/issues/6075#issuecomment-641569446
            self.logger.info(
                "Trying data location of secondary blocks with Rucio")
            try:
                locations = rucioClient.getReplicaInfoForBlocks(
                    scope=scope, block=list(secondaryBlocks))
            except Exception as exc:
                locations = None
                secondaryLocationsMap = {}
                self.logger.warn("Rucio lookup failed with: %s", exc)
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    # only fill the map for blocks which have at least one location
                    if element['replica']:
                        secondaryLocationsMap.update({
                            element['name']:
                            [x['node'] for x in element['replica']]
                        })
            if not secondaryLocationsMap:
                msg = "No locations found with Rucio for secondaryDataset."
                # TODO when removing fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                self.logger.info(
                    "Trying data location of secondary blocks with PhEDEx")
                try:
                    secondaryLocationsMap = self.dbs.listFileBlockLocation(
                        list(secondaryBlocks), dbsOnly=isUserDataset)
                except Exception as ex:
                    raise TaskWorkerException(
                        "The CRAB3 server backend could not get the location of the secondary dataset files from dbs or phedex or rucio.\n" + \
                        "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" + \
                        " and contact the experts if the error persists.\nError reason: %s" % str(ex)
                    )
                # only fill map for blocks which have at least one location
                secondaryLocationsMap = {
                    key: value
                    for key, value in secondaryLocationsMap.items()
                    if value
                }

        # From now on code is not dependent from having used Rucio or PhEDEx

        blocksWithLocation = locationsMap.keys()
        if secondaryDataset:
            secondaryBlocksWithLocation = secondaryLocationsMap.keys()

        self.keepOnlyDisks(locationsMap)
        if not locationsMap:
            msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
            if self.tapeLocations:
                msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(
                    sorted(self.tapeLocations))
                # submit request to DDM
                ddmRequest = None
                ddmServer = self.config.TaskWorker.DDMServer
                try:
                    ddmRequest = blocksRequest(blocksWithLocation,
                                               ddmServer,
                                               self.config.TaskWorker.cmscert,
                                               self.config.TaskWorker.cmskey,
                                               verbose=False)
                except HTTPException as hte:
                    self.logger.exception(hte)
                    msg += "\nThe automatic stage-out failed, please try again later. If the error persists contact the experts and provide this error message:"
                    msg += "\nHTTP Error while contacting the DDM server %s:\n%s" % (
                        ddmServer, str(hte))
                    msg += "\nHTTP Headers are: %s" % hte.headers
                    msg += "\nYou might want to contact your physics group if you need a disk replica."
                    raise TaskWorkerException(msg, retry=True)

                self.logger.info("Contacted %s using %s and %s, got:\n%s",
                                 self.config.TaskWorker.DDMServer,
                                 self.config.TaskWorker.cmscert,
                                 self.config.TaskWorker.cmskey, ddmRequest)
                # The query above returns a JSON with a format {"result": "OK", "message": "Copy requested", "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37", "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
                if ddmRequest["result"] == "OK":
                    # set status to TAPERECALL
                    tapeRecallStatus = 'TAPERECALL'
                    ddmReqId = ddmRequest["data"][0]["request_id"]
                    configreq = {
                        'workflow': taskName,
                        'taskstatus': tapeRecallStatus,
                        'ddmreqid': ddmReqId,
                        'subresource': 'addddmreqid',
                    }
                    try:
                        tapeRecallStatusSet = self.server.post(
                            self.restURInoAPI + '/task',
                            data=urllib.urlencode(configreq))
                    except HTTPException as hte:
                        self.logger.exception(hte)
                        msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (
                            self.config.TaskWorker.restHost, str(hte))
                        msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (
                            tapeRecallStatus, ddmReqId, taskName)
                        msg += "\nHTTP Headers are: %s" % hte.headers
                        raise TaskWorkerException(msg, retry=True)

                    msg += "\nA disk replica has been requested on %s to CMS DDM (request ID: %d)" % (
                        ddmRequest["data"][0]["first_request"], ddmReqId)
                    if tapeRecallStatusSet[2] == "OK":
                        self.logger.info("Status for task %s set to '%s'",
                                         taskName, tapeRecallStatus)
                        msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
                        self.uploadWarning(msg, userProxy, taskName)

                        raise TapeDatasetException(msg)
                    else:
                        msg += ", please try again in two days."

                else:
                    msg += "\nThe disk replica request failed with this error:\n %s" % ddmRequest[
                        "message"]

            msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
            raise TaskWorkerException(msg)

        # will not need lumi info if user has asked for split by file with no run/lumi mask
        splitAlgo = kwargs['task']['tm_split_algo']
        lumiMask = kwargs['task']['tm_split_args']['lumis']
        runRange = kwargs['task']['tm_split_args']['runs']

        needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
        # secondary dataset access relies on run/lumi info
        if secondaryDataset:
            needLumiInfo = True
        if needLumiInfo:
            # interested only in blocks with locations; 'blocks' may contain invalid ones and trigger an Exception
            self.checkBlocksSize(blocksWithLocation)
            if secondaryDataset:
                self.checkBlocksSize(secondaryBlocksWithLocation)
        try:
            filedetails = self.dbs.listDatasetFileDetails(
                inputDataset,
                getParents=True,
                getLumis=needLumiInfo,
                validFileOnly=0)
            if secondaryDataset:
                moredetails = self.dbs.listDatasetFileDetails(
                    secondaryDataset,
                    getParents=False,
                    getLumis=needLumiInfo,
                    validFileOnly=0)

                for secfilename, secinfos in moredetails.items():
                    secinfos['lumiobj'] = LumiList(
                        runsAndLumis=secinfos['Lumis'])

                self.logger.info(
                    "Beginning to match files from secondary dataset")
                for dummyFilename, infos in filedetails.items():
                    infos['Parents'] = []
                    lumis = LumiList(runsAndLumis=infos['Lumis'])
                    for secfilename, secinfos in moredetails.items():
                        if lumis & secinfos['lumiobj']:
                            infos['Parents'].append(secfilename)
                self.logger.info("Done matching files from secondary dataset")
                kwargs['task']['tm_use_parent'] = 1
        except Exception as ex:  #TODO should we catch HttpException instead?
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\
                                "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\
                                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO: also add the PhEDEx nodes so the user can check themselves
        if not filedetails:
            raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +\
                                "Aborting submission. Resubmitting your task will not help.") %\
                                ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %\
                                (self.dbsInstance, inputDataset))

        ## Format the output creating the data structures required by WMCore. Filters out invalid files,
        ## files whose block has no location, and figures out the PSN
        result = self.formatOutput(task=kwargs['task'],
                                   requestname=taskName,
                                   datasetfiles=filedetails,
                                   locations=locationsMap,
                                   tempDir=kwargs['tempDir'])

        if not result.result:
            raise TaskWorkerException((
                "Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n"
                + "Aborting submission. Resubmitting your task will not help."
            ) % (
                "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s"
            ) % (self.dbsInstance, inputDataset))

        self.logger.debug("Got %s files", len(result.result.getFiles()))

        return result
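
The Rucio branch above performs the same small transformation twice: turning the phedexCompatible reply of getReplicaInfoForBlocks into a block -> locations map, keeping only blocks with at least one replica. A factored-out sketch of that step, assuming the reply format visible in the code above ({'phedex': {'block': [{'name': ..., 'replica': [{'node': ...}, ...]}, ...]}}):

def buildLocationsMap(rucioReply):
    """Map block name -> list of nodes, skipping blocks without any replica."""
    locationsMap = {}
    for element in rucioReply['phedex']['block']:
        # only fill the map for blocks which have at least one location
        if element['replica']:
            locationsMap[element['name']] = [x['node'] for x in element['replica']]
    return locationsMap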
Example #22
class StartPolicyInterface(PolicyInterface):
    """Interface for start policies"""
    def __init__(self, **args):
        # We need to pop this object instance from args because otherwise
        # the super class blows up when doing a deepcopy(args)
        self.rucio = args.pop("rucioObject", None)
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.badWork = []  # list of bad work units (e.g. without any valid files)
        self.pileupData = {}
        self.cric = CRIC()
        # FIXME: for the moment, it will always use the default value
        self.rucioAcct = self.args.get("rucioAcct", "wmcore_transferor")
        if not self.rucio:
            self.rucio = Rucio(self.rucioAcct,
                               configDict={'logger': self.logger})

    def split(self):
        """Apply policy to spec"""
        raise NotImplementedError

    def validate(self):
        """Check params and spec are appropriate for the policy"""
        raise NotImplementedError

    def validateCommon(self):
        """Common validation stuff"""
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Workflow name validation error: %s" % str(ex))
            raise error

        if self.initialTask.siteWhitelist():
            if isinstance(self.initialTask.siteWhitelist(), (newstr, bytes)):
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site whitelist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteWhitelist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteWhitelist()
                ]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site whitelist validation error: %s" % str(ex))
                raise error
        else:
            error = WorkQueueWMSpecError(
                self.wmspec,
                "Site whitelist validation error: Empty site whitelist")
            raise error

        if self.initialTask.siteBlacklist():
            if isinstance(self.initialTask.siteBlacklist(), (newstr, bytes)):
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site blacklist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteBlacklist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteBlacklist()
                ]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site blacklist validation error: %s" % str(ex))
                raise error

        # splitter settings
        if self.args.get('SliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(
                self.wmspec, 'Zero or negative SliceSize parameter')
            raise error
        if self.args.get('SubSliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(
                self.wmspec, 'Zero or negative SubSliceSize parameter')
            raise error

        # check input dataset is valid
        try:
            if self.initialTask.getInputDatasetPath():
                Lexicon.dataset(self.initialTask.getInputDatasetPath())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Dataset validation error: %s" % str(ex))
            raise error

        # if pileup is found, check that they are valid datasets
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            for dbsUrl in pileupDatasets:
                for dataset in pileupDatasets[dbsUrl]:
                    Lexicon.dataset(dataset)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Pileup dataset validation error: %s" % str(ex))
            raise error

    def newQueueElement(self, **args):
        # DBS Url may not be available in the initial task
        # but in the pileup data (MC pileup)
        dbsUrl = self.initialTask.dbsUrl()
        if dbsUrl is None and self.pileupData:
            # Get the first DBS found
            dbsUrl = next(iter(self.wmspec.listPileupDatasets()))

        args.setdefault('Status', 'Available')
        args.setdefault('WMSpec', self.wmspec)
        args.setdefault('Task', self.initialTask)
        args.setdefault('RequestName', self.wmspec.name())
        args.setdefault('TaskName', self.initialTask.name())
        args.setdefault('Dbs', dbsUrl)
        args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
        args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
        args.setdefault('StartPolicy', self.wmspec.startPolicy())
        args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
        args.setdefault('Priority', self.wmspec.priority())
        args.setdefault('PileupData', self.pileupData)
        if not args['Priority']:
            args['Priority'] = 0
        ele = WorkQueueElement(**args)
        for data, sites in viewitems(ele['Inputs']):
            if not sites:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'Input data has no locations "%s"' % data)

        # catch infinite splitting loops
        if len(self.workQueueElements) > self.args.get('maxRequestSize', 1e8):
            raise WorkQueueWMSpecError(
                self.wmspec, 'Too many elements (%d)' %
                self.args.get('maxRequestSize', 1e8))
        self.workQueueElements.append(ele)

    def __call__(self,
                 wmspec,
                 task,
                 data=None,
                 mask=None,
                 team=None,
                 continuous=False,
                 rucioObj=None):
        self.wmspec = wmspec
        # bring in spec specific settings
        self.args.update(self.wmspec.startPolicyParameters())
        self.initialTask = task
        if data:
            self.data = data
        self.mask = mask
        self.validate()
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            if pileupDatasets:
                self.pileupData = self.getDatasetLocations(pileupDatasets)
            self.split()
        # For known exceptions raise custom error that will fail the workflow.
        except dbsClientException as ex:
            # A dbs configuration error implies the spec is invalid
            error = WorkQueueWMSpecError(self.wmspec,
                                         "DBS config error: %s" % str(ex))
            raise error
        except AssertionError as ex:
            # Assertion generally means validation of an input field failed
            error = WorkQueueWMSpecError(self.wmspec,
                                         "Assertion error: %s" % str(ex))
            raise error
        except DBSReaderError as ex:
            # Hacky way of identifying non-existent data, DbsBadRequest chomped by DBSReader
            if 'Invalid parameters' in str(ex):
                data = task.data.input.pythonise_() if task.data.input else 'None'
                msg = """data: %s, mask: %s, pileup: %s. %s""" % (
                    str(data), str(mask), str(pileupDatasets), str(ex))
                error = WorkQueueNoWorkError(self.wmspec, msg)
                raise error
            raise  # propagate other dbs errors

        # if we have no new elements and we are not adding work to request
        # already running, then raise exception
        if not self.workQueueElements and not continuous:
            data = task.data.input.pythonise_() if task.data.input else 'None'
            msg = "Failed to add work. Input data: %s, mask: %s." % (str(data),
                                                                     str(mask))
            error = WorkQueueNoWorkError(self.wmspec, msg)
            raise error

        return self.workQueueElements, self.rejectedWork, self.badWork

    def dbs(self, dbs_url=None):
        """Get DBSReader"""
        from WMCore.WorkQueue.WorkQueueUtils import get_dbs
        if dbs_url is None:
            dbs_url = self.initialTask.dbsUrl()
        return get_dbs(dbs_url)

    @staticmethod
    def supportsWorkAddition():
        """Indicates if a given policy supports addition of new work"""
        return False

    def getMaskedBlocks(self, task, dbs, datasetPath):
        """
        Get the blocks which pass the lumi mask restrictions. For each block
        return the list of lumis which were ok (given the lumi mask). The data
        structure returned is the following:
        {
            "block1" : {"file1" : LumiList(), "file5" : LumiList(), ...},
            "block2" : {"file2" : LumiList(), "file7" : LumiList(), ...}
        }
        """
        # Get the task mask as a LumiList object to make operations easier
        maskedBlocks = {}
        taskMask = task.getLumiMask()

        # for performance reasons, we first get all the blocknames
        blocks = [
            x['block_name'] for x in dbs.dbs.listBlocks(dataset=datasetPath)
        ]

        for block in blocks:
            fileLumis = dbs.dbs.listFileLumis(block_name=block,
                                              validFileOnly=1)
            for fileLumi in fileLumis:
                lfn = fileLumi['logical_file_name']
                runNumber = str(fileLumi['run_num'])
                lumis = fileLumi['lumi_section_num']
                fileMask = LumiList(runsAndLumis={runNumber: lumis})
                commonMask = taskMask & fileMask
                if commonMask:
                    maskedBlocks.setdefault(block, {})
                    maskedBlocks[block].setdefault(lfn, LumiList())
                    maskedBlocks[block][lfn] += commonMask

        return maskedBlocks

    def modifyPolicyForWorkAddition(self, inboxElement):
        """Set modifiers on the policy based on the inboxElement information, so that
        a splitting pass with this policy returns strictly new work. The inbox
        element must carry information about the already existing work."""
        raise NotImplementedError(
            "This can't be called on a base StartPolicyInterface object")

    def newDataAvailable(self, task, inbound):
        """
            Returns True if there is data that could, in the future, be included as an
            element for the inbound parent. However, it doesn't guarantee that the new data
            will be included if the inbound element is split (i.e. the new data could be open blocks for the Block policy).
        """
        raise NotImplementedError(
            "This can't be called on a base StartPolicyInterface object")

    def getDatasetLocations(self, datasets):
        """
        Returns a dictionary with the location of the datasets according to Rucio
        The definition of "location" here is a union of all sites holding at least
        part of the dataset (defined by the DATASET grouping).
        :param datasets: dictionary with a list of dataset names (keyed by the DBS URL)
        :return: a dictionary of dataset locations, keyed by the dataset name
        """
        result = {}
        for dbsUrl in datasets:
            for datasetPath in datasets[dbsUrl]:
                locations = self.rucio.getDataLockedAndAvailable(
                    name=datasetPath, account=self.rucioAcct)
                result[datasetPath] = self.cric.PNNstoPSNs(locations)
        return result

    def blockLocationRucioPhedex(self, blockName):
        """
        Wrapper around Rucio and PhEDEx systems.
        Fetch the current location of the block name (if Rucio,
        also consider the locks made on that block)
        :param blockName: string with the block name
        :return: a list of RSEs
        """
        location = self.rucio.getDataLockedAndAvailable(name=blockName,
                                                        account=self.rucioAcct)
        return location
Exemple #23
0
class RucioInjectorPoller(BaseWorkerThread):
    """
    _RucioInjectorPoller_

    Poll the DBSBuffer database and inject files as they are created.

    The logic of this component is:
      * create a rucio container (or reuse a pre-existent one)
      * create a CMS block (or reuse a pre-existent one); the block gets automatically attached to its container
      * create file replicas, which get automatically attached to their block as well
      * now create a CMS block rule to protect this data
      * if the block has been inserted into DBS, close the block in Rucio

    In addition to that, it has logic for rucio container subscription (rule creation),
    and block rule removal. Those follow a different polling cycle though.
    """
    def __init__(self, config):
        """
        ___init___

        Initialise class members
        """
        BaseWorkerThread.__init__(self)

        # dataset rule creation has a larger polling cycle
        self.pollRules = config.RucioInjector.pollIntervalRules
        self.lastRulesExecTime = 0
        self.createBlockRules = config.RucioInjector.createBlockRules
        self.containerDiskRuleParams = config.RucioInjector.containerDiskRuleParams
        self.containerDiskRuleRSEExpr = config.RucioInjector.containerDiskRuleRSEExpr
        if config.RucioInjector.metaDIDProject not in RUCIO_VALID_PROJECT:
            msg = "Component configured with an invalid 'project' DID: %s"
            raise RucioInjectorException(msg %
                                         config.RucioInjector.metaDIDProject)
        self.metaDIDProject = dict(project=config.RucioInjector.metaDIDProject)

        # setup caches for containers and blocks (the container cache lives 3x longer, i.e. 6 days)
        self.containersCache = MemoryCache(
            config.RucioInjector.cacheExpiration * 3, set())
        self.blocksCache = MemoryCache(config.RucioInjector.cacheExpiration,
                                       set())
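
        # Hedged note on the MemoryCache semantics relied upon here and in
        # algorithm(): the first argument is assumed to be the expiration time
        # in seconds, the second the initial payload, e.g.:
        #
        #   cache = MemoryCache(2 * 24 * 60 * 60, set())   # 2-day cache of a set
        #   if cache.isCacheExpired():
        #       cache.setCache(freshData)                  # reset content and timer
        #   else:
        #       cache.addItemToCache(freshData)            # merge into the current content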

        self.scope = getattr(config.RucioInjector, "scope", "cms")
        self.rucioAcct = config.RucioInjector.rucioAccount
        self.rucio = Rucio(acct=self.rucioAcct,
                           hostUrl=config.RucioInjector.rucioUrl,
                           authUrl=config.RucioInjector.rucioAuthUrl,
                           configDict={'logger': self.logger})

        # metadata dictionary information to be added to block/container rules
        # cannot be a python dictionary, but a JSON string instead
        self.metaData = json.dumps(
            dict(agentHost=config.Agent.hostName,
                 userAgent=config.Agent.agentName))

        self.testRSEs = config.RucioInjector.RSEPostfix
        self.filesToRecover = []

        # output data placement has a different behaviour between T0 and Production agents
        if hasattr(config, "Tier0Feeder"):
            logging.info("RucioInjector running on a T0 WMAgent")
            self.isT0agent = True
        else:
            self.isT0agent = False

        logging.info("Component configured to create block rules: %s",
                     self.createBlockRules)

    def setup(self, parameters):
        """
        _setup_

        Create DAO Factory and setup some DAO.
        """
        myThread = threading.currentThread()
        daofactory = DAOFactory(package="WMComponent.RucioInjector.Database",
                                logger=self.logger,
                                dbinterface=myThread.dbi)

        self.getUninjected = daofactory(classname="GetUninjectedFiles")
        self.getMigrated = daofactory(classname="GetMigratedBlocks")

        self.getUnsubscribedBlocks = daofactory(
            classname="GetUnsubscribedBlocks")
        self.setBlockRules = daofactory(classname="SetBlocksRule")

        self.findDeletableBlocks = daofactory(classname="GetDeletableBlocks")
        self.markBlocksDeleted = daofactory(classname="MarkBlocksDeleted")
        self.getUnsubscribedDsets = daofactory(
            classname="GetUnsubscribedDatasets")
        self.markSubscribed = daofactory(classname="MarkDatasetSubscribed")

        daofactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=self.logger,
                                dbinterface=myThread.dbi)
        self.setStatus = daofactory(classname="DBSBufferFiles.SetPhEDExStatus")
        self.setBlockClosed = daofactory(classname="SetBlockClosed")

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Poll the database for uninjected files and inject them into Rucio.
        """
        logging.info("Running Rucio injector poller algorithm...")

        try:
            # files that failed to get their status updated in dbsbuffer
            self._updateLFNState(self.filesToRecover, recovery=True)

            # get dbsbuffer_file.in_phedex = 0
            uninjectedFiles = self.getUninjected.execute()

            # create containers in rucio  (and update local cache)
            containersAdded = self.insertContainers(uninjectedFiles)
            if self.containersCache.isCacheExpired():
                self.containersCache.setCache(containersAdded)
            else:
                self.containersCache.addItemToCache(containersAdded)

            # create blocks. Only update the cache once a rule gets created...
            blocksAdded = self.insertBlocks(uninjectedFiles)
            if self.blocksCache.isCacheExpired():
                self.blocksCache.setCache(blocksAdded)
            else:
                self.blocksCache.addItemToCache(blocksAdded)

            # create file replicas
            self.insertReplicas(uninjectedFiles)

            # now close blocks already uploaded to DBS
            self.closeBlocks()

            if self.lastRulesExecTime + self.pollRules <= int(time.time()):
                # reset the timer, otherwise rules would be evaluated on every cycle
                self.lastRulesExecTime = int(time.time())
                self.insertContainerRules()
                self.insertBlockRules()
                self.deleteBlocks()
        except Exception as ex:
            msg = "Caught unexpected exception in RucioInjector. Details:\n%s" % str(
                ex)
            logging.exception(msg)
            raise RucioInjectorException(msg)

        return
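
    # Hedged sketch of the rule-polling throttle applied in algorithm() above:
    # container, block and replica injection run on every component cycle,
    # while rule management runs only once per pollIntervalRules seconds.
    #
    #   now = int(time.time())
    #   if self.lastRulesExecTime + self.pollRules <= now:
    #       self.lastRulesExecTime = now
    #       # ... insertContainerRules / insertBlockRules / deleteBlocks ...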

    def insertContainers(self, uninjectedData):
        """
        This method will insert containers into Rucio, provided they cannot be found in
        the local cache.
        :param uninjectedData: same structure as returned by the GetUninjectedFiles DAO
        :return: set of containers successfully inserted into Rucio
        """
        logging.info("Preparing to insert containers into Rucio...")
        newContainers = set()
        for location in uninjectedData:
            for container in uninjectedData[location]:
                # same container can be at multiple locations
                if container not in self.containersCache and container not in newContainers:
                    if self.rucio.createContainer(container,
                                                  meta=self.metaDIDProject):
                        logging.info("Container %s inserted into Rucio",
                                     container)
                        newContainers.add(container)
                    else:
                        logging.error("Failed to create container: %s",
                                      container)
        logging.info("Successfully inserted %d containers into Rucio",
                     newContainers)
        return newContainers

    def insertBlocks(self, uninjectedData):
        """
        This method will insert blocks into Rucio and attach them to their
        corresponding containers. When attaching a block, we also need to
        provide the RSE where it will be available.
        :param uninjectedData: same structure as returned by the GetUninjectedFiles DAO
        :return: a set of successfully inserted block names
        """
        logging.info("Preparing to insert blocks into Rucio...")
        newBlocks = set()
        for location in uninjectedData:
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    if block not in self.blocksCache:
                        if self.rucio.createBlock(block,
                                                  rse=rseName,
                                                  meta=self.metaDIDProject):
                            logging.info("Block %s inserted into Rucio", block)
                            newBlocks.add(block)
                        else:
                            logging.error("Failed to create block: %s", block)
        logging.info("Successfully inserted %d blocks into Rucio", newBlocks)
        return newBlocks

    def insertBlockRules(self):
        """
        Creates a simple replication rule for every single block that
        is under production in a given site/RSE.
        It also persists the rule ID in the database.
        """
        if not self.createBlockRules:
            return

        logging.info("Preparing to create block rules into Rucio...")

        unsubBlocks = self.getUnsubscribedBlocks.execute()

        for item in unsubBlocks:
            # first, check if the block has already been created in Rucio
            if not self.rucio.didExist(item['blockname']):
                logging.warning("Block: %s not yet in Rucio. Retrying later..",
                                item['blockname'])
                continue
            kwargs = dict(activity="Production Output",
                          account=self.rucioAcct,
                          grouping="DATASET",
                          comment="WMAgent automatic container rule",
                          ignore_availability=True,
                          meta=self.metaData)
            rseName = "%s_Test" % item['pnn'] if self.testRSEs else item['pnn']
            # DATASET = replicates all files in the same block to the same RSE
            resp = self.rucio.createReplicationRule(item['blockname'],
                                                    rseExpression=rseName,
                                                    **kwargs)
            if resp:
                msg = "Block rule created for block: %s, at: %s, with rule id: %s"
                logging.info(msg, item['blockname'], item['pnn'], resp[0])
                binds = {'RULE_ID': resp[0], 'BLOCKNAME': item['blockname']}
                self.setBlockRules.execute(binds)
            else:
                logging.error("Failed to create rule for block: %s at %s",
                              item['blockname'], rseName)
        return
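
    # Hedged note on the Rucio rule grouping used by this component:
    # grouping="DATASET" keeps all files of a block (a Rucio dataset) at the
    # same RSE, which is what insertBlockRules above relies on, while
    # grouping="ALL" (used by insertContainerRules below) keeps every file
    # of the whole container at a single RSE.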

    def insertReplicas(self, uninjectedData):
        """
        Inserts replicas into Rucio and attaches them to their specific block.
        If the insertion succeeds, also switches their database state to injected.

        :param uninjectedData: nested dictionary keyed by location, container
            and block, with a list of file info dictionaries as leaves
        """
        # FIXME: I think we need a different data struct from the database
        # this method is very expensive O(n^4)
        logging.info("Preparing to insert replicas into Rucio...")

        for location in uninjectedData:
            rseName = "%s_Test" % location if self.testRSEs else location
            for container in uninjectedData[location]:
                for block in uninjectedData[location][container]:
                    if block not in self.blocksCache:
                        logging.warning(
                            "Skipping injection of %d files for block that failed to be added into Rucio: %s",
                            len(uninjectedData[location][container][block]
                                ['files']), block)
                        continue
                    injectData = []
                    listLfns = []
                    for fileInfo in uninjectedData[location][container][block][
                            'files']:
                        listLfns.append(fileInfo['lfn'])
                        injectData.append(
                            dict(name=fileInfo['lfn'],
                                 scope=self.scope,
                                 bytes=fileInfo['size'],
                                 state="A",
                                 adler32=fileInfo['checksum']['adler32']))

                    if self.rucio.createReplicas(rse=rseName,
                                                 files=injectData,
                                                 block=block):
                        logging.info(
                            "Successfully inserted %d files on block %s",
                            len(listLfns), block)
                        self._updateLFNState(listLfns)
        return
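
    # Hedged sketch of a single replica entry handed to createReplicas in
    # insertReplicas above (field values purely illustrative):
    #
    #   {"name": "/store/mc/.../file.root",   # the LFN
    #    "scope": "cms",
    #    "bytes": 123456789,
    #    "state": "A",                        # "A" == available
    #    "adler32": "1a2b3c4d"}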

    def _updateLFNState(self, listLfns, recovery=False):
        """
        Given a list of LFNs, update their state in dbsbuffer table.
        :param listLfns: list of LFNs
        :param recovery: True if we are recovering previously injected files
        :return: nothing
        """
        if not listLfns:
            return

        try:
            self.setStatus.execute(listLfns, 1)
        except Exception as ex:
            # save them to try injecting again in the next cycle
            self.filesToRecover.extend(listLfns)
            if 'Deadlock found' in str(ex) or 'deadlock detected' in str(ex):
                logging.error(
                    "Deadlock during file status update. Retrying again in the next cycle."
                )
            else:
                msg = "Failed to update file status in the database, reason: %s" % str(
                    ex)
                logging.error(msg)
                raise RucioInjectorException(msg)
        else:
            if recovery:
                self.filesToRecover = []

    def closeBlocks(self):
        """
        Close any blocks that have been migrated to global DBS
        """
        logging.info("Starting closeBlocks method")

        # in short, dbsbuffer_file.in_phedex = 1 AND dbsbuffer_block.status = 'InDBS'
        migratedBlocks = self.getMigrated.execute()
        ### FIXME the data format returned by this DAO
        for location in migratedBlocks:
            for container in migratedBlocks[location]:
                for block in migratedBlocks[location][container]:
                    logging.info("Closing block: %s", block)
                    if self.rucio.closeBlockContainer(block):
                        self.setBlockClosed.execute(block)
                    else:
                        logging.error(
                            "Failed to close block: %s. Will retry again later.",
                            block)

    def deleteBlocks(self):
        """
        _deleteBlocks_
        Find deletable blocks, then decide whether to delete based on:
        Has the transfer to all destinations finished?
          If yes => delete rules associated with the block, set deleted=1
          If no => do nothing (check again next cycle)
        """
        logging.info("Checking if there are block rules to be deleted...")

        # Get list of blocks that can be deleted
        blockDict = self.findDeletableBlocks.execute(transaction=False)

        if not blockDict:
            logging.info("No candidate blocks found for rule deletion")
            return

        logging.info("Found %d candidate blocks for rule deletion",
                     len(blockDict))

        blocksToDelete = []
        containerDict = {}
        # Populate containerDict, assigning each block to its corresponding container
        for blockName in blockDict:
            container = blockDict[blockName]['dataset']
            # If the container is not in the dictionary, create a new entry for it
            if container not in containerDict:
                # Set of sites to which the container needs to be transferred
                sites = set(
                    x.replace("_MSS", "_Tape")
                    for x in blockDict[blockName]['sites'])
                containerDict[container] = {'blocks': [], 'rse': sites}
            containerDict[container]['blocks'].append(blockName)

        for contName in containerDict:
            cont = containerDict[contName]

            # Check whether the container is not requested at any site.
            # This should never be triggered, but better safe than sorry
            if not cont['rse']:
                logging.warning(
                    "No rules for container: %s. Its blocks won't be deleted.",
                    contName)
                continue

            try:
                # Get RSE in which each block is available
                availableRSEs = self.rucio.getReplicaInfoForBlocks(
                    block=cont['blocks'])
            except Exception as exc:
                msg = "Failed to get replica info for blocks in container: %s.\n" % contName
                msg += "Will retry again in the next cycle. Error: %s" % str(
                    exc)
                logging.error(msg)
                continue

            for blockRSEs in availableRSEs:
                # If the block is available at every RSE to which its container
                # needs to be transferred, its rules can be deleted
                blockSites = set(blockRSEs['replica'])
                if cont['rse'].issubset(blockSites):
                    blocksToDelete.append(blockRSEs['name'])

        # Delete agent created rules locking the block
        binds = []
        logging.info("Going to delete %d block rules", len(blocksToDelete))
        for block in blocksToDelete:
            try:
                rules = self.rucio.listDataRules(block,
                                                 scope=self.scope,
                                                 account=self.rucioAcct)
            except WMRucioException as exc:
                logging.warning(
                    "Unable to retrieve replication rules for block: %s. Will retry in the next cycle. Error: %s",
                    block, str(exc))
            else:
                if not rules:
                    logging.info(
                        "Block rule for: %s has been deleted by previous cycles",
                        block)
                    binds.append({'DELETED': 1, 'BLOCKNAME': block})
                    continue
                deletedRules = 0
                for rule in rules:
                    if self.rucio.deleteRule(rule['id'], purgeReplicas=True):
                        logging.info(
                            "Successfully deleted rule: %s, for block %s.",
                            rule['id'], block)
                        deletedRules += 1
                    else:
                        logging.warning(
                            "Failed to delete rule: %s, for block %s. Will retry in the next cycle.",
                            rule['id'], block)
                if deletedRules == len(rules):
                    binds.append({'DELETED': 1, 'BLOCKNAME': block})
                    logging.info(
                        "Successfully deleted all rules for block %s.", block)

        self.markBlocksDeleted.execute(binds)
        logging.info("Marked %d blocks as deleted in the database", len(binds))
        return
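
    # Hedged sketch of the deletion criterion applied in deleteBlocks above:
    # a block rule is only removed once the block is replicated at every
    # requested destination (after the _MSS -> _Tape rename), e.g.:
    #
    #   requested = {"T1_XX_Site_Tape", "T2_YY_Site"}
    #   available = {"T1_XX_Site_Tape", "T2_YY_Site", "T2_ZZ_Site"}
    #   requested.issubset(available)   # True -> safe to delete the agent rules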

    def insertContainerRules(self):
        """
        Polls the database for containers meant to be subscribed and creates
        a container-level rule to replicate all the files to a given RSE.
        It deals with both Central Production and T0 data rules, which require
        a different approach, such as:
          * Production Tape/Custodial data placement is skipped and data is marked as transferred
          * Production Disk/NonCustodial has a generic RSE expression and some rule overrides
            from the agent configuration (like number of copies, grouping and weight)
          * T0 Tape is created as defined, with a special rule activity for Tape
          * T0 Disk is created as defined, with a special rule activity for Disk/Export
        """
        logging.info("Starting insertContainerRules method")

        ruleComment = "WMAgent automatic container rule"
        if self.isT0agent:
            ruleComment = "T0 " + ruleComment

        # FIXME also adapt the format returned by this DAO
        # Check for completely unsubscribed datasets that are already marked as in_phedex = 1
        unsubscribedDatasets = self.getUnsubscribedDsets.execute()

        # Keep a list of subscriptions to tick as subscribed in the database
        subscriptionsMade = []

        # Loop over the unsubscribed containers and create a container rule for each
        for subInfo in unsubscribedDatasets:
            rseName = subInfo['site'].replace("_MSS", "_Tape")
            container = subInfo['path']
            # Skip central production Tape rules
            if not self.isT0agent and rseName.endswith("_Tape"):
                logging.info(
                    "Bypassing Production container Tape data placement for container: %s and RSE: %s",
                    container, rseName)
                subscriptionsMade.append(subInfo['id'])
                continue
            # then check if the container has already been created in Rucio
            if not self.rucio.didExist(container):
                logging.warning(
                    "Container: %s not yet in Rucio. Retrying later..",
                    container)
                continue

            ruleKwargs = dict(ask_approval=False,
                              activity=self._activityMap(rseName),
                              account=self.rucioAcct,
                              grouping="ALL",
                              comment=ruleComment,
                              meta=self.metaData)
            if not rseName.endswith("_Tape"):
                # add extra parameters to the Disk rule as defined in the component configuration
                ruleKwargs.update(self.containerDiskRuleParams)

            if not self.isT0agent:
                # destination for production Disk rules is always overwritten
                rseName = self.containerDiskRuleRSEExpr
                if self.testRSEs:
                    rseName = rseName.replace("cms_type=real", "cms_type=test")
            else:
                # then it's a T0 container placement
                ruleKwargs['priority'] = 4
                if self.testRSEs:
                    rseName = "%s_Test" % rseName
                # check whether we need to ask for rule approval
                try:
                    if self.rucio.requiresApproval(rseName):
                        ruleKwargs['ask_approval'] = True
                except WMRucioException as exc:
                    msg = str(exc)
                    msg += "\nUnable to check approval requirements. Will retry again in the next cycle."
                    logging.error(msg)
                    continue

            logging.info("Creating container rule for %s against RSE %s",
                         container, rseName)
            logging.debug(
                "Container rule will be created with keyword args: %s",
                ruleKwargs)
            try:
                resp = self.rucio.createReplicationRule(container,
                                                        rseExpression=rseName,
                                                        **ruleKwargs)
            except Exception:
                msg = "Failed to create container rule for (retrying with approval): %s" % container
                logging.warning(msg)
                ruleKwargs["ask_approval"] = True
                try:
                    resp = self.rucio.createReplicationRule(
                        container, rseExpression=rseName, **ruleKwargs)
                except Exception as exc:
                    msg = "Failed once again to create container rule for: %s " % container
                    msg += "\nWill retry again in the next cycle. Error: %s" % str(
                        exc)
                    logging.error(msg)
                    continue
            if resp:
                logging.info("Container rule created for %s under rule id: %s",
                             container, resp)
                subscriptionsMade.append(subInfo['id'])
            else:
                logging.error("Failed to create rule for container: %s",
                              container)

        # Register the result in DBSBuffer
        if subscriptionsMade:
            self.markSubscribed.execute(subscriptionsMade)
            logging.info(
                "%d containers successfully locked in Rucio and local database",
                len(subscriptionsMade))

        return

    def _activityMap(self, rseName):
        """
        It maps the WMAgent type (Production vs T0) and the RSE name to
        properly set the rule activity field
        :param rseName: a string with the RSE name
        :return: a string with the rule activity
        """
        if not self.isT0agent and not rseName.endswith("_Tape"):
            return "Production Output"
        elif self.isT0agent and rseName.endswith("_Tape"):
            return "T0 Tape"
        elif self.isT0agent:
            return "T0 Export"
        else:
            msg = "This code should never be reached. Report it to the developers. "
            msg += "Trying to create container rule for RSE name: {}".format(
                rseName)
            raise WMRucioException(msg)
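
    # Hedged summary of the _activityMap branching above:
    #
    #   agent type   RSE name    rule activity
    #   ----------   ---------   -------------------
    #   Production   non-Tape    "Production Output"
    #   T0           *_Tape      "T0 Tape"
    #   T0           non-Tape    "T0 Export"
    #   Production   *_Tape      WMRucioException (never expected)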
Exemple #24
0
class WMBSHelperTest(EmulatedUnitTestCase):
    def setUp(self):
        """
        _setUp_

        """
        super(WMBSHelperTest, self).setUp()

        self.testInit = TestInitCouchApp(__file__)
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.testInit.setupCouch("wmbshelper_t/jobs", "JobDump")
        self.testInit.setupCouch("wmbshelper_t/fwjrs", "FWJRDump")
        self.testInit.setupCouch("config_test", "GroupUser", "ConfigCache")
        os.environ["COUCHDB"] = "wmbshelper_t"
        self.testInit.setSchema(customModules=[
            "WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir",
            "WMCore.ResourceControl"
        ],
                                useDefault=False)

        self.workDir = self.testInit.generateWorkDir()

        self.wmspec = self.createWMSpec()
        self.topLevelTask = getFirstTask(self.wmspec)
        self.inputDataset = self.topLevelTask.inputDataset()
        self.dataset = self.topLevelTask.getInputDatasetPath()
        self.dbs = DBSReader(self.inputDataset.dbsurl)
        self.rucioAcct = "wmcore_transferor"
        self.rucio = Rucio(self.rucioAcct)
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=threading.currentThread().logger,
                                     dbinterface=threading.currentThread().dbi)

        self.configFile = EmulatorSetup.setupWMAgentConfig()
        self.config = loadConfigurationFile(self.configFile)

        self.config.component_("JobSubmitter")
        self.config.JobSubmitter.submitDir = self.workDir
        self.config.JobSubmitter.submitScript = os.path.join(
            getTestBase(), 'WMComponent_t/JobSubmitter_t', 'submit.sh')

        return

    def tearDown(self):
        """
        _tearDown_

        Clear out the database.
        """
        self.testInit.clearDatabase()
        self.testInit.tearDownCouch()
        self.testInit.delWorkDir()
        EmulatorSetup.deleteConfig(self.configFile)
        super(WMBSHelperTest, self).tearDown()

        return

    def setupForKillTest(self, baAPI=None):
        """
        _setupForKillTest_

        Inject a workflow into WMBS that has a processing task, a merge task and
        a cleanup task.  Inject files into the various tasks at various
        processing states (acquired, complete, available...).  Also create jobs
        for each subscription in various states.
        """
        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMCore.WMBS",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)

        dummyLocationAction = daoFactory(classname="Locations.New")
        changeStateAction = daoFactory(classname="Jobs.ChangeState")
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertThreshold(siteName='site1', taskType='Processing', \
                                        maxSlots=10000, pendingSlots=10000)

        userDN = 'someDN'
        userAction = daoFactory(classname="Users.New")
        userAction.execute(dn=userDN,
                           group_name='DEFAULT',
                           role_name='DEFAULT')

        inputFileset = Fileset("input")
        inputFileset.create()

        inputFileA = File("lfnA", locations="goodse.cern.ch")
        inputFileB = File("lfnB", locations="goodse.cern.ch")
        inputFileC = File("lfnC", locations="goodse.cern.ch")
        inputFileA.create()
        inputFileB.create()
        inputFileC.create()

        inputFileset.addFile(inputFileA)
        inputFileset.addFile(inputFileB)
        inputFileset.addFile(inputFileC)
        inputFileset.commit()

        unmergedOutputFileset = Fileset("unmerged")
        unmergedOutputFileset.create()

        unmergedFileA = File("ulfnA", locations="goodse.cern.ch")
        unmergedFileB = File("ulfnB", locations="goodse.cern.ch")
        unmergedFileC = File("ulfnC", locations="goodse.cern.ch")
        unmergedFileA.create()
        unmergedFileB.create()
        unmergedFileC.create()

        unmergedOutputFileset.addFile(unmergedFileA)
        unmergedOutputFileset.addFile(unmergedFileB)
        unmergedOutputFileset.addFile(unmergedFileC)
        unmergedOutputFileset.commit()

        mainProcWorkflow = Workflow(spec="spec1",
                                    owner="Steve",
                                    name="Main",
                                    task="Proc")
        mainProcWorkflow.create()
        mainProcMergeWorkflow = Workflow(spec="spec1",
                                         owner="Steve",
                                         name="Main",
                                         task="ProcMerge")
        mainProcMergeWorkflow.create()
        mainCleanupWorkflow = Workflow(spec="spec1",
                                       owner="Steve",
                                       name="Main",
                                       task="Cleanup")
        mainCleanupWorkflow.create()

        self.mainProcSub = Subscription(fileset=inputFileset,
                                        workflow=mainProcWorkflow,
                                        type="Processing")
        self.mainProcSub.create()
        self.mainProcSub.acquireFiles(inputFileA)
        self.mainProcSub.completeFiles(inputFileB)

        procJobGroup = JobGroup(subscription=self.mainProcSub)
        procJobGroup.create()
        self.procJobA = Job(name="ProcJobA")
        self.procJobA["state"] = "new"
        self.procJobA["location"] = "site1"
        self.procJobB = Job(name="ProcJobB")
        self.procJobB["state"] = "executing"
        self.procJobB["location"] = "site1"
        self.procJobC = Job(name="ProcJobC")
        self.procJobC["state"] = "complete"
        self.procJobC["location"] = "site1"
        self.procJobA.create(procJobGroup)
        self.procJobB.create(procJobGroup)
        self.procJobC.create(procJobGroup)

        self.mainMergeSub = Subscription(fileset=unmergedOutputFileset,
                                         workflow=mainProcMergeWorkflow,
                                         type="Merge")
        self.mainMergeSub.create()
        self.mainMergeSub.acquireFiles(unmergedFileA)
        self.mainMergeSub.failFiles(unmergedFileB)

        mergeJobGroup = JobGroup(subscription=self.mainMergeSub)
        mergeJobGroup.create()
        self.mergeJobA = Job(name="MergeJobA")
        self.mergeJobA["state"] = "exhausted"
        self.mergeJobA["location"] = "site1"
        self.mergeJobB = Job(name="MergeJobB")
        self.mergeJobB["state"] = "cleanout"
        self.mergeJobB["location"] = "site1"
        self.mergeJobC = Job(name="MergeJobC")
        self.mergeJobC["state"] = "new"
        self.mergeJobC["location"] = "site1"
        self.mergeJobA.create(mergeJobGroup)
        self.mergeJobB.create(mergeJobGroup)
        self.mergeJobC.create(mergeJobGroup)

        self.mainCleanupSub = Subscription(fileset=unmergedOutputFileset,
                                           workflow=mainCleanupWorkflow,
                                           type="Cleanup")
        self.mainCleanupSub.create()
        self.mainCleanupSub.acquireFiles(unmergedFileA)
        self.mainCleanupSub.completeFiles(unmergedFileB)

        cleanupJobGroup = JobGroup(subscription=self.mainCleanupSub)
        cleanupJobGroup.create()
        self.cleanupJobA = Job(name="CleanupJobA")
        self.cleanupJobA["state"] = "new"
        self.cleanupJobA["location"] = "site1"
        self.cleanupJobB = Job(name="CleanupJobB")
        self.cleanupJobB["state"] = "executing"
        self.cleanupJobB["location"] = "site1"
        self.cleanupJobC = Job(name="CleanupJobC")
        self.cleanupJobC["state"] = "complete"
        self.cleanupJobC["location"] = "site1"
        self.cleanupJobA.create(cleanupJobGroup)
        self.cleanupJobB.create(cleanupJobGroup)
        self.cleanupJobC.create(cleanupJobGroup)

        jobList = [
            self.procJobA, self.procJobB, self.procJobC, self.mergeJobA,
            self.mergeJobB, self.mergeJobC, self.cleanupJobA, self.cleanupJobB,
            self.cleanupJobC
        ]

        changeStateAction.execute(jobList)

        if baAPI:
            for job in jobList:
                job['plugin'] = 'TestPlugin'
                job['userdn'] = userDN
                job['usergroup'] = 'DEFAULT'
                job['userrole'] = 'DEFAULT'
                job['custom']['location'] = 'site1'
            baAPI.createNewJobs(wmbsJobs=jobList)

        # We'll create an unrelated workflow to verify that it isn't affected
        # by the killing code.
        bogusFileset = Fileset("dontkillme")
        bogusFileset.create()

        bogusFileA = File("bogus/lfnA", locations="goodse.cern.ch")
        bogusFileA.create()
        bogusFileset.addFile(bogusFileA)
        bogusFileset.commit()

        bogusWorkflow = Workflow(spec="spec2",
                                 owner="Steve",
                                 name="Bogus",
                                 task="Proc")
        bogusWorkflow.create()
        self.bogusSub = Subscription(fileset=bogusFileset,
                                     workflow=bogusWorkflow,
                                     type="Processing")
        self.bogusSub.create()
        self.bogusSub.acquireFiles(bogusFileA)
        return

    def verifyFileKillStatus(self):
        """
        _verifyFileKillStatus_

        Verify that all files were killed correctly.  The status of files in
        Cleanup and LogCollect subscriptions isn't modified.  Status of
        already completed and failed files is not modified.  Also verify that
        the bogus subscription is untouched.
        """
        failedFiles = self.mainProcSub.filesOfStatus("Failed")
        acquiredFiles = self.mainProcSub.filesOfStatus("Acquired")
        completedFiles = self.mainProcSub.filesOfStatus("Completed")
        availableFiles = self.mainProcSub.filesOfStatus("Available")
        bogusAcquiredFiles = self.bogusSub.filesOfStatus("Acquired")

        self.assertEqual(len(availableFiles), 0, \
                         "Error: There should be no available files.")
        self.assertEqual(len(acquiredFiles), 0, \
                         "Error: There should be no acquired files.")
        self.assertEqual(len(bogusAcquiredFiles), 1, \
                         "Error: There should be one acquired file.")

        self.assertEqual(len(completedFiles), 3, \
                         "Error: There should be three completed files.")
        goldenLFNs = ["lfnA", "lfnB", "lfnC"]
        for completedFile in completedFiles:
            self.assertTrue(completedFile["lfn"] in goldenLFNs, \
                            "Error: Extra completed file.")
            goldenLFNs.remove(completedFile["lfn"])

        self.assertEqual(len(failedFiles), 0, \
                         "Error: There should be no failed files.")

        self.assertEqual(len(goldenLFNs), 0, \
                         "Error: Missing LFN")

        failedFiles = self.mainMergeSub.filesOfStatus("Failed")
        acquiredFiles = self.mainMergeSub.filesOfStatus("Acquired")
        completedFiles = self.mainMergeSub.filesOfStatus("Completed")
        availableFiles = self.mainMergeSub.filesOfStatus("Available")

        self.assertEqual(len(acquiredFiles), 0, \
                         "Error: Merge subscription should have 0 acq files.")
        self.assertEqual(len(availableFiles), 0, \
                         "Error: Merge subscription should have 0 avail files.")

        self.assertEqual(len(failedFiles), 1, \
                         "Error: Merge subscription should have 1 failed file.")
        self.assertEqual(
            list(failedFiles)[0]["lfn"], "ulfnB", "Error: Wrong failed file.")

        self.assertEqual(len(completedFiles), 2, \
                         "Error: Merge subscription should have 2 compl files.")
        goldenLFNs = ["ulfnA", "ulfnC"]
        for completedFile in completedFiles:
            self.assertTrue(completedFile["lfn"] in goldenLFNs, \
                            "Error: Extra complete file.")
            goldenLFNs.remove(completedFile["lfn"])

        self.assertEqual(len(goldenLFNs), 0, \
                         "Error: Missing LFN")

        failedFiles = self.mainCleanupSub.filesOfStatus("Failed")
        acquiredFiles = self.mainCleanupSub.filesOfStatus("Acquired")
        completedFiles = self.mainCleanupSub.filesOfStatus("Completed")
        availableFiles = self.mainCleanupSub.filesOfStatus("Available")

        self.assertEqual(len(failedFiles), 0, \
                         "Error: Cleanup subscription should have 0 fai files.")

        self.assertEqual(len(acquiredFiles), 1, \
                         "Error: There should be only one acquired file.")
        self.assertEqual(list(acquiredFiles)[0]["lfn"], "ulfnA", \
                         "Error: Wrong acquired LFN.")

        self.assertEqual(len(completedFiles), 1, \
                         "Error: There should be only one completed file.")
        self.assertEqual(list(completedFiles)[0]["lfn"], "ulfnB", \
                         "Error: Wrong completed LFN.")

        self.assertEqual(len(availableFiles), 1, \
                         "Error: There should be only one available file.")
        self.assertEqual(list(availableFiles)[0]["lfn"], "ulfnC", \
                         "Error: Wrong available LFN.")

        return

    def verifyJobKillStatus(self):
        """
        _verifyJobKillStatus_

        Verify that jobs are killed correctly.  Jobs belonging to Cleanup and
        LogCollect subscriptions are not killed.  The status of jobs that have
        already finished running is not changed.
        """
        self.procJobA.load()
        self.procJobB.load()
        self.procJobC.load()

        self.assertEqual(self.procJobA["state"], "killed", \
                         "Error: Proc job A should be killed.")
        self.assertEqual(self.procJobB["state"], "killed", \
                         "Error: Proc job B should be killed.")
        self.assertEqual(self.procJobC["state"], "complete", \
                         "Error: Proc job C should be complete.")

        self.mergeJobA.load()
        self.mergeJobB.load()
        self.mergeJobC.load()

        self.assertEqual(self.mergeJobA["state"], "exhausted", \
                         "Error: Merge job A should be exhausted.")
        self.assertEqual(self.mergeJobB["state"], "cleanout", \
                         "Error: Merge job B should be cleanout.")
        self.assertEqual(self.mergeJobC["state"], "killed", \
                         "Error: Merge job C should be killed.")

        self.cleanupJobA.load()
        self.cleanupJobB.load()
        self.cleanupJobC.load()

        self.assertEqual(self.cleanupJobA["state"], "new", \
                         "Error: Cleanup job A should be new.")
        self.assertEqual(self.cleanupJobB["state"], "executing", \
                         "Error: Cleanup job B should be executing.")
        self.assertEqual(self.cleanupJobC["state"], "complete", \
                         "Error: Cleanup job C should be complete.")
        return

    def createTestWMSpec(self):
        """
        _createTestWMSpec_

        Create a WMSpec that has processing, merge, cleanup and skim tasks that
        can be used by the subscription creation test.
        """
        testWorkload = WMWorkloadHelper(WMWorkload("TestWorkload"))
        testWorkload.setDashboardActivity("TestReReco")
        testWorkload.setSpecUrl("/path/to/workload")
        testWorkload.setOwnerDetails("sfoulkes", "DMWM", {'dn': 'MyDN'})

        procTask = testWorkload.newTask("ProcessingTask")
        procTask.setTaskType("Processing")
        procTask.setSplittingAlgorithm("FileBased", files_per_job=1)
        procTaskCMSSW = procTask.makeStep("cmsRun1")
        procTaskCMSSW.setStepType("CMSSW")
        procTaskCMSSWHelper = procTaskCMSSW.getTypeHelper()
        procTask.setTaskType("Processing")
        procTask.setSiteWhitelist(["site1"])
        procTask.setSiteBlacklist(["site2"])
        procTask.applyTemplates()

        procTaskCMSSWHelper.addOutputModule("OutputA",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierA",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        mergeTask = procTask.addTask("MergeTask")
        mergeTask.setInputReference(procTaskCMSSW,
                                    outputModule="OutputA",
                                    dataTier='DataTierA')
        mergeTask.setTaskType("Merge")
        mergeTask.setSplittingAlgorithm("WMBSMergeBySize",
                                        min_merge_size=1,
                                        max_merge_size=2,
                                        max_merge_events=3)
        mergeTaskCMSSW = mergeTask.makeStep("cmsRun1")
        mergeTaskCMSSW.setStepType("CMSSW")
        mergeTaskCMSSWHelper = mergeTaskCMSSW.getTypeHelper()
        mergeTask.setTaskType("Merge")
        mergeTask.applyTemplates()

        mergeTaskCMSSWHelper.addOutputModule("Merged",
                                             primaryDataset="bogusPrimary",
                                             processedDataset="bogusProcessed",
                                             dataTier="DataTierA",
                                             lfnBase="bogusUnmerged",
                                             mergedLFNBase="bogusMerged",
                                             filterName=None)

        cleanupTask = procTask.addTask("CleanupTask")
        cleanupTask.setInputReference(procTaskCMSSW,
                                      outputModule="OutputA",
                                      dataTier="DataTierA")
        cleanupTask.setTaskType("Merge")
        cleanupTask.setSplittingAlgorithm("SiblingProcessingBased",
                                          files_per_job=50)
        cleanupTaskCMSSW = cleanupTask.makeStep("cmsRun1")
        cleanupTaskCMSSW.setStepType("CMSSW")
        cleanupTask.setTaskType("Cleanup")
        cleanupTask.applyTemplates()

        skimTask = mergeTask.addTask("SkimTask")
        skimTask.setTaskType("Skim")
        skimTask.setInputReference(mergeTaskCMSSW,
                                   outputModule="Merged",
                                   dataTier="DataTierA")
        skimTask.setSplittingAlgorithm("FileBased",
                                       files_per_job=1,
                                       include_parents=True)
        skimTaskCMSSW = skimTask.makeStep("cmsRun1")
        skimTaskCMSSW.setStepType("CMSSW")
        skimTaskCMSSWHelper = skimTaskCMSSW.getTypeHelper()
        skimTask.setTaskType("Skim")
        skimTask.applyTemplates()

        skimTaskCMSSWHelper.addOutputModule("SkimOutputA",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierA",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        skimTaskCMSSWHelper.addOutputModule("SkimOutputB",
                                            primaryDataset="bogusPrimary",
                                            processedDataset="bogusProcessed",
                                            dataTier="DataTierB",
                                            lfnBase="bogusUnmerged",
                                            mergedLFNBase="bogusMerged",
                                            filterName=None)

        return testWorkload

    def setupMCWMSpec(self):
        """Setup MC workflow"""
        self.wmspec = self.createMCWMSpec()
        self.topLevelTask = getFirstTask(self.wmspec)
        self.inputDataset = self.topLevelTask.inputDataset()
        self.dataset = self.topLevelTask.getInputDatasetPath()
        self.dbs = None

        # add sites that would normally be added by operator via resource_control
        locationDAO = self.daoFactory(classname="Locations.New")
        self.pnns = []
        for site in ['T2_XX_SiteA', 'T2_XX_SiteB']:
            locationDAO.execute(siteName=site, pnn=site)
            self.pnns.append(site)

    def createWMSpec(self, name='ReRecoWorkload'):
        factory = ReRecoWorkloadFactory()
        rerecoArgs["ConfigCacheID"] = createConfig(rerecoArgs["CouchDBName"])
        wmspec = factory.factoryWorkloadConstruction(name, rerecoArgs)
        wmspec.setSpecUrl("/path/to/workload")
        wmspec.setSubscriptionInformation(custodialSites=[],
                                          nonCustodialSites=[],
                                          autoApproveSites=[],
                                          priority="Low",
                                          custodialSubType="Move")
        return wmspec

    def createMCWMSpec(self, name='MonteCarloWorkload'):
        mcArgs = TaskChainWorkloadFactory.getTestArguments()
        mcArgs["CouchDBName"] = rerecoArgs["CouchDBName"]
        mcArgs["Task1"]["ConfigCacheID"] = createConfig(mcArgs["CouchDBName"])

        wmspec = taskChainWorkload(name, mcArgs)
        wmspec.setSpecUrl("/path/to/workload")
        getFirstTask(wmspec).addProduction(totalevents=10000)
        return wmspec

    def getDBS(self, wmspec):
        topLevelTask = getFirstTask(wmspec)
        inputDataset = topLevelTask.inputDataset()
        dbs = DBSReader(inputDataset.dbsurl)
        # dbsDict = {self.inputDataset.dbsurl : self.dbs}
        return dbs

    def createWMBSHelperWithTopTask(self,
                                    wmspec,
                                    block,
                                    mask=None,
                                    parentFlag=False,
                                    detail=False,
                                    commonLocation=None):

        topLevelTask = getFirstTask(wmspec)

        wmbs = WMBSHelper(wmspec,
                          topLevelTask.name(),
                          block,
                          mask,
                          cachepath=self.workDir,
                          commonLocation=commonLocation)
        if block:
            blockName = block
            if parentFlag:
                block = self.dbs.getFileBlockWithParents(blockName)
            else:
                block = self.dbs.getFileBlock(blockName)
            # the replica lookup is identical in both branches, so do it once
            data = self.rucio.getReplicaInfoForBlocks(block=[blockName])
            block['PhEDExNodeNames'] = data[0]["replica"]
        sub, files = wmbs.createSubscriptionAndAddFiles(block=block)
        if detail:
            return wmbs, sub, files
        else:
            return wmbs
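
    # Hedged sketch of the Rucio reply shape assumed by
    # createWMBSHelperWithTopTask above (one entry per queried block;
    # values purely illustrative):
    #
    #   data = [{"name": "/Prim/Proc-v1/AOD#uuid",
    #            "replica": ["T1_XX_Site_Disk", "T2_YY_Site"]}]
    #   block['PhEDExNodeNames'] = data[0]["replica"]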

    def testKillWorkflow(self):
        """
        _testKillWorkflow_

        Verify that workflow killing works correctly.
        """
        baAPI = BossAirAPI(config=self.config, insertStates=True)

        # Create nine jobs
        self.setupForKillTest(baAPI=baAPI)
        self.assertEqual(len(baAPI._listRunJobs()), 9)
        killWorkflow("Main", self.config, self.config)

        self.verifyFileKillStatus()
        self.verifyJobKillStatus()
        self.assertEqual(len(baAPI._listRunJobs()), 8)

        return

    def testCreateSubscription(self):
        """
        _testCreateSubscription_

        Verify that the subscription creation code works correctly.
        """
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertSite(siteName='site2',
                                   pnn='goodse2.cern.ch',
                                   ceName='site2',
                                   plugin="TestPlugin")

        testWorkload = self.createTestWMSpec()
        testTopLevelTask = getFirstTask(testWorkload)
        testWMBSHelper = WMBSHelper(testWorkload,
                                    testTopLevelTask.name(),
                                    "SomeBlock",
                                    cachepath=self.workDir)
        testWMBSHelper.createTopLevelFileset()
        testWMBSHelper._createSubscriptionsInWMBS(
            testTopLevelTask, testWMBSHelper.topLevelFileset)

        procWorkflow = Workflow(name="TestWorkload",
                                task="/TestWorkload/ProcessingTask")
        procWorkflow.load()

        self.assertEqual(procWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner: %s" % procWorkflow.owner)
        self.assertEqual(procWorkflow.group, "DMWM",
                         "Error: Wrong group: %s" % procWorkflow.group)
        self.assertEqual(procWorkflow.wfType, "TestReReco",
                         "Error: Wrong type.")
        self.assertEqual(
            procWorkflow.spec,
            os.path.join(self.workDir, procWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(procWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")
        mergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][
            "merged_output_fileset"]
        unmergedProcOutput = procWorkflow.outputMap["OutputADataTierA"][0][
            "output_fileset"]

        mergedProcOutput.loadData()
        unmergedProcOutput.loadData()
        self.assertEqual(
            mergedProcOutput.name,
            "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedProcOutput.name,
            "/TestWorkload/ProcessingTask/unmerged-OutputADataTierA",
            "Error: Unmerged output fileset is wrong.")

        mergeWorkflow = Workflow(name="TestWorkload",
                                 task="/TestWorkload/ProcessingTask/MergeTask")
        mergeWorkflow.load()

        self.assertEqual(mergeWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            mergeWorkflow.spec,
            os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(mergeWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")

        cleanupWorkflow = Workflow(
            name="TestWorkload",
            task="/TestWorkload/ProcessingTask/CleanupTask")
        cleanupWorkflow.load()

        self.assertEqual(cleanupWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            cleanupWorkflow.spec,
            os.path.join(self.workDir, cleanupWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(cleanupWorkflow.outputMap), 0,
                         "Error: Wrong number of WF outputs.")

        unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0][
            "output_fileset"]
        unmergedMergeOutput.loadData()

        self.assertEqual(
            unmergedMergeOutput.name,
            "/TestWorkload/ProcessingTask/MergeTask/merged-MergedDataTierA",
            "Error: Unmerged output fileset is wrong.")

        skimWorkflow = Workflow(
            name="TestWorkload",
            task="/TestWorkload/ProcessingTask/MergeTask/SkimTask")
        skimWorkflow.load()

        self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.")
        self.assertEqual(
            skimWorkflow.spec,
            os.path.join(self.workDir, skimWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(skimWorkflow.outputMap), 2,
                         "Error: Wrong number of WF outputs.")

        mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0][
            "merged_output_fileset"]
        unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][
            0]["output_fileset"]
        mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0][
            "merged_output_fileset"]
        unmergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][
            0]["output_fileset"]

        mergedSkimOutputA.loadData()
        mergedSkimOutputB.loadData()
        unmergedSkimOutputA.loadData()
        unmergedSkimOutputB.loadData()

        self.assertEqual(
            mergedSkimOutputA.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Merged output fileset is wrong: %s" %
            mergedSkimOutputA.name)
        self.assertEqual(
            unmergedSkimOutputA.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Unmerged output fileset is wrong.")
        self.assertEqual(
            mergedSkimOutputB.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedSkimOutputB.name,
            "/TestWorkload/ProcessingTask/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Unmerged output fileset is wrong.")

        topLevelFileset = Fileset(name="TestWorkload-ProcessingTask-SomeBlock")
        topLevelFileset.loadData()

        procSubscription = Subscription(fileset=topLevelFileset,
                                        workflow=procWorkflow)
        procSubscription.loadData()

        self.assertEqual(len(procSubscription.getWhiteBlackList()), 2,
                         "Error: Wrong site white/black list for proc sub.")
        for site in procSubscription.getWhiteBlackList():
            if site["site_name"] == "site1":
                self.assertEqual(site["valid"], 1,
                                 "Error: Site should be white listed.")
            else:
                self.assertEqual(site["valid"], 0,
                                 "Error: Site should be black listed.")

        self.assertEqual(procSubscription["type"], "Processing",
                         "Error: Wrong subscription type.")
        self.assertEqual(procSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")

        mergeSubscription = Subscription(fileset=unmergedProcOutput,
                                         workflow=mergeWorkflow)
        mergeSubscription.loadData()

        self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0,
                         "Error: Wrong white/black list for merge sub.")

        self.assertEqual(mergeSubscription["type"], "Merge",
                         "Error: Wrong subscription type.")
        self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize",
                         "Error: Wrong split algo.")

        skimSubscription = Subscription(fileset=unmergedMergeOutput,
                                        workflow=skimWorkflow)
        skimSubscription.loadData()

        self.assertEqual(skimSubscription["type"], "Skim",
                         "Error: Wrong subscription type.")
        self.assertEqual(skimSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")
        return

    def testTruncatedWFInsertion(self):
        """
        _testTruncatedWFInsertion_

        """
        resourceControl = ResourceControl()
        resourceControl.insertSite(siteName='site1',
                                   pnn='goodse.cern.ch',
                                   ceName='site1',
                                   plugin="TestPlugin")
        resourceControl.insertSite(siteName='site2',
                                   pnn='goodse2.cern.ch',
                                   ceName='site2',
                                   plugin="TestPlugin")

        testWorkload = self.createTestWMSpec()
        testTopLevelTask = getFirstTask(testWorkload)
        testWMBSHelper = WMBSHelper(testWorkload,
                                    testTopLevelTask.name(),
                                    "SomeBlock",
                                    cachepath=self.workDir)
        testWMBSHelper.createTopLevelFileset()
        testWMBSHelper._createSubscriptionsInWMBS(
            testTopLevelTask, testWMBSHelper.topLevelFileset)

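        # truncate the spec at MergeTask, making it a top-level task of the new
        # ResubmitTestWorkload (its CleanupTask comes along as well)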
        testWorkload.truncate("ResubmitTestWorkload",
                              "/TestWorkload/ProcessingTask/MergeTask",
                              "someserver", "somedatabase")

        # create the subscriptions for multiple top-level tasks (MergeTask and CleanupTask for the same block)
        for task in testWorkload.getTopLevelTask():
            testResubmitWMBSHelper = WMBSHelper(testWorkload,
                                                task.name(),
                                                "SomeBlock2",
                                                cachepath=self.workDir)
            testResubmitWMBSHelper.createTopLevelFileset()
            testResubmitWMBSHelper._createSubscriptionsInWMBS(
                task, testResubmitWMBSHelper.topLevelFileset)

        mergeWorkflow = Workflow(name="ResubmitTestWorkload",
                                 task="/ResubmitTestWorkload/MergeTask")
        mergeWorkflow.load()

        self.assertEqual(mergeWorkflow.owner, "sfoulkes",
                         "Error: Wrong owner.")
        self.assertEqual(
            mergeWorkflow.spec,
            os.path.join(self.workDir, mergeWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(mergeWorkflow.outputMap), 1,
                         "Error: Wrong number of WF outputs.")

        unmergedMergeOutput = mergeWorkflow.outputMap["MergedDataTierA"][0]["output_fileset"]
        unmergedMergeOutput.loadData()

        self.assertEqual(
            unmergedMergeOutput.name,
            "/ResubmitTestWorkload/MergeTask/merged-MergedDataTierA",
            "Error: Unmerged output fileset is wrong.")

        skimWorkflow = Workflow(
            name="ResubmitTestWorkload",
            task="/ResubmitTestWorkload/MergeTask/SkimTask")
        skimWorkflow.load()

        self.assertEqual(skimWorkflow.owner, "sfoulkes", "Error: Wrong owner.")
        self.assertEqual(
            skimWorkflow.spec,
            os.path.join(self.workDir, skimWorkflow.name, "WMSandbox",
                         "WMWorkload.pkl"), "Error: Wrong spec URL")
        self.assertEqual(len(skimWorkflow.outputMap), 2,
                         "Error: Wrong number of WF outputs.")

        mergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0]["merged_output_fileset"]
        unmergedSkimOutputA = skimWorkflow.outputMap["SkimOutputADataTierA"][0]["output_fileset"]
        mergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0]["merged_output_fileset"]
        unmergedSkimOutputB = skimWorkflow.outputMap["SkimOutputBDataTierB"][0]["output_fileset"]

        mergedSkimOutputA.loadData()
        mergedSkimOutputB.loadData()
        unmergedSkimOutputA.loadData()
        unmergedSkimOutputB.loadData()

        self.assertEqual(
            mergedSkimOutputA.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Merged output fileset is wrong: %s" %
            mergedSkimOutputA.name)
        self.assertEqual(
            unmergedSkimOutputA.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputADataTierA",
            "Error: Unmerged output fileset is wrong.")
        self.assertEqual(
            mergedSkimOutputB.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Merged output fileset is wrong.")
        self.assertEqual(
            unmergedSkimOutputB.name,
            "/ResubmitTestWorkload/MergeTask/SkimTask/unmerged-SkimOutputBDataTierB",
            "Error: Unmerged output fileset is wrong.")

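        # the resubmitted workload's top-level fileset follows the same
        # workflow-task-block naming convention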
        topLevelFileset = Fileset(name="ResubmitTestWorkload-MergeTask-SomeBlock2")
        topLevelFileset.loadData()

        mergeSubscription = Subscription(fileset=topLevelFileset,
                                         workflow=mergeWorkflow)
        mergeSubscription.loadData()

        self.assertEqual(len(mergeSubscription.getWhiteBlackList()), 0,
                         "Error: Wrong white/black list for merge sub.")

        self.assertEqual(mergeSubscription["type"], "Merge",
                         "Error: Wrong subscription type.")
        self.assertEqual(mergeSubscription["split_algo"], "WMBSMergeBySize",
                         "Error: Wrong split algo.")

        skimSubscription = Subscription(fileset=unmergedMergeOutput,
                                        workflow=skimWorkflow)
        skimSubscription.loadData()

        self.assertEqual(skimSubscription["type"], "Skim",
                         "Error: Wrong subscription type.")
        self.assertEqual(skimSubscription["split_algo"], "FileBased",
                         "Error: Wrong split algo.")

        return

    def testReReco(self):
        """ReReco workflow"""
        # create workflow
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 5)

    def testReRecoBlackRunRestriction(self):
        """ReReco workflow with Run restrictions"""
        block = self.dataset + "#" + BLOCK2
        self.topLevelTask.setInputRunBlacklist([181183])  # blacklist the only run in the block
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)

        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 0)

    def testReRecoWhiteRunRestriction(self):
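        """ReReco workflow with a run whitelist"""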
        block = self.dataset + "#" + BLOCK2
        self.topLevelTask.setInputRunWhitelist([181183])  # whitelist the only run in the block
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 1)

    def testLumiMaskRestrictionsOK(self):
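        """Lumi mask that matches files in the block"""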
        block = self.dataset + "#" + BLOCK1
        self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = ['181367']
        self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = ['57,80']
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 1)

    def testLumiMaskRestrictionsKO(self):
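        """Lumi mask that matches no files in the block"""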
        block = self.dataset + "#" + BLOCK1
        self.wmspec.getTopLevelTask()[0].data.input.splitting.runs = ['123454321']
        self.wmspec.getTopLevelTask()[0].data.input.splitting.lumis = ['123,123']
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        files = wmbs.validFiles(self.dbs.getFileBlock(block)['Files'])
        self.assertEqual(len(files), 0)

    def testDuplicateFileInsert(self):
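        """Re-adding the same block must not insert duplicate files"""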
        # using default wmspec
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        # check initially inserted files.
        dbsFiles = self.dbs.getFileBlock(block)['Files']
        self.assertEqual(numOfFiles, len(dbsFiles))
        firstFileset = wmbs.topLevelFileset
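        # cross-check through the Files.InFileset DAO, which lists the files
        # attached to a fileset id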
        wmbsDao = wmbs.daofactory(classname="Files.InFileset")

        numOfFiles = len(wmbsDao.execute(firstFileset.id))
        self.assertEqual(numOfFiles, len(dbsFiles))

        # use a new spec with the same input dataset
        block = self.dataset + "#" + BLOCK1
        wmspec = self.createWMSpec("TestSpec1")
        dbs = self.getDBS(wmspec)
        wmbs = self.createWMBSHelperWithTopTask(wmspec, block)
        # check duplicate insert
        dbsFiles = dbs.getFileBlock(block)
        data = self.rucio.getReplicaInfoForBlocks(block=[block])
        dbsFiles['PhEDExNodeNames'] = data[0]["replica"]
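        # every file is already known to WMBS from the first spec, so no new
        # files should be added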
        numOfFiles = wmbs.addFiles(dbsFiles)
        self.assertEqual(numOfFiles, 0)
        secondFileset = wmbs.topLevelFileset

        wmbsDao = wmbs.daofactory(classname="Files.InFileset")
        numOfFiles = len(wmbsDao.execute(secondFileset.id))
        self.assertEqual(numOfFiles, len(dbsFiles['Files']))

        self.assertNotEqual(firstFileset.id, secondFileset.id)

    def testDuplicateSubscription(self):
        """Can't duplicate subscriptions"""
        siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"]
        # using default wmspec
        block = self.dataset + "#" + BLOCK1
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        filesetId = wmbs.topLevelFileset.id
        subId = wmbs.topLevelSubscription['id']

        # check initially inserted files.
        dbsFiles = self.dbs.getFileBlock(block)['Files']
        self.assertEqual(numOfFiles, len(dbsFiles))

        # Not clear what's supposed to happen here, 2nd test is completely redundant
        dummyFirstFileset = wmbs.topLevelFileset
        self.assertEqual(numOfFiles, len(dbsFiles))

        # reinsert subscription - shouldn't create anything new
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec, block)
        wmbs.topLevelFileset.loadData()
        self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files))
        self.assertEqual(filesetId, wmbs.topLevelFileset.id)
        self.assertEqual(subId, wmbs.topLevelSubscription['id'])

        # now do a Monte Carlo workflow
        self.setupMCWMSpec()
        mask = Mask(FirstRun=12,
                    FirstLumi=1234,
                    FirstEvent=12345,
                    LastEvent=999995,
                    LastLumi=12345,
                    LastRun=12)
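        # MC workflows have no input block; a single fake file covering the
        # mask is injected instead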
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        wmbs.topLevelFileset.loadData()
        numOfFiles = len(wmbs.topLevelFileset.files)
        filesetId = wmbs.topLevelFileset.id
        subId = wmbs.topLevelSubscription['id']

        # check initially inserted files.
        # Not clear what's supposed to happen here, 2nd test is completely redundant
        numDbsFiles = 1
        self.assertEqual(numOfFiles, numDbsFiles)
        dummyFirstFileset = wmbs.topLevelFileset
        self.assertEqual(numOfFiles, numDbsFiles)

        # reinsert subscription - shouldn't create anything new
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        wmbs.topLevelFileset.loadData()
        self.assertEqual(numOfFiles, len(wmbs.topLevelFileset.files))
        self.assertEqual(filesetId, wmbs.topLevelFileset.id)
        self.assertEqual(subId, wmbs.topLevelSubscription['id'])

    def testParentage(self):
        """
        1. check whether parent files are created in wmbs.
        2. check parent files are associated to child.
        3. When two specs with the same input data are inserted (one with parent
           processing, one without), and the spec without parent processing is
           inserted first, the second insert must still create the parent files
           even though the child files are duplicates.
        """

        # Swap out the dataset for one that has parents
        task = next(self.wmspec.taskIterator())
        oldDS = task.inputDataset()  # copy the old dataset; only its DBS URL is used
        task.addInputDataset(name="/Cosmics/ComissioningHI-PromptReco-v1/RECO",
                             primary='Cosmics',
                             processed='ComissioningHI-PromptReco-v1',
                             tier='RECO',
                             dbsurl=oldDS.dbsurl)
        block = '/Cosmics/ComissioningHI-PromptReco-v1/RECO' + '#5b89ba9c-0dbf-11e1-9b6c-003048caaace'

        # File creation without parents
        wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec,
                                                             block,
                                                             parentFlag=False,
                                                             detail=True)
        self.assertEqual(8, numFiles)
        wmbs.topLevelFileset.loadData()
        for child in wmbs.topLevelFileset.files:
            self.assertEqual(len(child["parents"]), 0)  # no parents per child

        # File creation with parents
        wmbs, _, numFiles = self.createWMBSHelperWithTopTask(self.wmspec,
                                                             block,
                                                             parentFlag=True,
                                                             detail=True)
        self.assertEqual(8, numFiles)
        wmbs.topLevelFileset.loadData()
        for child in wmbs.topLevelFileset.files:
            self.assertEqual(len(child["parents"]), 1)  # one parent per child

    def testMCFakeFileInjection(self):
        """Inject fake Monte Carlo files into WMBS"""

        # This test is failing because the name of the couch DB is set to None
        # in BasicProductionWorkload.getProdArgs(); changing it to
        # "reqmgr_config_cache_t" from the StdBase test arguments does not fix
        # it. testDuplicateSubscription probably has the same issue.
        siteWhitelist = ["T2_XX_SiteA", "T2_XX_SiteB"]

        self.setupMCWMSpec()

        mask = Mask(FirstRun=12,
                    FirstLumi=1234,
                    FirstEvent=12345,
                    LastEvent=999995,
                    LastLumi=12345,
                    LastRun=12)

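        # the injected fake file should span the whole mask: one run, the full
        # lumi range and an inclusive event count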
        wmbs = self.createWMBSHelperWithTopTask(self.wmspec,
                                                None,
                                                mask,
                                                commonLocation=siteWhitelist)
        subscription = wmbs.topLevelSubscription
        self.assertEqual(1, subscription.exists())
        fileset = subscription['fileset']
        self.assertEqual(1, fileset.exists())
        fileset.loadData()  # need to refresh from database

        self.assertEqual(len(fileset.files), 1)
        self.assertEqual(len(fileset.parents), 0)
        self.assertFalse(fileset.open)

        firstFile = list(fileset.files)[0]
        self.assertEqual(firstFile['events'],
                         mask['LastEvent'] - mask['FirstEvent'] + 1)  # inclusive range
        self.assertEqual(firstFile['merged'], False)  # merged files get added to DBS
        self.assertEqual(len(firstFile['parents']), 0)
        # firstFile.loadData()
        self.assertEqual(sorted(firstFile['locations']), sorted(self.pnns))
        self.assertEqual(len(firstFile.getParentLFNs()), 0)

        self.assertEqual(len(firstFile.getRuns()), 1)
        run = firstFile.getRuns()[0]
        self.assertEqual(run.run, mask['FirstRun'])
        self.assertEqual(run.lumis[0], mask['FirstLumi'])
        self.assertEqual(run.lumis[-1], mask['LastLumi'])
        self.assertEqual(len(run.lumis),
                         mask['LastLumi'] - mask['FirstLumi'] + 1)