Ejemplo n.º 1
0
    def newQueueElement(self, **args):
        # DBS Url may not be available in the initial task
        # but in the pileup data (MC pileup)
        dbsUrl = self.initialTask.dbsUrl()
        if dbsUrl is None and self.pileupData:
            # Get the first DBS found
            dbsUrl = next(iter(self.wmspec.listPileupDatasets()))

        args.setdefault('Status', 'Available')
        args.setdefault('WMSpec', self.wmspec)
        args.setdefault('Task', self.initialTask)
        args.setdefault('RequestName', self.wmspec.name())
        args.setdefault('TaskName', self.initialTask.name())
        args.setdefault('Dbs', dbsUrl)
        args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
        args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
        args.setdefault('StartPolicy', self.wmspec.startPolicy())
        args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
        args.setdefault('Priority', self.wmspec.priority())
        args.setdefault('PileupData', self.pileupData)
        if not args['Priority']:
            args['Priority'] = 0
        ele = WorkQueueElement(**args)
        for data, sites in viewitems(ele['Inputs']):
            if not sites:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'Input data has no locations "%s"' % data)

        # catch infinite splitting loops
        if len(self.workQueueElements) > self.args.get('maxRequestSize', 1e8):
            raise WorkQueueWMSpecError(
                self.wmspec, 'Too many elements (%d)' %
                self.args.get('MaxRequestElements', 1e8))
        self.workQueueElements.append(ele)
Ejemplo n.º 2
0
    def queueWork(self, wmspecUrl, request=None, team=None):
        """
        Take and queue work from a WMSpec.

        If request name is provided but doesn't match WMSpec name
        an error is raised.

        If team is provided work will only be available to queue's
        belonging to that team.

        Duplicate specs will be ignored.
        """
        self.logger.info('queueWork() begin queueing "%s"' % wmspecUrl)
        wmspec = WMWorkloadHelper()
        wmspec.load(wmspecUrl)

        if request:  # validate request name
            try:
                Lexicon.requestName(request)
            except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    wmspec, "Request name validation error: %s" % str(ex))
                raise error
            if request != wmspec.name():
                raise WorkQueueWMSpecError(
                    wmspec, 'Request & workflow name mismatch %s vs %s' %
                    (request, wmspec.name()))
Ejemplo n.º 3
0
    def validate(self):
        """Check args and spec work with block splitting"""
        StartPolicyInterface.validateCommon(self)

        if self.initialTask.totalEvents() < 1:
            raise WorkQueueNoWorkError(self.wmspec, 'Invalid total events selection: %s' % str(self.initialTask.totalEvents()))

        if self.mask and self.mask['LastEvent'] < self.mask['FirstEvent']:
            raise WorkQueueWMSpecError(self.wmspec, "Invalid start & end events")

        if self.mask and self.mask['LastLumi'] < self.mask['FirstLumi']:
            raise WorkQueueWMSpecError(self.wmspec, "Invalid start & end lumis")
Ejemplo n.º 4
0
    def __call__(self,
                 wmspec,
                 task,
                 data=None,
                 mask=None,
                 team=None,
                 continuous=False,
                 rucioObj=None):
        self.wmspec = wmspec
        # bring in spec specific settings
        self.args.update(self.wmspec.startPolicyParameters())
        self.initialTask = task
        if data:
            self.data = data
        self.mask = mask
        self.validate()
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            if pileupDatasets:
                self.pileupData = self.getDatasetLocations(pileupDatasets)
            self.split()
        # For known exceptions raise custom error that will fail the workflow.
        except dbsClientException as ex:
            # A dbs configuration error implies the spec is invalid
            error = WorkQueueWMSpecError(self.wmspec,
                                         "DBS config error: %s" % str(ex))
            raise error
        except AssertionError as ex:
            # Assertion generally means validation of an input field failed
            error = WorkQueueWMSpecError(self.wmspec,
                                         "Assertion error: %s" % str(ex))
            raise error
        except DBSReaderError as ex:
            # Hacky way of identifying non-existant data, DbsBadRequest chomped by DBSReader
            if 'Invalid parameters' in str(ex):
                data = task.data.input.pythonise_(
                ) if task.data.input else 'None'
                msg = """data: %s, mask: %s, pileup: %s. %s""" % (
                    str(data), str(mask), str(pileupDatasets), str(ex))
                error = WorkQueueNoWorkError(self.wmspec, msg)
                raise error
            raise  # propagate other dbs errors

        # if we have no new elements and we are not adding work to request
        # already running, then raise exception
        if not self.workQueueElements and not continuous:
            data = task.data.input.pythonise_() if task.data.input else 'None'
            msg = "Failed to add work. Input data: %s, mask: %s." % (str(data),
                                                                     str(mask))
            error = WorkQueueNoWorkError(self.wmspec, msg)
            raise error

        return self.workQueueElements, self.rejectedWork, self.badWork
Ejemplo n.º 5
0
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            # if self.data is not passed, assume the the data is input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(
                acdcInfo['collection'],
                acdcBlockSplit['TaskName'],
                acdcBlockSplit['Offset'],
                acdcBlockSplit['NumOfFiles'],
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.getTrustSitelists():
                dbsBlock["Sites"] = self.sites
            else:
                # TODO remove this line when all DBS origin_site_name is converted to PNN
                block["locations"] = self.siteDB.checkAndConvertSENameToPNN(
                    block["locations"])
                # upto this
                dbsBlock["Sites"] = self.siteDB.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'ACDC is not supported for %s' %
                    self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Ejemplo n.º 6
0
    def validateCommon(self):
        """Common validation stuff"""
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Workflow name validation error: %s" % str(ex))
            raise error

        if self.initialTask.siteWhitelist():
            if isinstance(self.initialTask.siteWhitelist(), basestring):
                error = WorkQueueWMSpecError(self.wmspec, 'Invalid site whitelist: Must be tuple/list but is %s' % type(
                    self.initialTask.siteWhitelist()))
                raise error
            try:
                [Lexicon.cmsname(site) for site in self.initialTask.siteWhitelist()]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(self.wmspec, "Site whitelist validation error: %s" % str(ex))
                raise error

        if self.initialTask.siteBlacklist():
            if isinstance(self.initialTask.siteBlacklist(), basestring):
                error = WorkQueueWMSpecError(self.wmspec, 'Invalid site blacklist: Must be tuple/list but is %s' % type(
                    self.initialTask.siteBlacklist()))
                raise error
            try:
                [Lexicon.cmsname(site) for site in self.initialTask.siteBlacklist()]
            except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(self.wmspec, "Site blacklist validation error: %s" % str(ex))
                raise error

        # splitter settings
        if self.args.get('SliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SliceSize parameter')
            raise error
        if self.args.get('SubSliceSize', 1) <= 0:
            error = WorkQueueWMSpecError(self.wmspec, 'Zero or negative SubSliceSize parameter')
            raise error

        # check input dataset is valid
        try:
            if self.initialTask.getInputDatasetPath():
                Lexicon.dataset(self.initialTask.getInputDatasetPath())
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Dataset validation error: %s" % str(ex))
            raise error

        # if pileup is found, check that they are valid datasets
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            for dbsUrl in pileupDatasets:
                for dataset in pileupDatasets[dbsUrl]:
                    Lexicon.dataset(dataset)
        except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(self.wmspec, "Pileup dataset validation error: %s" % str(ex))
            raise error
Ejemplo n.º 7
0
class StartPolicyInterface(PolicyInterface):
    """Interface for start policies"""
    def __init__(self, **args):
        PolicyInterface.__init__(self, **args)
        self.workQueueElements = []
        self.wmspec = None
        self.team = None
        self.initialTask = None
        self.splitParams = None
        self.dbs_pool = {}
        self.data = {}
        self.lumi = None
        self.couchdb = None
        self.rejectedWork = []  # List of inputs that were rejected
        self.pileupData = {}

    def split(self):
        """Apply policy to spec"""
        raise NotImplementedError

    def validate(self):
        """Check params and spec are appropriate for the policy"""
        raise NotImplementedError

    def validateCommon(self):
        """Common validation stuff"""
        try:
            Lexicon.requestName(self.wmspec.name())
        except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
            error = WorkQueueWMSpecError(
                self.wmspec, "Workflow name validation error: %s" % str(ex))
            raise error

        if self.initialTask.siteWhitelist():
            if type(self.initialTask.siteWhitelist()) in types.StringTypes:
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site whitelist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteWhitelist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteWhitelist()
                ]
            except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site whitelist validation error: %s" % str(ex))
                raise error
Ejemplo n.º 8
0
 def validateCommon(self):
     """Common validation stuff"""
     try:
         Lexicon.requestName(self.wmspec.name())
     except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
         error = WorkQueueWMSpecError(
             self.wmspec, "Workflow name validation error: %s" % str(ex))
         raise error
Ejemplo n.º 9
0
    def __call__(self, wmspec, task, data=None, mask=None, team=None):
        self.wmspec = wmspec
        # bring in spec specific settings
        self.args.update(self.wmspec.startPolicyParameters())
        self.initialTask = task
        if data:
            self.data = data
        self.mask = mask
        self.validate()
        try:
            pileupDatasets = self.wmspec.listPileupDatasets()
            if pileupDatasets:
                self.pileupData = self.getDatasetLocations(pileupDatasets)
            self.split()
        # For known exceptions raise custom error that will fail the workflow.
        except DbsConfigurationError as ex:
            # A dbs configuration error implies the spec is invalid
            error = WorkQueueWMSpecError(self.wmspec,
                                         "DBS config error: %s" % str(ex))
            raise error
        except AssertionError as ex:
            # Assertion generally means validation of an input field failed
            error = WorkQueueWMSpecError(self.wmspec,
                                         "Assertion error: %s" % str(ex))
            raise error
        except DBSReaderError as ex:
            # Hacky way of identifying non-existant data, DbsBadRequest chomped by DBSReader
            # DbsConnectionError: Database exception,Invalid parameters thrown by Summary api
            if 'DbsBadRequest' in str(ex) or 'Invalid parameters' in str(ex):
                data = task.data.input.pythonise_(
                ) if task.data.input else 'None'
                msg = """data: %s, mask: %s, pileup: %s. %s""" % (
                    str(data), str(mask), str(pileupDatasets), str(ex))
                error = WorkQueueNoWorkError(self.wmspec, msg)
                raise error
            raise  # propagate other dbs errors

        # if we have no elements then there was no work in the spec, fail it
        if not self.workQueueElements:
            data = task.data.input.pythonise_() if task.data.input else 'None'
            msg = """data: %s, mask: %s.""" % (str(data), str(mask))
            error = WorkQueueNoWorkError(self.wmspec, msg)
            raise error
        return self.workQueueElements, self.rejectedWork
Ejemplo n.º 10
0
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction according
           to the splitting algorithm"""
        validBlocks = []

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(next(iter(self.data)))
        else:
            # if self.data is not passed, assume the the data is input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = next(iter(self.data))
            block = acdc.getChunkInfo(acdcInfo['collection'],
                                      acdcBlockSplit['TaskName'],
                                      acdcBlockSplit['Offset'],
                                      acdcBlockSplit['NumOfFiles'])
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            if task.getTrustSitelists().get('trustlists'):
                dbsBlock["Sites"] = self.sites
            else:
                dbsBlock["Sites"] = self.cric.PNNstoPSNs(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            if self.args['SplittingAlgo'] in self.unsupportedAlgos:
                raise WorkQueueWMSpecError(
                    self.wmspec, 'ACDC is not supported for %s' %
                    self.args['SplittingAlgo'])
            splittingFunc = self.defaultAlgo
            if self.args['SplittingAlgo'] in self.algoMapping:
                splittingFunc = self.algoMapping[self.args['SplittingAlgo']]
            validBlocks = splittingFunc(acdc, acdcInfo, task)

        return validBlocks
Ejemplo n.º 11
0
    def validBlocks(self, task):
        """Return blocks that pass the input data restriction"""
        validBlocks = []
        # TODO take the chunk size from parameter
        chunkSize = 200

        acdcInfo = task.getInputACDC()
        if not acdcInfo:
            raise WorkQueueWMSpecError(
                self.wmspec, 'No acdc section for %s' % task.getPathName())
        acdc = DataCollectionService(acdcInfo["server"], acdcInfo["database"])
        if self.data:
            acdcBlockSplit = ACDCBlock.splitBlockName(self.data.keys()[0])
        else:
            #if self.data is not passed, assume the the data is input dataset
            # from the spec
            acdcBlockSplit = False

        if acdcBlockSplit:
            dbsBlock = {}
            dbsBlock['Name'] = self.data.keys()[0]
            block = acdc.getChunkInfo(
                acdcInfo['collection'],
                acdcBlockSplit['TaskName'],
                acdcBlockSplit['Offset'],
                acdcBlockSplit['NumOfFiles'],
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            dbsBlock['NumberOfFiles'] = block['files']
            dbsBlock['NumberOfEvents'] = block['events']
            dbsBlock['NumberOfLumis'] = block['lumis']
            dbsBlock['ACDC'] = acdcInfo
            dbsBlock["Sites"] = sitesFromStorageEelements(block["locations"])
            validBlocks.append(dbsBlock)
        else:
            acdcBlocks = acdc.chunkFileset(
                acdcInfo['collection'],
                acdcInfo['fileset'],
                chunkSize,
                user=self.wmspec.getOwner().get("name"),
                group=self.wmspec.getOwner().get("group"))
            for block in acdcBlocks:
                dbsBlock = {}
                dbsBlock['Name'] = ACDCBlock.name(self.wmspec.name(),
                                                  acdcInfo["fileset"],
                                                  block['offset'],
                                                  block['files'])
                dbsBlock['NumberOfFiles'] = block['files']
                dbsBlock['NumberOfEvents'] = block['events']
                dbsBlock['NumberOfLumis'] = block['lumis']
                dbsBlock["Sites"] = sitesFromStorageEelements(
                    block["locations"])
                dbsBlock['ACDC'] = acdcInfo
                validBlocks.append(dbsBlock)

        return validBlocks
Ejemplo n.º 12
0
 def newQueueElement(self, **args):
     args.setdefault('Status', 'Available')
     args.setdefault('WMSpec', self.wmspec)
     args.setdefault('Task', self.initialTask)
     args.setdefault('RequestName', self.wmspec.name())
     args.setdefault('TaskName', self.initialTask.name())
     args.setdefault('Dbs', self.initialTask.dbsUrl())
     args.setdefault('SiteWhitelist', self.initialTask.siteWhitelist())
     args.setdefault('SiteBlacklist', self.initialTask.siteBlacklist())
     args.setdefault('EndPolicy', self.wmspec.endPolicyParameters())
     args.setdefault('Priority', self.wmspec.priority())
     if not args['Priority']:
         args['Priority'] = 0
     ele = WorkQueueElement(**args)
     for data, sites in ele['Inputs'].items():
         if not sites:
             raise WorkQueueWMSpecError(
                 self.wmspec, 'Input data has no locations "%s"' % data)
     # catch infinite splitting loops
     if len(self.workQueueElements) > self.args.get('maxRequestSize', 1e8):
         raise WorkQueueWMSpecError(
             self.wmspec, 'Too many elements (%d)' %
             self.args.get('MaxRequestElements', 1e8))
     self.workQueueElements.append(ele)
Ejemplo n.º 13
0
 def __call__(self, wmspec, task, data=None, mask=None, team=None):
     self.wmspec = wmspec
     # bring in spec specific settings
     self.args.update(self.wmspec.startPolicyParameters())
     self.initialTask = task
     if data:
         self.data = data
     self.mask = mask
     self.validate()
     try:
         self.split()
     # For known exceptions raise custom error that will fail the workflow.
     except DbsConfigurationError, ex:
         # A dbs configuration error implies the spec is invalid
         error = WorkQueueWMSpecError(self.wmspec,
                                      "DBS config error: %s" % str(ex))
         raise error
Ejemplo n.º 14
0
 def validate(self):
     """Check args and spec work with block splitting"""
     StartPolicyInterface.validateCommon(self)
     if not self.initialTask.inputDataset():
         raise WorkQueueWMSpecError(self.wmspec, 'No input dataset')
Ejemplo n.º 15
0
    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        try:
            workLoads = self.getAvailableRequests()
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check its not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
                self.logdb.delete(reqName,
                                  "error",
                                  this_thread=True,
                                  agent=False)
            except TERMINAL_EXCEPTIONS as ex:
                # fatal error - report back to ReqMgr
                self.logger.error(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

            self.logger.info("%s element(s) obtained from RequestManager" %
                             work)
        return work
Ejemplo n.º 16
0
    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info(
                'Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests(queue.params['Teams'])
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            #            try:
            #                self.reportRequestStatus(reqName, "negotiating")
            #            except Exception, ex:
            #                self.logger.error("""
            #                    Unable to update ReqMgr state to negotiating: %s
            #                    Ignoring this request: %s""" % (str(ex), reqName))
            #                continue

            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check its not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
                self.logdb.delete(reqName, "error", this_thread=True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - report back to ReqMgr
                self.logger.info(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError,
                    CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' \
                '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' \
                '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            try:
                self.reportRequestStatus(reqName, "acquired")
            except Exception as ex:
                self.logger.warning("Unable to update ReqMgr state: %s" %
                                    str(ex))
                self.logger.warning('Will try again later')

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

            self.logger.info("%s element(s) obtained from RequestManager" %
                             work)
        return work
Ejemplo n.º 17
0
    def split(self):
        """Apply policy to spec"""
        # if not specified take standard defaults
        self.args.setdefault('SliceType', 'NumberOfEvents')
        self.args.setdefault('SliceSize', 1000)  # events per job
        self.args.setdefault('SubSliceType', 'NumberOfEventsPerLumi')
        self.args.setdefault('SubSliceSize',
                             self.args['SliceSize'])  # events per lumi
        self.args.setdefault('MaxJobsPerElement', 1000)  # jobs per WQE
        self.args.setdefault('MaxLumisPerElement',
                             os.environ.get('MAX_LUMIS_PER_WQE'))
        self.args.setdefault(
            'blowupFactor',
            1.0)  # Estimate of additional jobs following tasks.
        # Total WQE tasks will be Jobs*(1+blowupFactor)
        noInputUpdate = self.initialTask.getTrustSitelists().get('trustlists')
        noPileupUpdate = self.initialTask.getTrustSitelists().get(
            'trustPUlists')

        if not self.mask:
            self.mask = Mask(FirstRun=1,
                             FirstLumi=self.initialTask.getFirstLumi(),
                             FirstEvent=self.initialTask.getFirstEvent(),
                             LastRun=1,
                             LastEvent=self.initialTask.getFirstEvent() +
                             self.initialTask.totalEvents() - 1)
        mask = Mask(**self.mask)

        #First let's initialize some parameters
        stepSize = int(self.args['SliceSize']) * int(
            self.args['MaxJobsPerElement'])
        total = mask['LastEvent'] - mask['FirstEvent'] + 1
        lastAllowedEvent = mask['LastEvent']
        eventsAccounted = 0

        while eventsAccounted < total:
            current = mask['FirstEvent'] + stepSize - 1  # inclusive range
            if current > lastAllowedEvent:
                current = lastAllowedEvent
            mask['LastEvent'] = current

            #Calculate the job splitting without actually doing it
            # number of lumis is calculated by events number and SubSliceSize which is events per lumi
            # So if there no exact division between events per job and events per lumi
            # it takes the ceiling of the value.
            # Therefore total lumis can't be calculated from total events / SubSliceSize
            # It has to be caluated by adding the lumis_per_job * number of jobs
            nEvents = mask['LastEvent'] - mask['FirstEvent'] + 1
            lumis_per_job = ceil(self.args['SliceSize'] /
                                 self.args['SubSliceSize'])
            nLumis = floor(nEvents / self.args['SliceSize']) * lumis_per_job
            remainingLumis = ceil(nEvents % self.args['SliceSize'] /
                                  self.args['SubSliceSize'])
            nLumis += remainingLumis
            jobs = ceil(nEvents / self.args['SliceSize'])

            if self.args['MaxLumisPerElement'] and nLumis > int(
                    self.args['MaxLumisPerElement']):
                raise WorkQueueWMSpecError(
                    self.wmspec, "Too many lumis in WQE: %s" % nLumis)

            mask['LastLumi'] = mask['FirstLumi'] + int(
                nLumis) - 1  # inclusive range
            self.newQueueElement(WMSpec=self.wmspec,
                                 NumberOfLumis=nLumis,
                                 NumberOfEvents=nEvents,
                                 Jobs=jobs,
                                 Mask=copy(mask),
                                 NoInputUpdate=noInputUpdate,
                                 NoPileupUpdate=noPileupUpdate,
                                 blowupFactor=self.args['blowupFactor'])

            if mask['LastEvent'] > (2**32 - 1):
                #This is getting tricky, to ensure consecutive
                #events numbers we must calculate where the jobSplitter
                #will restart the firstEvent to 1 for the last time
                #in the newly created unit
                internalEvents = mask['FirstEvent']
                accumulatedEvents = internalEvents
                breakPoint = internalEvents

                while accumulatedEvents < mask['LastEvent']:
                    if (internalEvents + self.args['SliceSize'] - 1) > (2**32 -
                                                                        1):
                        internalEvents = 1
                        breakPoint = accumulatedEvents
                    else:
                        internalEvents += self.args['SliceSize']
                        accumulatedEvents += self.args['SliceSize']

                leftoverEvents = mask['LastEvent'] - breakPoint + 1
                mask['FirstEvent'] = leftoverEvents + 1

            else:
                mask['FirstEvent'] = mask['LastEvent'] + 1

            mask['FirstLumi'] = mask['LastLumi'] + 1
            eventsAccounted += stepSize
            lastAllowedEvent = (total -
                                eventsAccounted) + mask['FirstEvent'] - 1
Ejemplo n.º 18
0
class WorkQueueReqMgrInterface():
    """Helper class for ReqMgr interaction"""
    def __init__(self, **kwargs):
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        self.reqMgr = RequestManager(kwargs)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join(
                [x['RequestName'] for x in uptodate_elements])
        except:
            self.logger.exception("Error caught during RequestManager update")
        else:
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info(
                'Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests(*queue.params['Teams'])
        except Exception, ex:
            msg = "Error contacting RequestManager: %s" % str(ex)
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            #            try:
            #                self.reportRequestStatus(reqName, "negotiating")
            #            except Exception, ex:
            #                self.logger.error("""
            #                    Unable to update ReqMgr state to negotiating: %s
            #                    Ignoring this request: %s""" % (str(ex), reqName))
            #                continue

            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check its not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None,
                            "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" %
                                 (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl,
                                        request=reqName,
                                        team=team)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError), ex:
                # fatal error - report back to ReqMgr
                self.logger.info(
                    'Permanent failure processing request "%s": %s' %
                    (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" %
                                 reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
Ejemplo n.º 19
0
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteWhitelist()
                ]
            except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site whitelist validation error: %s" % str(ex))
                raise error

        if self.initialTask.siteBlacklist():
            if type(self.initialTask.siteBlacklist()) in types.StringTypes:
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    'Invalid site blacklist: Must be tuple/list but is %s' %
                    type(self.initialTask.siteBlacklist()))
                raise error
            try:
                [
                    Lexicon.cmsname(site)
                    for site in self.initialTask.siteBlacklist()
                ]
            except Exception, ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                error = WorkQueueWMSpecError(
                    self.wmspec,
                    "Site blacklist validation error: %s" % str(ex))
                raise error

        # splitter settings
        if self.args.get('SliceSize', 1) <= 0: