class AuxCacheUpdateTasks(CherryPyPeriodicTask):
    """
    Updates Aux db update periodically. (i.e. TagCollector)
    """
    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which
        """
        self.concurrentTasks = [{
            'func': self.updateCMSSW,
            'duration': config.tagCollectDuration
        }]

    def updateCMSSW(self, config):
        """
        gather active data statistics
        """

        self.reqmgrAux.populateCMSSWVersion(config.tagcollect_url,
                                            **config.tagcollect_args)
Beispiel #2
0
    def __init__(self, msConfig, logger=None):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param logger: logger object (optional)
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), logger)
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                   httpDict={'cacheduration': 1.0},
                                   logger=self.logger)

        # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
        dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        if usingRucio():
            # FIXME: we cannot use Rucio in write mode yet
            # self.rucio = Rucio(self.msConfig['rucioAccount'], configDict={"logger": self.logger})
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
        else:
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
Beispiel #3
0
    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s", self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0}, logger=self.logger)

        self.phedex = None
        self.rucio = None
        if not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={"logger": self.logger, "user_agent": "wmcore-microservices"})
    def __init__(self, rest, config):

        super(CouchDBCleanup, self).__init__(config)
        self.reqDB = RequestDBReader(config.reqmgrdb_url)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        # statuses that we want to keep the transfer documents
        self.transferStatuses = [
            "assigned", "staging", "staged", "acquired", "failed",
            "running-open", "running-closed"
        ]

        baseURL, acdcDB = splitCouchServiceURL(config.acdc_url)
        self.acdcService = CouchService(url=baseURL, database=acdcDB)
Beispiel #5
0
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
Beispiel #6
0
    def __init__(self, rest, config):

        super(BuildParentLock, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        self.dbs = DBS3Reader(config.dbs_url)
        # cache of dbs lookups mapping input dataset to parent dataset
        self.dbsLookupCache = {}
        # set of of currently active datasets requiring parent dataset
        self.inputDatasetCache = set()
        self.reqDB = RequestDBReader(config.reqmgrdb_url)
        self.filterKeys = [
            'assignment-approved', 'assigned', 'staging', 'staged', 'failed',
            'acquired', 'running-open', 'running-closed', 'force-complete',
            'completed', 'closed-out'
        ]
Beispiel #7
0
    def __init__(self, microConfig, uniConfig, logger=None):
        """
        Runs the basic setup and initialization for the MS Transferor module
        :param microConfig: microservice configuration
        """
        self.msConfig = microConfig
        self.uConfig = uniConfig
        self.reqRecords = []
        self.logger = getMSLogger(microConfig['verbose'], logger=logger)

        self.reqmgr2 = ReqMgr(microConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(microConfig['reqmgrUrl'], httpDict={'cacheduration': 60}, logger=self.logger)
        # eventually will change it to Rucio
        self.phedex = PhEDEx(httpDict={'cacheduration': 10 * 60},
                             dbsUrl=microConfig['dbsUrl'], logger=self.logger)
Beispiel #8
0
 def __init__(self, config):
     """
     initialize properties specified from config
     """
     BaseWorkerThread.__init__(self)
     self.config = config
     self.drainAPI = DrainStatusAPI(config)
     self.condorAPI = PyCondorAPI()
     self.agentConfig = {}
     self.previousConfig = {}
     self.validSpeedDrainConfigKeys = [
         'CondorPriority', 'NoJobRetries', 'EnableAllSites'
     ]
     self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
     self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
     self.condorStates = ("Running", "Idle")
Beispiel #9
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
            self.maxRetries = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName).get("MaxRetries")

        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        return
Beispiel #10
0
class AuxCacheUpdateTasks(CherryPyPeriodicTask):
    """
    Updates Aux db update periodically. (i.e. TagCollector)
    """
    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        self.mgr = RequestHandler()

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which
        """
        self.concurrentTasks = [{
            'func': self.updateAuxiliarDocs,
            'duration': config.tagCollectDuration
        }]

    def updateAuxiliarDocs(self, config):
        """
        Update the central couch database with auxiliary documents
        that need to be constanly updated whenever an update is
        made at the data source
        """
        self.logger.info("Updating auxiliary couch documents ...")

        self.reqmgrAux.populateCMSSWVersion(config.tagcollect_url,
                                            **config.tagcollect_args)

        try:
            data = self.mgr.getdata(config.unified_url,
                                    params={},
                                    headers={'Accept': 'application/json'})
            data = json.loads(data)
        except Exception as ex:
            msg = "Failed to retrieve unified configuration from github. Error: %s" % str(
                ex)
            msg += "\nRetrying again in the next cycle"
            self.logger.error(msg)
            return

        self.reqmgrAux.updateUnifiedConfig(data, docName="config")
Beispiel #11
0
    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
            skipPhEDEx: boolean to skip PhEDEx initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False),
                                  kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'],
                                  logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0},
                                       logger=self.logger)

        self.phedex = None
        self.rucio = None
        if self.msConfig.get('useRucio',
                             False) and not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={
                                   "logger": self.logger,
                                   "user_agent": "wmcore-microservices"
                               })
        elif not kwargs.get("skipPhEDEx", False):
            # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
            dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
Beispiel #12
0
    def __init__(self, config=None, logger=None):
        """
        Setup a bunch of things, like:
         * logger for this service
         * initialize all the necessary service helpers
         * fetch the unified configuration from central couch
         * update the unified configuration with some deployment and default settings
         * start both transfer and monitor threads
        :param config: reqmgr2ms service configuration
        :param logger:
        """
        self.uConfig = {}
        self.config = config
        self.logger = getMSLogger(getattr(config, 'verbose', False), logger)
        self._parseConfig(config)
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgrUrl'],
                                   httpDict={'cacheduration': 60},
                                   logger=self.logger)

        # transferor has to look at workflows in assigned status
        self.msTransferor = MSTransferor(self.msConfig,
                                         "assigned",
                                         logger=self.logger)

        ### Last but not least, get the threads started
        thname = 'MSTransferor'
        self.transfThread = start_new_thread(
            thname, daemon, (self.transferor, 'assigned',
                             self.msConfig['interval'], self.logger))
        self.logger.debug("### Running %s thread %s", thname,
                          self.transfThread.running())

        thname = 'MSTransferorMonit'
        self.monitThread = start_new_thread(
            thname, daemon, (self.monitor, 'staging',
                             self.msConfig['interval'] * 2, self.logger))
        self.logger.debug("+++ Running %s thread %s", thname,
                          self.monitThread.running())
Beispiel #13
0
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
class AuxCacheUpdateTasks(CherryPyPeriodicTask):
    """
    Updates Aux db update periodically. (i.e. TagCollector)
    """

    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which
        """
        self.concurrentTasks = [{'func': self.updateCMSSW, 'duration': config.tagCollectDuration}]

    def updateCMSSW(self, config):
        """
        gather active data statistics
        """
        self.reqmgrAux.populateCMSSWVersion(config.tagcollect_url, **config.tagcollect_args)
        self.logger.info("Updated CMSSW versions in the auxiliar db")
Beispiel #15
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)

        self.setupComponentParam()

        return
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    verbose = int(opts.verbose)
    logger = None
    mgr = None
    if verbose:
        logger = logging.getLogger('parse_campaign')
        logger.setLevel(logging.DEBUG)
        logging.basicConfig()
    if opts.url:
        key = os.getenv('X509_USER_KEY', '')
        cert = os.getenv('X509_USER_CERT', '')
        proxy = os.getenv('X509_USER_PROXY', '')
        if proxy and not cert:
            cert = proxy
            key = proxy
        hdict = {'cert': cert, 'key': key, 'pycurl': True}
        mgr = ReqMgrAux(opts.url, hdict, logger=logger)
    if opts.dburi:
        conn = MongoClient(host=opts.dburi)
        dbname = opts.dbname
        dbcoll = opts.dbcoll
        if verbose:
            print("### read data from '%s', %s/%s" %
                  (opts.dburi, dbname, dbcoll))
        data = [r for r in conn[dbname][dbcoll].find()]
    else:
        fin = opts.fin
        if verbose:
            print("### read data from '%s'" % fin)
        with open(fin, 'r') as istream:
            data = []
            for key, val in json.load(istream).items():
                rec = {'name': key}
                rec.update(val)
                data.append(rec)
    rawRecords = parse(data, verbose)

    output = []  # in case we want to dump all records to a json file
    for rec in process(rawRecords):
        output.append(rec)
        print(json.dumps(rec))
        upload(mgr, rec)
    if opts.fout:
        print("Saving all %d unique campaign records to: %s\n" %
              (len(output), opts.fout))
        with open(opts.fout, "w") as jo:
            json.dump(output, jo, indent=2)
Beispiel #17
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
            self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries")

        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        return
Beispiel #18
0
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        self.setupComponentParam()

        return
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    verbose = int(opts.verbose)
    logger = None
    mgr = None
    inputWMCore = False
    if verbose:
        logger = logging.getLogger('parse_campaign')
        logger.setLevel(logging.DEBUG)
        logging.basicConfig()
    if opts.url:
        key = os.getenv('X509_USER_KEY', '')
        cert = os.getenv('X509_USER_CERT', '')
        proxy = os.getenv('X509_USER_PROXY', '')
        if proxy and not cert:
            cert = proxy
            key = proxy
        hdict = {'cert': cert, 'key': key, 'pycurl': True}
        mgr = ReqMgrAux(opts.url, hdict, logger=logger)
    if opts.dburi:
        conn = MongoClient(host=opts.dburi)
        dbname = opts.dbname
        dbcoll = opts.dbcoll
        if verbose:
            print("### read data from '%s', %s/%s" %
                  (opts.dburi, dbname, dbcoll))
        data = [r for r in conn[dbname][dbcoll].find()]
    elif opts.fin:
        fin = opts.fin
        if verbose:
            print("### read data from '%s'" % fin)
        data = []
        with open(fin, 'r') as istream:
            campData = json.load(istream)
            if isinstance(campData, dict):
                # then it's a Unified-like campaign schema
                for key, val in campData.items():
                    rec = {'name': key}
                    rec.update(val)
                    data.append(rec)
            elif isinstance(campData, list):
                # then the input file has WMCore-like campaign schema
                print("Found %d campaigns in the input file." % len(campData))
                data = campData
                inputWMCore = True
    if not inputWMCore:
        data = parse(data, verbose)

    output = []  # in case we want to dump all records to a json file
    for rec in process(data):
        output.append(rec)
        upload(mgr, rec)
    if opts.testcamp:
        insertTestCampaigns(mgr)
    if opts.fout:
        print("Saving all %d unique campaign records to: %s\n" %
              (len(output), opts.fout))
        with open(opts.fout, "w") as jo:
            json.dump(output, jo, indent=2)
Beispiel #20
0
class MSCore(object):
    """
    This class provides core functionality for
    MSTransferor, MSMonitor, MSOutput. MSRuleCleaner classes.
    """

    def __init__(self, msConfig, **kwargs):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param kwargs: can be used to skip the initialization of specific services, such as:
            logger: logger object
            skipReqMgr: boolean to skip ReqMgr initialization
            skipReqMgrAux: boolean to skip ReqMgrAux initialization
            skipRucio: boolean to skip Rucio initialization
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), kwargs.get("logger"))
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s", self.msConfig)

        if not kwargs.get("skipReqMgr", False):
            self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        if not kwargs.get("skipReqMgrAux", False):
            self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                       httpDict={'cacheduration': 1.0}, logger=self.logger)

        self.phedex = None
        self.rucio = None
        if not kwargs.get("skipRucio", False):
            self.rucio = Rucio(acct=self.msConfig['rucioAccount'],
                               hostUrl=self.msConfig['rucioUrl'],
                               authUrl=self.msConfig['rucioAuthUrl'],
                               configDict={"logger": self.logger, "user_agent": "wmcore-microservices"})

    def unifiedConfig(self):
        """
        Fetches the unified configuration
        :return: unified configuration content
        """
        res = self.reqmgrAux.getUnifiedConfig(docName="config")
        if res:
            if isinstance(res, list):
                return res[0]
            return res
        else:
            return {}

    def change(self, reqName, reqStatus, prefix='###'):
        """
        Update the request status in ReqMgr2
        """
        try:
            if self.msConfig['enableStatusTransition']:
                self.logger.info('%s updating %s status to: %s', prefix, reqName, reqStatus)
                self.reqmgr2.updateRequestStatus(reqName, reqStatus)
            else:
                self.logger.info('DRY-RUN:: %s updating %s status to: %s', prefix, reqName, reqStatus)
        except Exception as err:
            self.logger.exception("Failed to change request status. Error: %s", str(err))

    def updateReportDict(self, reportDict, keyName, value):
        """
        Provided a key name and value, validate the key name
        and update the report dictionary if it passes the validation
        :param reportDict: dictionary with a summary of the service
        :param keyName: string with the key name in the report
        :param value: string/integer value with the content of a metric
        :return: the updated dictionary
        """
        if keyName not in reportDict:
            self.logger.error("Report metric '%s' is not supported", keyName)
        else:
            reportDict[keyName] = value
        return reportDict
Beispiel #21
0
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(
            self.config.Agent.hostName)

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: " +
                             str(DrainStatusPoller.drainStats))

            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info(
                "Agent not in drain mode. Resetting flags and skipping drain check..."
            )
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info(
                "Bumping condor job priority to 999999 for Production/Processing pending jobs."
            )
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info(
                "Enabling NoJobRetries flag: Error Handler won't retry the jobs"
            )
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info(
                "Enabling EnableAllSites flag: Updating agent to submit to all sites."
            )
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainConfig",
                                            speedDrainConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainiConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainConfig",
                                            speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need updated for speed draining
        """
        enableKeys = []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning(
                "There was an error querying the schedd.  Not checking speed drain thresholds."
            )
            return []

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(
                    v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info(
                            "Agent will update speed drain configuration for %s. ",
                            k)
                        enableKeys.append(k)
            else:
                logging.warning(
                    "Speed drain configuration error for %s.  Please check aux db contents.",
                    k)

        return enableKeys
Beispiel #22
0
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            # now collect drain statistics
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: " + str(DrainStatusPoller.drainStats))

            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainiConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need updated for speed draining
        """
        enableKeys = []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd.  Not checking speed drain thresholds.")
            return []

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info("Agent will update speed drain configuration for %s. ", k)
                        enableKeys.append(k)
            else:
                logging.warning("Speed drain configuration error for %s.  Please check aux db contents.", k)

        return enableKeys
Beispiel #23
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return
Beispiel #24
0
class MSTransferor(object):
    def __init__(self, microConfig, status, logger=None):
        """
        Runs the basic setup and initialization for the MS Transferor module
        :param microConfig: microservice configuration
        """
        self.msConfig = microConfig
        self.status = status
        self.uConfig = {}
        self.logger = getMSLogger(microConfig['verbose'], logger=logger)
        self.reqmgr2 = ReqMgr(microConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(microConfig['reqmgrUrl'],
                                   httpDict={'cacheduration': 60},
                                   logger=self.logger)
        # eventually will change it to Rucio
        self.phedex = PhEDEx(httpDict={'cacheduration': 10 * 60},
                             dbsUrl=microConfig['dbsUrl'],
                             logger=self.logger)

    def prep(self):
        """
        Runs any preparation tasks before executing the actual algorithm.
        For now:
         * fetches the unified configuration
        :return: False if it fail to run this action, otherwise True
        """
        self.uConfig = self.reqmgrAux.getUnifiedConfig(docName="config")
        return bool(self.uConfig)

    def execute(self):
        """
        Executes the whole transferor logic
        :param reqStatus: request status to that matters for this module
        :return:
        """
        if not self.prep():
            self.logger.warning(
                "Failed to fetch the latest unified config. Skipping this cycle"
            )
            return
        self.uConfig = self.uConfig[0]

        requestRecords = []
        try:
            # get requests from ReqMgr2 data-service for given status
            requests = self.reqmgr2.getRequestByStatus([self.status],
                                                       detail=True)
            if requests:
                requests = requests[0]
            self.logger.info("### transferor found %s requests in '%s' state",
                             len(requests), self.status)
            if requests:
                for _, wfData in requests.iteritems():
                    requestRecords.append(self.requestRecord(wfData))
        except Exception as err:
            self.logger.exception('### transferor error: %s', str(err))

        if not requestRecords:
            return

        try:
            reqInfo = RequestInfo(self.msConfig, self.uConfig, self.logger)
            for reqSlice in grouper(requestRecords, 50):
                reqResults = reqInfo(reqSlice)
                self.logger.info("%d requests completely processed.",
                                 len(reqResults))
                self.logger.info(
                    "Working on the data subscription and status change...")
                # process all requests
                for req in reqResults:
                    reqName = req['name']
                    # perform transfer
                    tid = self.transferRequest(req)
                    if tid:
                        # Once all transfer requests were successfully made, update: assigned -> staging
                        self.logger.debug(
                            "### transfer request for %s successfull", reqName)
                        self.change(req, 'staging', '### transferor')
                        # if there is nothing to be transferred (no input at all),
                        # then update the request status once again staging -> staged
                        # self.change(req, 'staged', '### transferor')
        except Exception as err:  # general error
            self.logger.exception('### transferor error: %s', str(err))

    def post(self):
        """
        Runs any post tasks before exiting the execution cycle
        :return:
        """
        pass

    def requestRecord(self, wfData):
        """
        Selects only important information for a request dictionary

        Returns: a dictionary
        """
        datasets = []
        if "TaskChain" in wfData or "StepChain" in wfData:
            innerDicts = []
            for i in range(
                    1,
                    wfData.get("TaskChain", wfData.get("StepChain")) + 1):
                innerDicts.append(
                    wfData.get("Task%d" % i, wfData.get("Step%d" % i)))
        else:
            # ReReco and DQMHarvesting
            innerDicts = [wfData]
        for item in innerDicts:
            for key in ['InputDataset', 'MCPileup', 'DataPileup']:
                dataset = item.get(key)
                if dataset:
                    datasets.append({'type': key, 'name': dataset})

        return {
            'name': wfData.get('RequestName'),
            'reqStatus': wfData.get('RequestStatus'),
            'SiteWhiteList': wfData.get('SiteWhitelist', []),
            'SiteBlackList': wfData.get('SiteBlacklist', []),
            'datasets': datasets,
            'campaign': []
        }

    def transferRequest(self, req):
        "Send request to Phedex and return status of request subscription"
        datasets = req.get('datasets', [])
        sites = req.get('sites', [])
        if datasets and sites:
            self.logger.debug("### creating subscription for: %s",
                              pformat(req))
            subscription = PhEDExSubscription(datasets, sites,
                                              self.msConfig['group'])
            # TODO: implement how to get transfer id
            tid = hashlib.md5(str(subscription)).hexdigest()
            # TODO: when ready enable submit subscription step
            # self.phedex.subscribe(subscription)
            return tid

    def change(self, req, reqStatus, prefix='###'):
        """
        Change request status, internally it is done via PUT request to ReqMgr2:
        curl -X PUT -H "Content-Type: application/json" \
             -d '{"RequestStatus":"staging", "RequestName":"bla-bla"}' \
             https://xxx.yyy.zz/reqmgr2/data/request
        """
        self.logger.debug('%s updating %s status to %s', prefix, req['name'],
                          reqStatus)
        try:
            if not self.msConfig['readOnly']:
                self.reqmgr2.updateRequestStatus(req['name'], reqStatus)
        except Exception as err:
            self.logger.exception("Failed to change request status. Error: %s",
                                  str(err))
    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
Beispiel #26
0
class CouchDBCleanup(CherryPyPeriodicTask):
    def __init__(self, rest, config):

        super(CouchDBCleanup, self).__init__(config)
        self.reqDB = RequestDBReader(config.reqmgrdb_url)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        # statuses that we want to keep the transfer documents
        self.transferStatuses = [
            "assigned", "staging", "staged", "acquired", "failed",
            "running-open", "running-closed"
        ]

        baseURL, acdcDB = splitCouchServiceURL(config.acdc_url)
        self.acdcService = CouchService(url=baseURL, database=acdcDB)

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which
        """
        self.concurrentTasks = [{
            'func': self.acdcCleanup,
            'duration': config.acdcCleanDuration
        }, {
            'func': self.auxCouchCleanup,
            'duration': config.auxCleanDuration
        }]

    def auxCouchCleanup(self, config):
        """
        Cleanup TRANSFER documents from the reqmgr_auxiliary CouchDB.
        The list of status can be expanded in the future
        """
        self.logger.info("Fetching TRANSFER documents from CouchDB...")

        transferDocs = self.reqmgrAux.getTransferInfo("ALL_DOCS")
        if not transferDocs:
            self.logger.info(
                "  there are no transfer documents in the database.")
            return
        auxDocs = []
        for row in transferDocs:
            auxDocs.append(row['workflowName'])

        results = self.reqDB._getCouchView("bystatus", {},
                                           self.transferStatuses)
        activeRequests = []
        for row in results["rows"]:
            activeRequests.append(row["id"])

        # now find transfer docs that are not active in the system
        transferDocs = []
        for transferDoc in auxDocs:
            if transferDoc not in activeRequests:
                transferDocs.append(transferDoc)
        self.logger.info("Found %d transfer documents to delete",
                         len(transferDocs))

        for wflowName in transferDocs:
            self.logger.info("Deleting transfer document: %s", wflowName)
            try:
                self.reqmgrAux.deleteConfigDoc("transferinfo", wflowName)
            except Exception as exc:
                self.logger.warning(
                    "Failed to delete transfer doc: %s. Error: %s", wflowName,
                    str(exc))
        self.logger.info("Transfer documents cleanup completed.")

    def acdcCleanup(self, config):
        """
        gather active data statistics
        """
        self.logger.info("Fetching ACDC collection names...")
        originalRequests = self.acdcService.listCollectionNames()
        if not originalRequests:
            self.logger.info("  there are no collection documents to delete.")
            return

        # filter requests
        results = self.reqDB._getCouchView("byrequest", {}, originalRequests)
        # filter requests only in the following status
        deleteStates = [
            "announced", "rejected-archived", "aborted-archived",
            "normal-archived"
        ]
        filteredRequests = []
        for row in results["rows"]:
            if row["value"][0] in deleteStates:
                filteredRequests.append(row["key"])

        total = 0
        for req in filteredRequests:
            try:
                self.logger.info("Removing ACDC collection for: %s", req)
                deleted = self.acdcService.removeFilesetsByCollectionName(req)
                if deleted is None:
                    self.logger.warning("  request '%s' already deleted", req)
                else:
                    total += len(deleted)
                    self.logger.info("request %s deleted", req)
            except Exception as ex:
                self.logger.error(
                    "Failed to delete request: %s, will try again later. Error: %s",
                    req, str(ex))
        self.logger.info("total %s requests deleted", total)
        return
    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url)
Beispiel #28
0
class RequestInfo(object):
    def __init__(self, microConfig, uniConfig, logger=None):
        """
        Runs the basic setup and initialization for the MS Transferor module
        :param microConfig: microservice configuration
        """
        self.msConfig = microConfig
        self.uConfig = uniConfig
        self.reqRecords = []
        self.logger = getMSLogger(microConfig['verbose'], logger=logger)

        self.reqmgr2 = ReqMgr(microConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(microConfig['reqmgrUrl'], httpDict={'cacheduration': 60}, logger=self.logger)
        # eventually will change it to Rucio
        self.phedex = PhEDEx(httpDict={'cacheduration': 10 * 60},
                             dbsUrl=microConfig['dbsUrl'], logger=self.logger)

    def __call__(self, reqRecords):
        """
        Run the unified transferor box
        :param args:
        :param kwargs:
        :return:
        """
        self.reqRecords = reqRecords
        self.logger.info("Going to process %d requests.", len(self.reqRecords))

        # get complete requests information (based on Unified Transferor logic)
        self.requestsInfo()
        requestsToProcess = self.unified()

        return requestsToProcess

    def requestsInfo(self):
        """
        Helper function to get information about all requests
        """
        # get campaigns for all requests which will be used to decide
        # how many replicas have to be made and where data has to be subscribed to
        # FIXME: it looks like we don't fetch all the possible campaigns in a given request
        for req in self.reqRecords:
            reqName = req['name']
            for wflow in getWorkflow(reqName, self.msConfig['reqmgrUrl']):
                campaign = wflow[reqName]['Campaign']
                self.logger.debug("request: %s, campaign: %s", reqName, campaign)
                campaignConfig = self.reqmgrAux.getCampaignConfig(campaign)
                self.logger.debug("request: %s, campaignConfig: %s", reqName, campaignConfig)
                if not campaignConfig:
                    # we skip and create alert
                    msg = 'No campagin configuration found for %s' % reqName
                    msg += ', skip transferor step ...'
                    self.logger.warning(msg)
                    continue
                self.reqRecords['campaign'].append(campaignConfig)

    def unified(self):
        """
        Unified Transferor black box
        """
        # get aux info for dataset/blocks from inputs/parents/pileups
        # make subscriptions based on site white/black lists
        self.logger.info("unified processing %d requests", len(self.reqRecords))

        requests = [r['name'] for r in self.reqRecords]

        ### TODO: the logic below shows original unified port and it should be
        ###       revisited wrt new proposal specs and unified codebase

        # get workflows from list of requests
        orig = time.time()
        time0 = time.time()
        requestWorkflows = self._getRequestWorkflows(requests)
        workflows = requestWorkflows.values()
        self.logger.debug(elapsedTime(time0, "### getWorkflows"))

        # get workflows info summaries and collect datasets we need to process
        winfo = workflowsInfo(workflows)
        datasets = [d for row in winfo.values() for d in row['datasets']]

        # find dataset info
        time0 = time.time()
        datasetBlocks, datasetSizes = dbsInfo(datasets, self.msConfig['dbsUrl'])
        self.logger.debug(elapsedTime(time0, "### dbsInfo"))

        # find block nodes information for our datasets
        time0 = time.time()
        blockNodes = phedexInfo(datasets, self.msConfig['phedexUrl'])
        self.logger.debug(elapsedTime(time0, "### phedexInfo"))

        # find events-lumis info for our datasets
        time0 = time.time()
        eventsLumis = eventsLumisInfo(datasets, self.msConfig['dbsUrl'])
        self.logger.debug(elapsedTime(time0, "### eventsLumisInfo"))

        # get specs for all requests and re-use them later in getSiteWhiteList as cache
        requests = [v['RequestName'] for w in workflows for v in w.values()]
        reqSpecs = self._getRequestSpecs(requests)

        # get siteInfo instance once and re-use it later, it is time-consumed object
        siteInfo = SiteInfo(self.uConfig)

        requestsToProcess = []
        tst0 = time.time()
        totBlocks = totEvents = totSize = totCpuT = 0
        for wflow in workflows:
            for wname, wspec in wflow.items():
                time0 = time.time()
                cput = getComputingTime(wspec, eventsLumis=eventsLumis, dbsUrl=self.msConfig['dbsUrl'], logger=self.logger)
                ncopies = getNCopies(cput)

                attrs = winfo[wname]
                ndatasets = len(attrs['datasets'])
                npileups = len(attrs['pileups'])
                nblocks = nevts = nlumis = size = 0
                nodes = set()
                for dataset in attrs['datasets']:
                    blocks = datasetBlocks[dataset]
                    for blk in blocks:
                        for node in blockNodes.get(blk, []):
                            nodes.add(node)
                    nblocks += len(blocks)
                    size += datasetSizes[dataset]
                    edata = eventsLumis.get(dataset, {'num_event': 0, 'num_lumi': 0})
                    nevts += edata['num_event']
                    nlumis += edata['num_lumi']
                totBlocks += nblocks
                totEvents += nevts
                totSize += size
                totCpuT += cput
                sites = json.dumps(sorted(list(nodes)))
                self.logger.debug("### %s", wname)
                self.logger.debug(
                    "%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                    ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)
                # find out which site can serve given workflow request
                t0 = time.time()
                lheInput, primary, parent, secondary, allowedSites \
                    = self._getSiteWhiteList(wspec, siteInfo, reqSpecs)
                if not isinstance(primary, list):
                    primary = [primary]
                if not isinstance(secondary, list):
                    secondary = [secondary]
                wflowDatasets = primary + secondary
                wflowDatasetsBlocks = []
                for dset in wflowDatasets:
                    for item in datasetBlocks.get(dset, []):
                        wflowDatasetsBlocks.append(item)
                rdict = dict(name=wname, datasets=wflowDatasets,
                             blocks=wflowDatasetsBlocks,
                             npileups=npileups, size=size,
                             nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies,
                             sites=sites, allowedSites=allowedSites, parent=parent,
                             lheInput=lheInput, primary=primary, secondary=secondary)
                requestsToProcess.append(rdict)
                self.logger.debug(elapsedTime(t0, "### getSiteWhiteList"))
        self.logger.debug("total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
                          len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize, teraBytes(totSize), totCpuT)
        self.logger.debug(elapsedTime(tst0, '### workflows info'))
        self.logger.debug(elapsedTime(orig, '### total time'))

        return requestsToProcess

    def _getRequestWorkflows(self, requestNames):
        "Helper function to get all specs for given set of request names"
        urls = [str('%s/data/request/%s' % (self.msConfig['reqmgrUrl'], r)) for r in requestNames]
        self.logger.debug("getRequestWorkflows")
        for u in urls:
            self.logger.debug("url %s", u)
        data = multi_getdata(urls, ckey(), cert())
        rdict = {}
        for row in data:
            req = row['url'].split('/')[-1]
            try:
                data = json.loads(row['data'])
                rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
            except Exception as exp:
                self.logger.error("fail to process row %s", row)
                self.logger.exception("fail to load data as json record, error=%s", str(exp))
        return rdict

    def _getRequestSpecs(self, requestNames):
        "Helper function to get all specs for given set of request names"
        urls = [str('%s/%s/spec' % (self.msConfig['reqmgrCacheUrl'], r)) for r in requestNames]
        data = multi_getdata(urls, ckey(), cert())
        rdict = {}
        for row in data:
            req = row['url'].split('/')[-2]
            rdict[req] = pickle.loads(row['data'])
        return rdict

    def _getSiteWhiteList(self, request, siteInfo, reqSpecs=None, pickone=False):
        "Return site list for given request"
        lheinput, primary, parent, secondary = getIO(request, self.msConfig['dbsUrl'])
        allowedSites = []
        if lheinput:
            allowedSites = sorted(siteInfo.sites_eos)
        elif secondary:
            if self.heavyRead(request):
                allowedSites = sorted(set(siteInfo.sites_T1s + siteInfo.sites_with_goodIO))
            else:
                allowedSites = sorted(set(siteInfo.sites_T1s + siteInfo.sites_with_goodAAA))
        elif primary:
            allowedSites = sorted(set(siteInfo.sites_T1s + siteInfo.sites_T2s + siteInfo.sites_T3s))
        else:
            # no input at all all site should contribute
            allowedSites = sorted(set(siteInfo.sites_T2s + siteInfo.sites_T1s + siteInfo.sites_T3s))
        if pickone:
            allowedSites = sorted([siteInfo.pick_CE(allowedSites)])

        # do further restrictions based on memory
        # do further restrictions based on blow-up factor
        minChildJobPerEvent, rootJobPerEvent, blowUp = self._getBlowupFactors(request, reqSpecs=reqSpecs)
        maxBlowUp, neededCores = self.uConfig.get('blow_up_limits', (0, 0))
        if blowUp > maxBlowUp:
            # then restrict to only sites with >4k slots
            newAllowedSites = list(set(allowedSites) &
                                   set([site for site in allowedSites
                                        if siteInfo.cpu_pledges[site] > neededCores]))
            if newAllowedSites:
                allowedSites = newAllowedSites
                msg = "restricting site white list because of blow-up factor: "
                msg += 'minChildJobPerEvent=%s ' % minChildJobPerEvent
                msg += 'rootJobPerEvent=%s' % rootJobPerEvent
                msg += 'maxBlowUp=%s' % maxBlowUp
                self.logger.debug(msg)

        for campaign in self.getCampaigns(request):
            # for testing purposes add post campaign call
            # res = reqmgrAux.postCampaignConfig(campaign, {'%s_name' % campaign: {"Key1": "Value1"}})
            campaignConfig = self.reqmgrAux.getCampaignConfig(campaign)
            if isinstance(campaignConfig, list):
                campaignConfig = campaignConfig[0]
            campSites = campaignConfig.get('SiteWhitelist', [])
            if campSites:
                msg = "Using site whitelist restriction by campaign=%s " % campaign
                msg += "configuration=%s" % sorted(campSites)
                self.logger.debug(msg)
                allowedSites = list(set(allowedSites) & set(campSites))
                if not allowedSites:
                    allowedSites = list(campSites)

            campBlackList = campaignConfig.get('SiteBlacklist', [])
            if campBlackList:
                self.logger.debug("Reducing the whitelist due to black list in campaign configuration")
                self.logger.debug("Removing %s", campBlackList)
                allowedSites = list(set(allowedSites) - set(campBlackList))

        ncores = self.getMulticore(request)
        memAllowed = siteInfo.sitesByMemory(float(request['Memory']), maxCore=ncores)
        if memAllowed is not None:
            msg = "sites allowing %s " % request['Memory']
            msg += "MB and ncores=%s" % ncores
            msg += "core are %s" % sorted(memAllowed)
            self.logger.debug(msg)
            # mask to sites ready for mcore
            if ncores > 1:
                memAllowed = list(set(memAllowed) & set(siteInfo.sites_mcore_ready))
            allowedSites = list(set(allowedSites) & set(memAllowed))
        return lheinput, list(primary), list(parent), list(secondary), list(sorted(allowedSites))


    def _getBlowupFactors(self, request, reqSpecs=None):
        "Return blowup factors for given request"
        if request['RequestType'] != 'TaskChain':
            return 1., 1., 1.
        minChildJobPerEvent = None
        rootJobPerEvent = None
        maxBlowUp = 0
        splits = self._getSplittings(request, reqSpecs=reqSpecs)
        for item in splits:
            cSize = None
            pSize = None
            task = item['splittingTask']
            for key in ['events_per_job', 'avg_events_per_job']:
                if key in item:
                    cSize = item[key]
            parents = [s for s in splits \
                       if task.startswith(s['splittingTask']) and task != s['splittingTask']]
            if parents:
                for parent in parents:
                    for key in ['events_per_job', 'avg_events_per_job']:
                        if key in parent:
                            pSize = parent[key]
                    if not minChildJobPerEvent or minChildJobPerEvent > cSize:
                        minChildJobPerEvent = cSize
            else:
                rootJobPerEvent = cSize
            if cSize and pSize:
                blowUp = float(pSize) / cSize
                if blowUp > maxBlowUp:
                    maxBlowUp = blowUp
        return minChildJobPerEvent, rootJobPerEvent, maxBlowUp

    def _getSplittings(self, request, reqSpecs=None):
        "Return splittings for given request"
        spl = []
        for task in self.getWorkTasks(request, reqSpecs=reqSpecs):
            tsplit = task.input.splitting
            spl.append({"splittingAlgo": tsplit.algorithm, "splittingTask": task.pathName})
            get_those = ['events_per_lumi', 'events_per_job', 'lumis_per_job',
                         'halt_job_on_file_boundaries', 'job_time_limit',
                         'halt_job_on_file_boundaries_event_aware']
            translate = {'EventAwareLumiBased': [('events_per_job', 'avg_events_per_job')]}
            include = {'EventAwareLumiBased': {'halt_job_on_file_boundaries_event_aware': 'True'},
                       'LumiBased': {'halt_job_on_file_boundaries': 'True'}}
            if tsplit.algorithm in include:
                for key, val in include[tsplit.algorithm].items():
                    spl[-1][key] = val
            for get in get_those:
                if hasattr(tsplit, get):
                    setTo = get
                    if tsplit.algorithm in translate:
                        for src, des in translate[tsplit.algorithm]:
                            if src == get:
                                setTo = des
                                break
                    spl[-1][setTo] = getattr(tsplit, get)
        return spl

    def getWorkTasks(self, request, reqSpecs=None):
        "Return work tasks for given request"
        select = {'taskType': ['Production', 'Processing', 'Skim']}
        allTasks = []
        tasks = self.getSpec(request, reqSpecs).tasks
        for task in tasks.tasklist:
            node = getattr(tasks, task)
            allTasks.extend(self.taskDescending(node, select))
        return allTasks

    def getSpec(self, request, reqSpecs=None):
        "Get request from workload cache"
        if reqSpecs and request['RequestName'] in reqSpecs:
            return reqSpecs[request['RequestName']]
        url = str('%s/%s/spec' % (self.msConfig['reqmgrCacheUrl'], request['RequestName']))
        mgr = RequestHandler()
        data = mgr.getdata(url, params={}, cert=cert(), ckey=ckey())
        return pickle.loads(data)

    def taskDescending(self, node, select=None):
        "Helper function to walk through task nodes in descending order"
        allTasks = []
        if not select:
            allTasks.append(node)
        else:
            for key, value in select.items():
                if (isinstance(value, list) and getattr(node, key) in value) or \
                        (not isinstance(value, list) and getattr(node, key) == value):
                    allTasks.append(node)
                    break

        for child in node.tree.childNames:
            chItem = getattr(node.tree.children, child)
            allTasks.extend(self.taskDescending(chItem, select))
        return allTasks

    def getCampaigns(self, request):
        "Return campaigns of given request"
        if 'Chain' in request['RequestType'] and not self.isRelval(request):
            return list(set(self.collectinchain(request, 'AcquisitionEra').values()))
        return [request['Campaign']]

    def heavyRead(self, request):
        """
        Return True by default. False if 'premix' appears in the
        output datasets or in the campaigns
        """
        response = True
        if any(['premix' in c.lower() for c in self.getCampaigns(request)]):
            response = False
        if any(['premix' in o.lower() for o in request['OutputDatasets']]):
            response = False
        return response

    def isRelval(self, request):
        "Return if given request is RelVal sample"
        if 'SubRequestType' in request and 'RelVal' in request['SubRequestType']:
            return True
        return False

    def collectinchain(self, request, member, func=None, default=None):
        "Helper function to return dictionary of collection chain"
        if request['RequestType'] == 'StepChain':
            return self.collectionHelper(request, member, func, default, base='Step')
        elif request['RequestType'] == 'TaskChain':
            return self.collectionHelper(request, member, func, default, base='Task')
        else:
            raise Exception("should not call collectinchain on non-chain request")

    def collectionHelper(self, request, member, func=None, default=None, base=None):
        "Helper function to return uhm chain as a dictionary"
        coll = {}
        item = 1
        while '%s%d' % (base, item) in request:
            if member in request['%s%d' % (base, item)]:
                if func:
                    coll[request['%s%d' % (base, item)]['%sName' % base]] = \
                        func(request['%s%d' % (base, item)][member])
                else:
                    coll[request['%s%d' % (base, item)]['%sName' % base]] = \
                        request['%s%d' % (base, item)].get(member, default)
            item += 1
        return coll

    def getMulticore(self, request):
        "Return max number of cores for a given request"
        mcores = [int(request.get('Multicore', 1))]
        if 'Chain' in request['RequestType']:
            mcoresCol = self.collectinchain(request, 'Multicore', default=1)
            mcores.extend([int(v) for v in mcoresCol.values()])
        return max(mcores)
Beispiel #29
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """

    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
            self.maxRetries = self.config.ErrorHandler.maxRetries
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
            self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries")

        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        acdcJobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']]

        self.handleACDC(acdcJobList)

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Query auxiliary db for current state of maxRetries
        if self.reqAuxDB:
            self.maxRetries = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("MaxRetries", self.maxRetries)

        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        if self.reqAuxDB:
            self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName).get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.", job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.", job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location is different from recorded in wmbs
                # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this.
                # If location in wmbs needs to be updated, it should happen in JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i", job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodesNoRetry]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'],
                                                                                     str(report.getExitCodes()))
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList), state)
            for jobSlice in grouper(idList, self.maxProcessSize):
                jobList = self.loadJobsFromList(jobSlice)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        for jobSlice in grouper(idList, self.maxProcessSize):
            jobList = self.loadJobsFromList(jobSlice)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.
        """
        logging.debug("Running error handling algorithm")
        try:
            myThread = threading.currentThread()
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
Beispiel #30
0
    def __init__(self, rest, config):

        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        self.mgr = RequestHandler()
Beispiel #31
0
class MSManager(object):
    """
    Entry point for the MicroServices.
    This class manages both transferor and monitor services/threads.
    """
    def __init__(self, config=None, logger=None):
        """
        Setup a bunch of things, like:
         * logger for this service
         * initialize all the necessary service helpers
         * fetch the unified configuration from central couch
         * update the unified configuration with some deployment and default settings
         * start both transfer and monitor threads
        :param config: reqmgr2ms service configuration
        :param logger:
        """
        self.uConfig = {}
        self.config = config
        self.logger = getMSLogger(getattr(config, 'verbose', False), logger)
        self._parseConfig(config)
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgrUrl'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgrUrl'],
                                   httpDict={'cacheduration': 60},
                                   logger=self.logger)

        # transferor has to look at workflows in assigned status
        self.msTransferor = MSTransferor(self.msConfig,
                                         "assigned",
                                         logger=self.logger)

        ### Last but not least, get the threads started
        thname = 'MSTransferor'
        self.transfThread = start_new_thread(
            thname, daemon, (self.transferor, 'assigned',
                             self.msConfig['interval'], self.logger))
        self.logger.debug("### Running %s thread %s", thname,
                          self.transfThread.running())

        thname = 'MSTransferorMonit'
        self.monitThread = start_new_thread(
            thname, daemon, (self.monitor, 'staging',
                             self.msConfig['interval'] * 2, self.logger))
        self.logger.debug("+++ Running %s thread %s", thname,
                          self.monitThread.running())

    def _parseConfig(self, config):
        """
        __parseConfig_
        Parse the MicroService configuration and set any default values.
        :param config: config as defined in the deployment
        """
        self.logger.info("Using the following config:\n%s", config)

        self.msConfig = {}
        self.msConfig['verbose'] = getattr(config, 'verbose', False)
        self.msConfig['group'] = getattr(config, 'group', 'DataOps')
        self.msConfig['interval'] = getattr(config, 'interval', 5 * 60)
        self.msConfig['readOnly'] = getattr(config, 'readOnly', True)

        self.msConfig['reqmgrUrl'] = getattr(config, 'reqmgr2Url',
                                             'https://cmsweb.cern.ch/reqmgr2')
        self.msConfig['reqmgrCacheUrl'] = self.msConfig['reqmgrUrl'].replace(
            'reqmgr2', 'couchdb/reqmgr_workload_cache')
        self.msConfig['phedexUrl'] = getattr(
            config, 'phedexUrl',
            'https://cmsweb.cern.ch/phedex/datasvc/json/prod')
        self.msConfig['dbsUrl'] = getattr(
            config, 'dbsUrl',
            'https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

    def transferor(self, reqStatus):
        """
        MSManager transferor function.
        It performs Unified logic for data subscription and
        transfers requests from assigned to staging/staged state of ReqMgr2.
        For references see
        https://github.com/dmwm/WMCore/wiki/ReqMgr2-MicroService-Transferor
        """
        startT = time.time()
        self.logger.info("Starting the transferor thread...")
        self.msTransferor.execute()
        self.logger.info("Total transferor execution time: %.2f secs",
                         time.time() - startT)

    def monitor(self, reqStatus='staging'):
        """
        MSManager monitoring function.
        It performs transfer requests from staging to staged state of ReqMgr2.
        For references see
        https://github.com/dmwm/WMCore/wiki/ReqMgr2-MicroService-Transferor
        """
        startT = time.time()
        self.logger.info("Starting the monitor thread...")
        # First, fetch/update our unified configuration from reqmgr_aux db
        # Keep our own copy of the unified config to avoid race conditions
        self.uConfig = self.reqmgrAux.getUnifiedConfig(docName="config")
        if not self.uConfig:
            self.logger.warning(
                "Monitor failed to fetch the unified config. Skipping this cycle."
            )
            return
        self.uConfig = self.uConfig[0]

        try:
            # get requests from ReqMgr2 data-service for given statue
            # here with detail=False we get back list of records
            requests = self.reqmgr2.getRequestByStatus([reqStatus],
                                                       detail=False)
            self.logger.debug('+++ monit found %s requests in %s state',
                              len(requests), reqStatus)

            requestStatus = {}  # keep track of request statuses
            for reqName in requests:
                req = {'name': reqName, 'reqStatus': reqStatus}
                # get transfer IDs
                tids = self.getTransferIDs()
                # get transfer status
                transferStatuses = self.getTransferStatuses(tids)
                # get campaing and unified configuration
                campaign = self.requestCampaign(reqName)
                conf = self.requestConfiguration(reqName)
                self.logger.debug("+++ request %s campaing %s conf %s", req,
                                  campaign, conf)

                # if all transfers are completed, move the request status staging -> staged
                # completed = self.checkSubscription(request)
                completed = 100  # TMP
                if completed == 100:  # all data are staged
                    self.logger.debug(
                        "+++ request %s all transfers are completed", req)
                    self.change(req, 'staged', '+++ monit')
                # if pileup transfers are completed AND some input blocks are completed, move the request status staging -> staged
                elif self.pileupTransfersCompleted(tids):
                    self.logger.debug(
                        "+++ request %s pileup transfers are completed", req)
                    self.change(req, 'staged', '+++ monit')
                # transfers not completed, just update the database with their completion
                else:
                    self.logger.debug(
                        "+++ request %s transfers are not completed", req)
                    requestStatus[
                        req] = transferStatuses  # TODO: implement update of transfer ids
            self.updateTransferIDs(requestStatus)
        except Exception as err:  # general error
            self.logger.exception('+++ monit error: %s', str(err))
        self.logger.info("Total monitor execution time: %.2f secs",
                         time.time() - startT)

    def stop(self):
        "Stop MSManager"
        # stop MSTransferorMonit thread
        self.monitThread.stop()
        # stop MSTransferor thread
        self.transfThread.stop()  # stop checkStatus thread
        status = self.transfThread.running()
        return status

    def getTransferIDsDoc(self):
        """
        Get transfer ids document from backend. The document has the following form:
        {
          "wf_A": [record1, record2, ...],
          "wf_B": [....],
        }
        where each record has the following format:
        {"timestamp":000, "dataset":"/a/b/c", "type": "primary", "trainsferIDs": [1,2,3]}
        """
        doc = {}
        return doc

    def updateTransferIDs(self, requestStatus):
        "Update transfer ids in backend"
        # TODO/Wait: https://github.com/dmwm/WMCore/issues/9198
        # doc = self.getTransferIDsDoc()

    def getTransferIDs(self):
        "Get transfer ids from backend"
        # TODO/Wait: https://github.com/dmwm/WMCore/issues/9198
        # meanwhile return transfer ids from internal store
        return []

    def getTransferStatuses(self, tids):
        "get transfer statuses for given transfer IDs from backend"
        # transfer docs on backend has the following form
        # https://gist.github.com/amaltaro/72599f995b37a6e33566f3c749143154
        statuses = {}
        for tid in tids:
            # TODO: I need to find request name from transfer ID
            # status = self.checkSubscription(request)
            status = 100
            statuses[tid] = status
        return statuses

    def requestCampaign(self, req):
        "Return request campaign"
        return 'campaign_TODO'  # TODO

    def requestConfiguration(self, req):
        "Return request configuration"
        return {}

    def pileupTransfersCompleted(self, tids):
        "Check if pileup transfers are completed"
        # TODO: add implementation
        return False

    def checkSubscription(self, req):
        "Send request to Phedex and return status of request subscription"
        sdict = {}
        for dataset in req.get('datasets', []):
            data = self.phedex.subscriptions(dataset=dataset,
                                             group=self.msConfig['group'])
            self.logger.debug("### dataset %s group %s", dataset,
                              self.msConfig['group'])
            self.logger.debug("### subscription %s", data)
            for row in data['phedex']['dataset']:
                if row['name'] != dataset:
                    continue
                nodes = [s['node'] for s in row['subscription']]
                rNodes = req.get('sites')
                self.logger.debug("### nodes %s %s", nodes, rNodes)
                subset = set(nodes) & set(rNodes)
                if subset == set(rNodes):
                    sdict[dataset] = 1
                else:
                    pct = float(len(subset)) / float(len(set(rNodes)))
                    sdict[dataset] = pct
        self.logger.debug("### sdict %s", sdict)
        tot = len(sdict.keys())
        if not tot:
            return -1
        # return percentage of completion
        return round(float(sum(sdict.values())) / float(tot), 2) * 100

    def checkStatus(self, req):
        "Check status of request in local storage"
        self.logger.debug("### checkStatus of request: %s", req['name'])
        # check subscription status of the request
        # completed = self.checkSubscription(req)
        completed = 100
        if completed == 100:  # all data are staged
            self.logger.debug(
                "### request is completed, change its status and remove it from the store"
            )
            self.change(req, 'staged', '### transferor')
        else:
            self.logger.debug("### request %s, completed %s", req, completed)

    def info(self, req):
        "Return info about given request"
        completed = self.checkSubscription(req)
        return {'request': req, 'status': completed}

    def delete(self, request):
        "Delete request in backend"
        pass

    def change(self, req, reqStatus, prefix='###'):
        """
        Change request status, internally it is done via PUT request to ReqMgr2:
        curl -X PUT -H "Content-Type: application/json" \
             -d '{"RequestStatus":"staging", "RequestName":"bla-bla"}' \
             https://xxx.yyy.zz/reqmgr2/data/request
        """
        self.logger.debug('%s updating %s status to %s', prefix, req['name'],
                          reqStatus)
        try:
            if not self.msConfig['readOnly']:
                self.reqmgr2.updateRequestStatus(req['name'], reqStatus)
        except Exception as err:
            self.logger.exception("Failed to change request status. Error: %s",
                                  str(err))
Beispiel #32
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(sandboxDir,
                                         'PackageCollection_%i' % collectionIndex,
                                         'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {"batchid": batchid,
                                                         'id': loadedJob['id'],
                                                         "package": JobPackage(directory=collectionDir)}

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs", len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info("Skipping cache update to be submitted. (%s job in cache)", len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount, len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s", pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [x for x in possibleLocations if x not in self.abortSites]
                if nonAbortSites: # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [x for x in possibleLocations if x not in self.drainSites]
                if nonDrainingSites: # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'], 0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {'id': jobID,
                       'requestName': newJob['request_name'],
                       'taskName': newJob['task_name'],
                       'taskType': newJob['type'],
                       'cache_dir': newJob["cache_dir"],
                       'priority': newJob['wf_priority'],
                       'taskID': newJob['task_id'],
                       'retry_count': newJob["retry_count"],
                       'taskPriority': None,                                # update from the thresholds
                       'custom': {'location': None},                        # update later
                       'packageDir': batchDir,
                       'sandbox': loadedJob["sandbox"],                     # remove before submit
                       'userdn': loadedJob.get("ownerDN", None),
                       'usergroup': loadedJob.get("ownerGroup", ''),
                       'userrole': loadedJob.get("ownerRole", ''),
                       'possibleSites': frozenset(possibleLocations),       # abort and drain sites filtered out
                       'potentialSites': frozenset(potentialLocations),     # original list of sites
                       'scramArch': loadedJob.get("scramArch", None),
                       'swVersion': loadedJob.get("swVersion", None),
                       'name': loadedJob["name"],
                       'proxyPath': loadedJob.get("proxyPath", None),
                       'estimatedJobTime': loadedJob.get("estimatedJobTime", None),
                       'estimatedDiskUsage': loadedJob.get("estimatedDiskUsage", None),
                       'estimatedMemoryUsage': loadedJob.get("estimatedMemoryUsage", None),
                       'numberOfCores': loadedJob.get("numberOfCores", 1),  # may update it later
                       'inputDataset': loadedJob.get('inputDataset', None),
                       'inputDatasetLocations': loadedJob.get('inputDatasetLocations', None),
                       'allowOpportunistic': loadedJob.get('allowOpportunistic', False)}

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug("The following jobs could not be submitted: %s, error code : %d", badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData()
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode]  +
                                         ": file locations: " + ', '.join(job['fileLocations']) +
                                         ": site white list: " + ', '.join(job['siteWhitelist']) +
                                         ": site black list: " + ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed", WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid" : job["id"], "fwjrpath" : fwjrPath})
            except IOError as ioer:
                logging.error("Failed to write FWJR for submit failed job %d, message: %s", job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items():
                    self.taskTypePrioMap[task] = value.get('priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or  newAbortSites != self.abortSites:
            logging.info("Draining or Aborted sites have changed, the cache will be rebuilt.")
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        returns the string describing whether a job is ready to be submitted or the reason can't be submitted
        Only jobs with "JobSubmitReady" return value will be added to submit job.
        Other return values will indicate the reason jobs cannot be submitted.
        i.e. "NoPendingSlot"  - pending slot is full with pending job
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName]["total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName]["total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName]["total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName]["total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName]['thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName]['thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault("init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName]["init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName, jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (jobPrio <= highestPriorityInJobs) or (jobType in self.ioboundTypes):
            # there is no pending or running jobs in the system (None case) or
            # priority of the job is lower or equal don't allow overflow
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(totalPendingSlots, initTotalPending) + (
                                    totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                                    taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{"Condition": "NoPendingSlot",
                     "Current": totalPendingJobs,
                     "Threshold": totalPendingThreshold},
                    {"Condition": "NoTaskPendingSlot",
                     "Current": taskPendingJobs,
                     "Threshold": taskPendingThreshold},
                    {"Condition": "NoRunningSlot",
                     "Current": totalPendingJobs + totalRunningJobs,
                     "Threshold": totalJobThreshold},
                    {"Condition": "NoTaskRunningSlot",
                     "Current": taskPendingJobs + taskRunningJobs,
                     "Threshold": totalTaskTheshold}]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(), key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn("Have a job for %s which is not in the resource control", siteName)
                        continue

                    condition = self._getJobSubmitCondition(jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName, condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName]["total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType]["task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info("Submitter reached limit of submit slots for this cycle: %i", self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s", dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job['site_cms_name'] = self.getSiteInfo(job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({'jobid': job['id'], 'location': job['custom']['location']})

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.", len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList, conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get("CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg, "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork", "warning", this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(dbi=myThread.dbi, logger=logging,
                                               condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (freeSubmitSlots > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Beispiel #33
0
 def __init__(self, config=None):
     self.config = config
     self.reqmgrAux = ReqMgrAux(self.config.reqmgr2_url)
     self.requests = {}
     self.reqManager = RequestManager()
     self.taskManager = TaskManager(nworkers=3)
Beispiel #34
0
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount', 10000)
        self.maxJobsPerPoll = int(getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize', 500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize', self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority', 1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir, 'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache()
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return
Beispiel #35
0
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.previousConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
        self.condorStates = ("Running", "Idle")

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        if self.agentConfig:
            # make a copy of the previous agent aux db configuration to compare against later
            self.previousConfig = copy.deepcopy(self.agentConfig)
        # grab a new copy of the agent aux db configuration
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(
            self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error(
                "Failed to fetch agent configuration from the auxiliary DB")
            return

        try:
            # see if the agent is in drain mode
            if self.agentConfig["UserDrainMode"] or self.agentConfig[
                    "AgentDrainMode"]:
                # check to see if the agent hit any speed drain thresholds
                thresholdsHit = self.checkSpeedDrainThresholds()
                if thresholdsHit:
                    logging.info(
                        "Updating agent configuration for speed drain...")
                    self.updateAgentSpeedDrainConfig(thresholdsHit)
                # now collect drain statistics
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s",
                             str(DrainStatusPoller.drainStats))
            else:
                logging.info(
                    "Agent not in drain mode. Resetting flags and skipping drain check..."
                )
                self.resetAgentSpeedDrainConfig()

            # finally, check for any changes in drain status
            self.checkDrainStatusChanges()

        except Exception as ex:
            msg = "Error occurred, will retry later:\n"
            msg += str(ex)
            logging.exception(msg)

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def checkDrainStatusChanges(self):
        """
        Check to see if any drain statuses have changed in the auxiliary db
        If yes, send email notification and update local drain thread variables

        """
        message = ""
        drainStatusKeys = ['UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode']

        if not self.previousConfig:
            return

        for key in drainStatusKeys:
            if self.previousConfig[key] != self.agentConfig[key]:
                message += "Agent had a drain status transition to %s = %s\n" % (
                    str(key), str(self.agentConfig[key]))

        if message:
            self.emailAlert.send(
                "DrainMode status change on " +
                getattr(self.config.Agent, "hostName"), message)
            logging.info("Drain mode status change: %s", message)

        return

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info(
                "Bumping condor job priority to 999999 for Production/Processing pending jobs."
            )
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates weather
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info(
                "Enabling NoJobRetries flag: Error Handler won't retry the jobs"
            )
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info(
                "Enabling EnableAllSites flag: Updating agent to submit to all sites."
            )
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.agentConfig['SpeedDrainMode'] = True
            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName,
                                              self.agentConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.agentConfig['SpeedDrainMode'] = False
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in viewitems(speedDrainConfig):
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName,
                                              self.agentConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need updated for speed draining
        """
        enableKeys = []
        # first, update our summary of condor jobs
        totalJobs = self.getTotalCondorJobs()
        if totalJobs is None:
            msg = "Cannot check speed drain because there was an error fetching job summary from HTCondor."
            msg += " Will retry again in the next cycle."
            logging.warning(msg)
            return []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in viewitems(speedDrainConfig):
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(
                    v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if totalJobs < v['Threshold']:
                        logging.info(
                            "Agent will update speed drain configuration for %s. ",
                            k)
                        enableKeys.append(k)
            else:
                logging.warning(
                    "Speed drain configuration error for %s.  Please check aux db contents.",
                    k)

        return enableKeys

    def getTotalCondorJobs(self):
        """
        Retrieve a summary of the jobs in condor and return an absolute number
        of the jobs in Idle and Running states.
        :return: returns an integer with the total number of jobs, or None if it failed.
        """
        jobs = self.condorAPI.getCondorJobsSummary()
        if not jobs:
            return None

        results = 0
        if jobs:
            for state in self.condorStates:
                results += int(jobs[0].get(state))
        return results
Beispiel #36
0
class JobSubmitterPoller(BaseWorkerThread):
    """
    _JobSubmitterPoller_

    The jobSubmitterPoller takes the jobs and organizes them into packages
    before sending them to the individual plugin submitters.
    """
    def __init__(self, config):
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()
        self.config = config

        #DAO factory for WMBS objects
        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)

        #Libraries
        self.resourceControl = ResourceControl()
        self.changeState = ChangeState(self.config)
        self.bossAir = BossAirAPI(config=self.config)

        self.hostName = self.config.Agent.hostName
        self.repollCount = getattr(self.config.JobSubmitter, 'repollCount',
                                   10000)
        self.maxJobsPerPoll = int(
            getattr(self.config.JobSubmitter, 'maxJobsPerPoll', 1000))
        self.maxJobsThisCycle = self.maxJobsPerPoll  # changes as per schedd limit
        self.cacheRefreshSize = int(
            getattr(self.config.JobSubmitter, 'cacheRefreshSize', 30000))
        self.skipRefreshCount = int(
            getattr(self.config.JobSubmitter, 'skipRefreshCount', 20))
        self.packageSize = getattr(self.config.JobSubmitter, 'packageSize',
                                   500)
        self.collSize = getattr(self.config.JobSubmitter, 'collectionSize',
                                self.packageSize * 1000)
        self.maxTaskPriority = getattr(self.config.BossAir, 'maxTaskPriority',
                                       1e7)
        self.condorFraction = 0.75  # update during every algorithm cycle
        self.condorOverflowFraction = 0.2
        self.ioboundTypes = ('LogCollect', 'Merge', 'Cleanup', 'Harvesting')

        # Additions for caching-based JobSubmitter
        self.cachedJobIDs = set()
        self.cachedJobs = {}
        self.jobDataCache = {}
        self.jobsToPackage = {}
        self.sandboxPackage = {}
        self.locationDict = {}
        self.taskTypePrioMap = {}
        self.drainSites = set()
        self.abortSites = set()
        self.refreshPollingCount = 0

        try:
            if not getattr(self.config.JobSubmitter, 'submitDir', None):
                self.config.JobSubmitter.submitDir = self.config.JobSubmitter.componentDir
            self.packageDir = os.path.join(self.config.JobSubmitter.submitDir,
                                           'packages')

            if not os.path.exists(self.packageDir):
                os.makedirs(self.packageDir)
        except OSError as ex:
            msg = "Error while trying to create packageDir %s\n!"
            msg += str(ex)
            logging.error(msg)
            logging.debug("PackageDir: %s", self.packageDir)
            logging.debug("Config: %s", config)
            raise JobSubmitterPollerException(msg)

        # Now the DAOs
        self.listJobsAction = self.daoFactory(
            classname="Jobs.ListForSubmitter")
        self.setLocationAction = self.daoFactory(classname="Jobs.SetLocation")
        self.locationAction = self.daoFactory(
            classname="Locations.GetSiteInfo")
        self.setFWJRPathAction = self.daoFactory(classname="Jobs.SetFWJRPath")
        self.listWorkflows = self.daoFactory(
            classname="Workflow.ListForSubmitter")

        # Keep a record of the thresholds in memory
        self.currentRcThresholds = {}

        self.useReqMgrForCompletionCheck = getattr(
            self.config.TaskArchiver, 'useReqMgrForCompletionCheck', True)

        if self.useReqMgrForCompletionCheck:
            # only set up this when reqmgr is used (not Tier0)
            self.reqmgr2Svc = ReqMgr(self.config.General.ReqMgr2ServiceURL)
            self.abortedAndForceCompleteWorkflowCache = self.reqmgr2Svc.getAbortedAndForceCompleteRequestsFromMemoryCache(
            )
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        else:
            # Tier0 Case - just for the clarity (This private variable shouldn't be used
            self.abortedAndForceCompleteWorkflowCache = None

        return

    def getPackageCollection(self, sandboxDir):
        """
        _getPackageCollection_

        Given a jobID figure out which packageCollection
        it should belong in.
        """

        rawList = os.listdir(sandboxDir)
        collections = []
        numberList = []
        for entry in rawList:
            if 'PackageCollection' in entry:
                collections.append(entry)

        # If we have no collections, return 0 (PackageCollection_0)
        if len(collections) < 1:
            return 0

        # Loop over the list of PackageCollections
        for collection in collections:
            collectionPath = os.path.join(sandboxDir, collection)
            packageList = os.listdir(collectionPath)
            collectionNum = int(collection.split('_')[1])
            if len(packageList) < self.collSize:
                return collectionNum
            else:
                numberList.append(collectionNum)

        # If we got here, then all collections are full.  We'll need
        # a new one.  Find the highest number, increment by one
        numberList.sort()
        return numberList[-1] + 1

    def addJobsToPackage(self, loadedJob):
        """
        _addJobsToPackage_

        Add a job to a job package and then return the batch ID for the job.
        Packages are only written out to disk when they contain 100 jobs.  The
        flushJobsPackages() method must be called after all jobs have been added
        to the cache and before they are actually submitted to make sure all the
        job packages have been written to disk.
        """
        if loadedJob["workflow"] not in self.jobsToPackage:
            # First, let's pull all the information from the loadedJob
            batchid = "%s-%s" % (loadedJob["id"], loadedJob["retry_count"])
            sandboxDir = os.path.dirname(loadedJob["sandbox"])

            # Second, assemble the jobPackage location
            collectionIndex = self.getPackageCollection(sandboxDir)
            collectionDir = os.path.join(
                sandboxDir, 'PackageCollection_%i' % collectionIndex,
                'batch_%s' % batchid)

            # Now create the package object
            self.jobsToPackage[loadedJob["workflow"]] = {
                "batchid": batchid,
                'id': loadedJob['id'],
                "package": JobPackage(directory=collectionDir)
            }

        jobPackage = self.jobsToPackage[loadedJob["workflow"]]["package"]
        jobPackage[loadedJob["id"]] = loadedJob.getDataStructsJob()
        batchDir = jobPackage['directory']

        if len(jobPackage.keys()) == self.packageSize:
            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[loadedJob["workflow"]]

        return batchDir

    def flushJobPackages(self):
        """
        _flushJobPackages_

        Write any jobs packages to disk that haven't been written out already.
        """
        workflowNames = self.jobsToPackage.keys()
        for workflowName in workflowNames:
            jobPackage = self.jobsToPackage[workflowName]["package"]
            batchDir = jobPackage['directory']

            if not os.path.exists(batchDir):
                os.makedirs(batchDir)

            batchPath = os.path.join(batchDir, "JobPackage.pkl")
            jobPackage.save(batchPath)
            del self.jobsToPackage[workflowName]

        return

    def refreshCache(self):
        """
        _refreshCache_

        Query WMBS for all jobs in the 'created' state.  For all jobs returned
        from the query, check if they already exist in the cache.  If they
        don't, unpickle them and combine their site white and black list with
        the list of locations they can run at.  Add them to the cache.

        Each entry in the cache is a tuple with five items:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
        """
        badJobs = dict([(x, []) for x in range(71101, 71105)])
        dbJobs = set()

        logging.info("Refreshing priority cache with currently %i jobs",
                     len(self.cachedJobIDs))

        if self.cacheRefreshSize == -1 or len(self.cachedJobIDs) < self.cacheRefreshSize or \
           self.refreshPollingCount >= self.skipRefreshCount:
            newJobs = self.listJobsAction.execute()
            self.refreshPollingCount = 0

            if self.useReqMgrForCompletionCheck:
                # if reqmgr is used (not Tier0 Agent) get the aborted/forceCompleted record
                abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
                )
            else:
                #T0Agent
                abortedAndForceCompleteRequests = []

            logging.info("Found %s new jobs to be submitted.", len(newJobs))
        else:
            self.refreshPollingCount += 1
            newJobs = []
            dbJobs = self.cachedJobIDs
            abortedAndForceCompleteRequests = []
            logging.info(
                "Skipping cache update to be submitted. (%s job in cache)",
                len(dbJobs))

        logging.info("Determining possible sites for new jobs...")
        jobCount = 0
        for newJob in newJobs:
            # whether newJob belongs to aborted or force-complete workflow, and skip it if it is.
            if (newJob['request_name'] in abortedAndForceCompleteRequests) and \
               (newJob['type'] not in ['LogCollect', "Cleanup"]):
                continue

            jobID = newJob['id']
            dbJobs.add(jobID)
            if jobID in self.cachedJobIDs:
                continue

            jobCount += 1
            if jobCount % 5000 == 0:
                logging.info("Processed %d/%d new jobs.", jobCount,
                             len(newJobs))

            pickledJobPath = os.path.join(newJob["cache_dir"], "job.pkl")

            if not os.path.isfile(pickledJobPath):
                # Then we have a problem - there's no file
                logging.error("Could not find pickled jobObject %s",
                              pickledJobPath)
                badJobs[71103].append(newJob)
                continue
            try:
                jobHandle = open(pickledJobPath, "r")
                loadedJob = pickle.load(jobHandle)
                jobHandle.close()
            except Exception as ex:
                msg = "Error while loading pickled job object %s\n" % pickledJobPath
                msg += str(ex)
                logging.error(msg)
                raise JobSubmitterPollerException(msg)

            loadedJob['retry_count'] = newJob['retry_count']

            # figure out possible locations for job
            possibleLocations = loadedJob["possiblePSN"]

            # Create another set of locations that may change when a site goes white/black listed
            # Does not care about the non_draining or aborted sites, they may change and that is the point
            potentialLocations = set()
            potentialLocations.update(possibleLocations)

            # now check for sites in drain and adjust the possible locations
            # also check if there is at least one site left to run the job
            if len(possibleLocations) == 0:
                newJob['name'] = loadedJob['name']
                newJob['fileLocations'] = loadedJob.get('fileLocations', [])
                newJob['siteWhitelist'] = loadedJob.get('siteWhitelist', [])
                newJob['siteBlacklist'] = loadedJob.get('siteBlacklist', [])
                badJobs[71101].append(newJob)
                continue
            else:
                nonAbortSites = [
                    x for x in possibleLocations if x not in self.abortSites
                ]
                if nonAbortSites:  # if there is at least a non aborted/down site then run there, otherwise fail the job
                    possibleLocations = nonAbortSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71102].append(newJob)
                    continue

            # try to remove draining sites if possible, this is needed to stop
            # jobs that could run anywhere blocking draining sites
            # if the job type is Merge, LogCollect or Cleanup this is skipped
            if newJob['type'] not in self.ioboundTypes:
                nonDrainingSites = [
                    x for x in possibleLocations if x not in self.drainSites
                ]
                if nonDrainingSites:  # if >1 viable non-draining site remove draining ones
                    possibleLocations = nonDrainingSites
                else:
                    newJob['name'] = loadedJob['name']
                    newJob['possibleLocations'] = possibleLocations
                    badJobs[71104].append(newJob)
                    continue

            # locations clear of abort and draining sites
            newJob['possibleLocations'] = possibleLocations

            batchDir = self.addJobsToPackage(loadedJob)
            self.cachedJobIDs.add(jobID)

            # calculate the final job priority such that we can order cached jobs by prio
            jobPrio = self.taskTypePrioMap.get(newJob['type'],
                                               0) + newJob['wf_priority']
            if jobPrio not in self.cachedJobs:
                self.cachedJobs[jobPrio] = {}

            # now add basic information keyed by the jobid
            self.cachedJobs[jobPrio][jobID] = newJob

            # allow job baggage to override numberOfCores
            #       => used for repacking to get more slots/disk
            numberOfCores = loadedJob.get('numberOfCores', 1)
            if numberOfCores == 1:
                baggage = loadedJob.getBaggage()
                numberOfCores = getattr(baggage, "numberOfCores", 1)
            loadedJob['numberOfCores'] = numberOfCores

            # Create a job dictionary object and put it in the cache (needs to be in sync with RunJob)
            jobInfo = {
                'id':
                jobID,
                'requestName':
                newJob['request_name'],
                'taskName':
                newJob['task_name'],
                'taskType':
                newJob['type'],
                'cache_dir':
                newJob["cache_dir"],
                'priority':
                newJob['wf_priority'],
                'taskID':
                newJob['task_id'],
                'retry_count':
                newJob["retry_count"],
                'taskPriority':
                None,  # update from the thresholds
                'custom': {
                    'location': None
                },  # update later
                'packageDir':
                batchDir,
                'sandbox':
                loadedJob["sandbox"],  # remove before submit
                'userdn':
                loadedJob.get("ownerDN", None),
                'usergroup':
                loadedJob.get("ownerGroup", ''),
                'userrole':
                loadedJob.get("ownerRole", ''),
                'possibleSites':
                frozenset(
                    possibleLocations),  # abort and drain sites filtered out
                'potentialSites':
                frozenset(potentialLocations),  # original list of sites
                'scramArch':
                loadedJob.get("scramArch", None),
                'swVersion':
                loadedJob.get("swVersion", None),
                'name':
                loadedJob["name"],
                'proxyPath':
                loadedJob.get("proxyPath", None),
                'estimatedJobTime':
                loadedJob.get("estimatedJobTime", None),
                'estimatedDiskUsage':
                loadedJob.get("estimatedDiskUsage", None),
                'estimatedMemoryUsage':
                loadedJob.get("estimatedMemoryUsage", None),
                'numberOfCores':
                loadedJob.get("numberOfCores", 1),  # may update it later
                'inputDataset':
                loadedJob.get('inputDataset', None),
                'inputDatasetLocations':
                loadedJob.get('inputDatasetLocations', None),
                'allowOpportunistic':
                loadedJob.get('allowOpportunistic', False)
            }

            self.jobDataCache[jobID] = jobInfo

        # Register failures in submission
        for errorCode in badJobs:
            if badJobs[errorCode]:
                logging.debug(
                    "The following jobs could not be submitted: %s, error code : %d",
                    badJobs, errorCode)
                self._handleSubmitFailedJobs(badJobs[errorCode], errorCode)

        # If there are any leftover jobs, we want to get rid of them.
        self.flushJobPackages()

        # We need to remove any jobs from the cache that were not returned in
        # the last call to the database.
        jobIDsToPurge = self.cachedJobIDs - dbJobs
        self._purgeJobsFromCache(jobIDsToPurge)

        logging.info("Done pruning killed jobs, moving on to submit.")
        return

    def removeAbortedForceCompletedWorkflowFromCache(self):
        abortedAndForceCompleteRequests = self.abortedAndForceCompleteWorkflowCache.getData(
        )
        jobIDsToPurge = set()
        for jobID, jobInfo in self.jobDataCache.iteritems():
            if (jobInfo['requestName'] in abortedAndForceCompleteRequests) and \
               (jobInfo['taskType'] not in ['LogCollect', "Cleanup"]):
                jobIDsToPurge.add(jobID)
        self._purgeJobsFromCache(jobIDsToPurge)
        return

    def _purgeJobsFromCache(self, jobIDsToPurge):

        if len(jobIDsToPurge) == 0:
            return

        self.cachedJobIDs -= jobIDsToPurge

        for jobid in jobIDsToPurge:
            self.jobDataCache.pop(jobid, None)
            for jobPrio in self.cachedJobs:
                if self.cachedJobs[jobPrio].pop(jobid, None):
                    # then the jobid was found, go to the next one
                    break
        return

    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        For a default job report for the exitCode
        and register in the job. Preserve it on disk as well.
        Propagate the failure to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            job['fwjr'] = Report()
            if exitCode in [71102, 71104]:
                job['fwjr'].addError(
                    "JobSubmit", exitCode, "SubmitFailed",
                    WM_JOB_ERROR_CODES[exitCode] +
                    ', '.join(job['possibleLocations']))
            elif exitCode in [71101]:
                # there is no possible site
                if job.get("fileLocations"):
                    job['fwjr'].addError(
                        "JobSubmit", exitCode, "SubmitFailed",
                        WM_JOB_ERROR_CODES[exitCode] + ": file locations: " +
                        ', '.join(job['fileLocations']) +
                        ": site white list: " +
                        ', '.join(job['siteWhitelist']) +
                        ": site black list: " +
                        ', '.join(job['siteBlacklist']))

            else:
                job['fwjr'].addError("JobSubmit", exitCode, "SubmitFailed",
                                     WM_JOB_ERROR_CODES[exitCode])
            fwjrPath = os.path.join(job['cache_dir'],
                                    'Report.%d.pkl' % int(job['retry_count']))
            job['fwjr'].setJobID(job['id'])
            try:
                job['fwjr'].save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))
        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return

    def getThresholds(self):
        """
        _getThresholds_

        Retrieve submit thresholds, which considers what is pending and running
        for those sites.
        Also update the list of draining and abort/down sites.
        Finally, creates a map between task type and its priority.
        """
        self.taskTypePrioMap = {}
        newDrainSites = set()
        newAbortSites = set()

        rcThresholds = self.resourceControl.listThresholdsForSubmit()

        for siteName in rcThresholds.keys():
            # Add threshold if we don't have it already
            state = rcThresholds[siteName]["state"]

            if state == "Draining":
                newDrainSites.add(siteName)
            if state in ["Down", "Aborted"]:
                newAbortSites.add(siteName)

            # then update the task type x task priority mapping
            if not self.taskTypePrioMap:
                for task, value in rcThresholds[siteName]['thresholds'].items(
                ):
                    self.taskTypePrioMap[task] = value.get(
                        'priority', 0) * self.maxTaskPriority

        # When the list of drain/abort sites change between iteration then a location
        # refresh is needed, for now it forces a full cache refresh
        if newDrainSites != self.drainSites or newAbortSites != self.abortSites:
            logging.info(
                "Draining or Aborted sites have changed, the cache will be rebuilt."
            )
            self.cachedJobIDs = set()
            self.cachedJobs = {}
            self.jobDataCache = {}

        self.currentRcThresholds = rcThresholds
        self.abortSites = newAbortSites
        self.drainSites = newDrainSites

        return

    def _getJobSubmitCondition(self, jobPrio, siteName, jobType):
        """
        returns the string describing whether a job is ready to be submitted or the reason can't be submitted
        Only jobs with "JobSubmitReady" return value will be added to submit job.
        Other return values will indicate the reason jobs cannot be submitted.
        i.e. "NoPendingSlot"  - pending slot is full with pending job
        """
        try:
            totalPendingSlots = self.currentRcThresholds[siteName][
                "total_pending_slots"]
            totalPendingJobs = self.currentRcThresholds[siteName][
                "total_pending_jobs"]
            totalRunningSlots = self.currentRcThresholds[siteName][
                "total_running_slots"]
            totalRunningJobs = self.currentRcThresholds[siteName][
                "total_running_jobs"]

            taskPendingSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["pending_slots"]
            taskPendingJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_pending_jobs"]
            taskRunningSlots = self.currentRcThresholds[siteName][
                'thresholds'][jobType]["max_slots"]
            taskRunningJobs = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["task_running_jobs"]
            highestPriorityInJobs = self.currentRcThresholds[siteName][
                'thresholds'][jobType]['wf_highest_priority']

            # set the initial totalPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName].setdefault(
                "init_total_pending_jobs", totalPendingJobs)

            # set the initial taskPendingJobs since it increases in every cycle when a job is submitted
            self.currentRcThresholds[siteName]['thresholds'][
                jobType].setdefault("init_task_pending_jobs", taskPendingJobs)

            initTotalPending = self.currentRcThresholds[siteName][
                "init_total_pending_jobs"]
            initTaskPending = self.currentRcThresholds[siteName]['thresholds'][
                jobType]["init_task_pending_jobs"]

        except KeyError as ex:
            msg = "Invalid key for site %s and job type %s\n" % (siteName,
                                                                 jobType)
            logging.exception(msg)
            return "NoJobType_%s_%s" % (siteName, jobType)

        if (highestPriorityInJobs is None) or (
                jobPrio <= highestPriorityInJobs) or (jobType
                                                      in self.ioboundTypes):
            # there is no pending or running jobs in the system (None case) or
            # priority of the job is lower or equal don't allow overflow
            # Also if jobType is in ioboundTypes don't allow overflow
            totalPendingThreshold = totalPendingSlots
            taskPendingThreshold = taskPendingSlots
            totalJobThreshold = totalPendingSlots + totalRunningSlots
            totalTaskTheshold = taskPendingSlots + taskRunningSlots
        else:
            # In case the priority of the job is higher than any of currently pending or running jobs.
            # Then increase the threshold by condorOverflowFraction * original pending slot.
            totalPendingThreshold = max(
                totalPendingSlots, initTotalPending) + (
                    totalPendingSlots * self.condorOverflowFraction)
            taskPendingThreshold = max(taskPendingSlots, initTaskPending) + (
                taskPendingSlots * self.condorOverflowFraction)
            totalJobThreshold = totalPendingThreshold + totalRunningSlots
            totalTaskTheshold = taskPendingThreshold + taskRunningSlots

        jobStats = [{
            "Condition": "NoPendingSlot",
            "Current": totalPendingJobs,
            "Threshold": totalPendingThreshold
        }, {
            "Condition": "NoTaskPendingSlot",
            "Current": taskPendingJobs,
            "Threshold": taskPendingThreshold
        }, {
            "Condition": "NoRunningSlot",
            "Current": totalPendingJobs + totalRunningJobs,
            "Threshold": totalJobThreshold
        }, {
            "Condition": "NoTaskRunningSlot",
            "Current": taskPendingJobs + taskRunningJobs,
            "Threshold": totalTaskTheshold
        }]
        return jobSubmitCondition(jobStats)

    def assignJobLocations(self):
        """
        _assignJobLocations_

        Loop through the submit thresholds and pull sites out of the job cache
        as we discover open slots.  This will return a list of tuple where each
        tuple will have six elements:
          - WMBS Job ID
          - Retry count
          - Batch ID
          - Path to sanbox
          - Path to cache directory
          - SE name of the site to run at
        """
        jobsToSubmit = {}
        jobsToUncache = []
        jobsCount = 0
        exitLoop = False
        jobSubmitLogBySites = defaultdict(Counter)
        jobSubmitLogByPriority = defaultdict(Counter)

        # iterate over jobs from the highest to the lowest prio
        for jobPrio in sorted(self.cachedJobs, reverse=True):

            # then we're completely done and have our basket full of jobs to submit
            if exitLoop:
                break

            # start eating through the elder jobs first
            for job in sorted(self.cachedJobs[jobPrio].values(),
                              key=itemgetter('timestamp')):
                jobid = job['id']
                jobType = job['type']
                possibleSites = job['possibleLocations']
                jobSubmitLogByPriority[jobPrio]['Total'] += 1
                # now look for sites with free pending slots
                for siteName in possibleSites:
                    if siteName not in self.currentRcThresholds:
                        logging.warn(
                            "Have a job for %s which is not in the resource control",
                            siteName)
                        continue

                    condition = self._getJobSubmitCondition(
                        jobPrio, siteName, jobType)

                    if condition != "JobSubmitReady":
                        jobSubmitLogBySites[siteName][condition] += 1
                        logging.debug("Found a job for %s : %s", siteName,
                                      condition)
                        continue

                    # otherwise, update the site/task thresholds and the component job counter
                    self.currentRcThresholds[siteName][
                        "total_pending_jobs"] += 1
                    self.currentRcThresholds[siteName]['thresholds'][jobType][
                        "task_pending_jobs"] += 1

                    jobsCount += 1

                    # load (and remove) the job dictionary object from jobDataCache
                    cachedJob = self.jobDataCache.pop(jobid)
                    jobsToUncache.append((jobPrio, jobid))

                    # Sort jobs by jobPackage
                    package = cachedJob['packageDir']
                    if package not in jobsToSubmit.keys():
                        jobsToSubmit[package] = []

                    # Add the sandbox to a global list
                    self.sandboxPackage[package] = cachedJob.pop('sandbox')

                    # Now update the job dictionary object
                    cachedJob['custom'] = {'location': siteName}
                    cachedJob['taskPriority'] = self.currentRcThresholds[
                        siteName]['thresholds'][jobType]["priority"]

                    # Get this job in place to be submitted by the plugin
                    jobsToSubmit[package].append(cachedJob)

                    jobSubmitLogBySites[siteName]["submitted"] += 1
                    jobSubmitLogByPriority[jobPrio]['submitted'] += 1
                    # found a site to submit this job, so go to the next job
                    break

                # set the flag and get out of the job iteration
                if jobsCount >= self.maxJobsThisCycle:
                    logging.info(
                        "Submitter reached limit of submit slots for this cycle: %i",
                        self.maxJobsThisCycle)
                    exitLoop = True
                    break

        # jobs that are going to be submitted must be removed from all caches
        for prio, jobid in jobsToUncache:
            self.cachedJobs[prio].pop(jobid)
            self.cachedJobIDs.remove(jobid)

        logging.info("Site submission report: %s", dict(jobSubmitLogBySites))
        logging.info("Priority submission report: %s",
                     dict(jobSubmitLogByPriority))
        logging.info("Have %s packages to submit.", len(jobsToSubmit))
        logging.info("Have %s jobs to submit.", jobsCount)
        logging.info("Done assigning site locations.")
        return jobsToSubmit

    def submitJobs(self, jobsToSubmit):
        """
        _submitJobs_

        Actually do the submission of the jobs
        """

        jobList = []
        idList = []

        if len(jobsToSubmit) == 0:
            logging.debug("There are no packages to submit.")
            return

        for package in jobsToSubmit.keys():

            sandbox = self.sandboxPackage[package]
            jobs = jobsToSubmit.get(package, [])
            for job in jobs:
                job['location'], job['plugin'], job[
                    'site_cms_name'] = self.getSiteInfo(
                        job['custom']['location'])
                job['sandbox'] = sandbox
                idList.append({
                    'jobid': job['id'],
                    'location': job['custom']['location']
                })

            #Clean out the package reference
            del self.sandboxPackage[package]

            jobList.extend(jobs)

        myThread = threading.currentThread()
        myThread.transaction.begin()

        # Run the actual underlying submit code using bossAir
        successList, failList = self.bossAir.submit(jobs=jobList)
        logging.info("Jobs that succeeded/failed submission: %d/%d.",
                     len(successList), len(failList))

        # Propagate states in the WMBS database
        logging.debug("Propagating success state to WMBS.")
        self.changeState.propagate(successList, 'executing', 'created')
        logging.debug("Propagating fail state to WMBS.")
        self.changeState.propagate(failList, 'submitfailed', 'created')

        # At the end we mark the locations of the jobs
        # This applies even to failed jobs, since the location
        # could be part of the failure reason.
        logging.debug("Updating job location...")
        self.setLocationAction.execute(bulkList=idList,
                                       conn=myThread.transaction.conn,
                                       transaction=True)
        myThread.transaction.commit()
        logging.info("Transaction cycle successfully completed.")

        return

    def getSiteInfo(self, jobSite):
        """
        _getSiteInfo_

        This is how you get the name of a CE and the plugin for a job
        """

        if not jobSite in self.locationDict.keys():
            siteInfo = self.locationAction.execute(siteName=jobSite)
            self.locationDict[jobSite] = siteInfo[0]
        return (self.locationDict[jobSite].get('ce_name'),
                self.locationDict[jobSite].get('plugin'),
                self.locationDict[jobSite].get('cms_name'))

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        Try to, in order:
        1) Refresh the cache
        2) Find jobs for all the necessary sites
        3) Submit the jobs to the plugin
        """
        myThread = threading.currentThread()

        if self.useReqMgrForCompletionCheck:
            # only runs when reqmgr is used (not Tier0)
            self.removeAbortedForceCompletedWorkflowFromCache()
            agentConfig = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName)
            self.condorFraction = agentConfig.get('CondorJobsFraction', 0.75)
            self.condorOverflowFraction = agentConfig.get(
                "CondorOverflowFraction", 0.2)
        else:
            # For Tier0 agent
            self.condorFraction = 1
            self.condorOverflowFraction = 0

        if not self.passSubmitConditions():
            msg = "JobSubmitter didn't pass the submit conditions. Skipping this cycle."
            logging.warning(msg)
            myThread.logdbClient.post("JobSubmitter_submitWork", msg,
                                      "warning")
            return

        try:
            myThread.logdbClient.delete("JobSubmitter_submitWork",
                                        "warning",
                                        this_thread=True)
            self.getThresholds()
            self.refreshCache()

            jobsToSubmit = self.assignJobLocations()
            self.submitJobs(jobsToSubmit=jobsToSubmit)
        except WMException:
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg = 'Fatal error in JobSubmitter:\n'
            msg += str(ex)
            #msg += str(traceback.format_exc())
            msg += '\n\n'
            logging.error(msg)
            if getattr(myThread, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise JobSubmitterPollerException(msg)

        return

    def passSubmitConditions(self):
        """
        _passSubmitConditions_

        Check whether the component is allowed to submit jobs to condor.

        Initially it has only one condition, which is the total number of
        jobs we can have in condor (pending + running) per schedd, set by
        MAX_JOBS_PER_OWNER.
        """

        myThread = threading.currentThread()
        freeSubmitSlots = availableScheddSlots(
            dbi=myThread.dbi,
            logger=logging,
            condorFraction=self.condorFraction)
        self.maxJobsThisCycle = min(freeSubmitSlots, self.maxJobsPerPoll)

        return (self.maxJobsThisCycle > 0)

    def terminate(self, params):
        """
        _terminate_

        Kill the code after one final pass when called by the master thread.
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)
Beispiel #37
0
class MSCore(object):
    """
    This class provides core functionality for
    MSTransferor, MSMonitor and MSOutput classes.
    """
    def __init__(self, msConfig, logger=None):
        """
        Provides setup for MSTransferor and MSMonitor classes

        :param config: MS service configuration
        :param logger: logger object (optional)
        """
        self.logger = getMSLogger(getattr(msConfig, 'verbose', False), logger)
        self.msConfig = msConfig
        self.logger.info("Configuration including default values:\n%s",
                         self.msConfig)

        self.reqmgr2 = ReqMgr(self.msConfig['reqmgr2Url'], logger=self.logger)
        self.reqmgrAux = ReqMgrAux(self.msConfig['reqmgr2Url'],
                                   httpDict={'cacheduration': 1.0},
                                   logger=self.logger)

        # hard code it to production DBS otherwise PhEDEx subscribe API fails to match TMDB data
        dbsUrl = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        if usingRucio():
            # FIXME: we cannot use Rucio in write mode yet
            # self.rucio = Rucio(self.msConfig['rucioAccount'], configDict={"logger": self.logger})
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)
        else:
            self.phedex = PhEDEx(httpDict={'cacheduration': 0.5},
                                 dbsUrl=dbsUrl,
                                 logger=self.logger)

    def unifiedConfig(self):
        """
        Fetches the unified configuration
        :return: unified configuration content
        """
        res = self.reqmgrAux.getUnifiedConfig(docName="config")
        if res:
            if isinstance(res, list):
                return res[0]
            return res
        else:
            return {}

    def change(self, reqName, reqStatus, prefix='###'):
        """
        Update the request status in ReqMgr2
        """
        try:
            if self.msConfig['enableStatusTransition']:
                self.logger.info('%s updating %s status to: %s', prefix,
                                 reqName, reqStatus)
                self.reqmgr2.updateRequestStatus(reqName, reqStatus)
            else:
                self.logger.info('DRY-RUN:: %s updating %s status to: %s',
                                 prefix, reqName, reqStatus)
        except Exception as err:
            self.logger.exception("Failed to change request status. Error: %s",
                                  str(err))

    def updateReportDict(self, reportDict, keyName, value):
        """
        Provided a key name and value, validate the key name
        and update the report dictionary if it passes the validation
        :param reportDict: dictionary with a summary of the service
        :param keyName: string with the key name in the report
        :param value: string/integer value with the content of a metric
        :return: the updated dictionary
        """
        if keyName not in reportDict:
            self.logger.error("Report metric '%s' is not supported", keyName)
        else:
            reportDict[keyName] = value
        return reportDict
Beispiel #38
0
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)
        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if not isinstance(self.maxRetries, dict):
            self.maxRetries = {'default': self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException(
                'Max retries for the default job type must be specified')

        self.exitCodesNoRetry = []
        self.maxProcessSize = getattr(self.config.ErrorHandler,
                                      'maxProcessSize', 250)
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime',
                                   32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(
            url=config.ACDC.couchurl, database=config.ACDC.database)
        if hasattr(self.config, "Tier0Feeder"):
            self.reqAuxDB = None
        else:
            self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

        return

    def setup(self, parameters=None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """

        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [
            job for job in jobList
            if job['type'] not in ['LogCollect', 'Cleanup']
        ]

        self.handleACDC(jobList)

        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed",
                     len(jobList), state)
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'],
                                                 self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                logging.debug("JobInfo: %s", job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(
                cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs,
                                       'retrydone',
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs,
                                       '%scooloff' % state,
                                       '%sfailed' % state,
                                       updatesummary=True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs", len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs
        and determine those that can be retried
        and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []

        if self.reqAuxDB:
            self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig(
                self.config.Agent.hostName).get("NoRetryExitCodes", [])

        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error(
                    "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.",
                    job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error(
                    "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.",
                    job['id'], reportPath)
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                # correct the location if the original location is different from recorded in wmbs
                # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this.
                # If location in wmbs needs to be updated, it should happen in JobAccountant.
                locationFromFWJR = report.getSiteName()
                if locationFromFWJR:
                    job["location"] = locationFromFWJR
                    job["site_cms_name"] = locationFromFWJR

                if startTime is None or stopTime is None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i",
                                  job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (
                        job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([
                        x for x in report.getExitCodes()
                        if x in self.exitCodesNoRetry
                ]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    exhaustJobs.append(job)
                    continue

                if len(
                    [x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (
                        job['id'], str(report.getExitCodes()))
                    logging.debug(msg)
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning(
                    "Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d retry done jobs", len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.info("About to process %d failures", len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state="%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed", len(idList),
                         state)
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state='retrydone')
        logging.info("Found %d jobs done with all retries", len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk.
        Include the full metadata.
        """

        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID=binds)

        # You have to have a list
        if isinstance(results, dict):
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id=entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    @timeFunction
    def algorithm(self, parameters=None):
        """
        Performs the handleErrors method, looking for each type of failure
        And deal with it as desired.
        """
        logging.debug("Running error handling algorithm")
        try:
            myThread = threading.currentThread()
            self.handleErrors()
        except (CouchConnectionError, HTTPException) as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. "
            msg += "Transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.error(msg)
        except Exception as ex:
            if getattr(myThread, 'transaction', None) is not None:
                myThread.transaction.rollback()
            msg = "Caught unexpected exception in ErrorHandler:\n"
            msg += str(ex)
            logging.exception(msg)
            raise ErrorHandlerException(msg)
Beispiel #39
0
class BuildParentLock(CherryPyPeriodicTask):
    def __init__(self, rest, config):

        super(BuildParentLock, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        self.dbs = DBS3Reader(config.dbs_url)
        # cache of dbs lookups mapping input dataset to parent dataset
        self.dbsLookupCache = {}
        # set of of currently active datasets requiring parent dataset
        self.inputDatasetCache = set()
        self.reqDB = RequestDBReader(config.reqmgrdb_url)
        self.filterKeys = [
            'assignment-approved', 'assigned', 'staging', 'staged', 'failed',
            'acquired', 'running-open', 'running-closed', 'force-complete',
            'completed', 'closed-out'
        ]

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which
        """
        self.concurrentTasks = [{
            'func': self.fetchIncludeParentsRequests,
            'duration': config.updateParentsInterval
        }]

    def fetchIncludeParentsRequests(self, config):
        """
        Fetch active requests from the "requestsincludeparents" couch view that
        have IncludeParents=True, find parents of each dataset and send to
        reqmgr2 auxiliary database.
        """
        # use this boolean to signal whether there were datasets that failed
        # to get their parentage resolved
        incompleteParentage = False
        # use this boolean to signal if new parent datasets need to be locked
        auxDbUpdateRequired = False

        setDsets = set()
        setParents = set()
        dictParents = {}

        self.logger.info("Executing parent lock cherrypy thread")

        # query couch view to find datasets for workflows requiring parent datasets
        # only returning requests with the statuses in filterKeys
        try:
            results = self.reqDB._getCouchView("requestsincludeparents", {},
                                               self.filterKeys)
        except Exception as ex:
            self.logger.error(
                "Error retrieving requests including parent datasets from couchdb."
            )
            self.logger.error("Error: %s", str(ex))
            return

        for row in results["rows"]:
            dataset = row["value"]
            setDsets.add(dataset)

        # check to see if any changes have been made
        if setDsets != self.inputDatasetCache:
            auxDbUpdateRequired = True
            self.inputDatasetCache = setDsets.copy()

        self.logger.info(
            "Found %d unique datasets requiring the parent dataset",
            len(setDsets))
        if auxDbUpdateRequired:
            self.logger.info("Found new parent dataset locks to update.")
            # look up parent datasets first via the local DBS cache, if not found do lookup via DBS
            for dset in setDsets:
                if dset in self.dbsLookupCache:
                    setParents.add(self.dbsLookupCache[dset])
                    self.logger.info(
                        "Resolved parentage via lookup cache for: %s", dset)
                else:
                    try:
                        res = self.dbs.listDatasetParents(dset)
                    except Exception as exc:
                        self.logger.warning(
                            "Failed to resolve parentage for: %s. Error: %s",
                            dset, str(exc))
                        incompleteParentage = True
                        continue
                    self.logger.info("Resolved parentage via DBS for: %s", res)
                    if res:
                        setParents.add(res[0]['parent_dataset'])
                        self.dbsLookupCache[dset] = res[0]['parent_dataset']

            if not incompleteParentage:
                dictParents['parentlocks'] = list(setParents)
                if self.reqmgrAux.updateParentLocks(dictParents):
                    self.logger.info(
                        "Parentage lookup complete and auxiliary database updated."
                    )
                else:
                    self.logger.info(
                        "Error updating parentage document. Using stale data until next cycle."
                    )
            else:
                # then don't replace any data for the moment, simply add new parents
                previousData = self.reqmgrAux.getParentLocks()
                # check to see if response from aux db has been populated
                if previousData and 'parentlocks' in previousData[0]:
                    setPreviousData = set(previousData[0]['parentlocks'])
                    setParents = setParents | setPreviousData
                    dictParents['parentlocks'] = list(setParents)
                    self.reqmgrAux.updateParentLocks(dictParents)
                    self.logger.info(
                        "Parentage lookup complete (with errors) and auxiliary database updated."
                    )
                else:
                    self.logger.info(
                        "Parent locks not returned from auxiliary database. Skipping parentage update."
                    )

        else:
            self.logger.info(
                "No new parent datasets need locked. Skipping update of auxiliary database."
            )

        return