def mark_failed(ids, failures_reasons):
    """Flag the given Oracle file ids as FAILED in the transfers database.

    :param ids: list of Oracle file ids to update
    :param failures_reasons: list of strings with transfer failure messages
    :return: 0 on success, 1 on failure
    """
    try:
        server = HTTPRequests(rest_filetransfers, proxy, proxy)
        payload = {
            'asoworker': 'asoless',
            'subresource': 'updateTransfers',
            'list_of_ids': ids,
            'list_of_transfer_state': ["FAILED"] * len(ids),
            'list_of_failure_reason': failures_reasons,
            'list_of_retry_value': [0] * len(ids),
        }
        server.post('/filetransfers', data=encodeRequest(payload))
        logging.debug("Marked failed %s", ids)
    except Exception:
        logging.exception("Error updating documents")
        return 1
    return 0
def mark_transferred(ids):
    """Flag the given Oracle file ids as DONE in the transfers database.

    :param ids: list of Oracle file ids to update
    :return: 0 on success, 1 on failure
    """
    try:
        server = HTTPRequests(rest_filetransfers, proxy, proxy)
        logging.debug("Marking done %s", ids)
        payload = {
            'asoworker': 'asoless',
            'subresource': 'updateTransfers',
            'list_of_ids': ids,
            'list_of_transfer_state': ["DONE"] * len(ids),
        }
        server.post('/filetransfers', data=encodeRequest(payload))
        logging.debug("Marked good %s", ids)
    except Exception:
        logging.exception("Error updating documents")
        return 1
    return 0
def uploadWarning(self, warning, userProxy, taskname):
    """Attach a warning message to the task in the REST task database.

    Failures are logged and swallowed: uploading a warning is best effort.
    """
    try:
        server = HTTPRequests(self.server["host"], userProxy, userProxy, retry=2)
        payload = {"subresource": "addwarning", "workflow": taskname, "warning": b64encode(warning)}
        server.post(self.restURInoAPI + "/task", data=urllib.urlencode(payload))
    except HTTPException as hte:
        self.logger.error(hte.headers)
        self.logger.warning("Cannot add a warning to REST interface. Warning message: %s" % warning)
def deleteWarnings(self, userProxy, taskname):
    """Delete every warning attached to the task; a failed call is logged, not raised."""
    server = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2, logger=self.logger)
    payload = {'subresource': 'deletewarnings', 'workflow': taskname}
    try:
        server.post(self.restURInoAPI + '/task', data=urllib.urlencode(payload))
    except HTTPException as hte:
        self.logger.error("Error deleting warnings: %s", str(hte))
        self.logger.warning("Can not delete warnings from REST interface.")
def uploadWarning(self, warning, userProxy, taskname):
    """Best-effort upload of a warning message for the task to the REST interface."""
    uri = self.restURInoAPI + '/task'
    try:
        restServer = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2)
        request = {'subresource': 'addwarning',
                   'workflow': taskname,
                   'warning': b64encode(warning)}
        restServer.post(uri, data=urllib.urlencode(request))
    except HTTPException as hte:
        self.logger.error(hte.headers)
        self.logger.warning("Cannot add a warning to REST interface. Warning message: %s" % warning)
def execute(self, *args, **kwargs):
    """Run the WMCore splitter on the task's input fileset and return the job factory.

    :param args: args[0] is the WMCore Fileset holding the task input files.
    :param kwargs: expects kwargs['task'], a task dictionary carrying the tm_* fields.
    :return: Result wrapping the WMCore job factory.
    :raises TaskWorkerException: if splitting yields zero jobs or more than the maximum.
    """
    wmwork = Workflow(name=kwargs['task']['tm_taskname'])
    wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                          split_algo=kwargs['task']['tm_split_algo'],
                          type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
    splitter = SplitterFactory()
    jobfactory = splitter(subscription=wmsubs)
    splitparam = kwargs['task']['tm_split_args']
    splitparam['algorithm'] = kwargs['task']['tm_split_algo']
    # Map tm_totalunits onto the parameter name each splitting algorithm expects.
    if kwargs['task']['tm_job_type'] == 'Analysis':
        if kwargs['task']['tm_split_algo'] == 'FileBased':
            splitparam['total_files'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'LumiBased':
            splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
            splitparam['total_events'] = kwargs['task']['tm_totalunits']
    elif kwargs['task']['tm_job_type'] == 'PrivateMC':
        if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
            splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
        if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
            splitparam['lheInputFiles'] = True
    splitparam['applyLumiCorrection'] = True
    factory = jobfactory(**splitparam)
    numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
    maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
    if numJobs == 0:
        msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
        msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
        if kwargs['task']['tm_input_dataset']:
            msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
        msg += " with %s method does not generate any job" % (kwargs['task']['tm_split_algo'])
        raise TaskWorkerException(msg)
    elif numJobs > maxJobs:
        raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s" % (numJobs, maxJobs))
    #printing duplicated lumis if any
    lumiChecker = getattr(jobfactory, 'lumiChecker', None)
    if lumiChecker and lumiChecker.splitLumiFiles:
        self.logger.warning("The input dataset contains the following duplicated lumis %s" % lumiChecker.splitLumiFiles.keys())
        #TODO use self.uploadWarning
        try:
            userServer = HTTPRequests(self.server['host'], kwargs['task']['user_proxy'], kwargs['task']['user_proxy'], retry=2, logger=self.logger)
            configreq = {'subresource': 'addwarning',
                         'workflow': kwargs['task']['tm_taskname'],
                         'warning': b64encode('The CRAB3 server backend detected lumis split across files in the input dataset.'
                                              ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.')}
            userServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            self.logger.error(hte.headers)
            self.logger.warning("Cannot add warning to REST after finding duplicates")
    return Result(task=kwargs['task'], result=factory)
def uploadWarning(self, warning, userProxy, taskname):
    """Attach *warning* to the task in the REST database; errors are only logged."""
    configreq = {'subresource': 'addwarning', 'workflow': taskname, 'warning': b64encode(warning)}
    try:
        restServer = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2, logger=self.logger)
        restServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
    except HTTPException as hte:
        self.logger.error(hte.headers)
        self.logger.warning("Cannot add a warning to REST interface. Warning message: %s" % warning)
def deleteWarnings(self, userProxy, taskname):
    """Remove all warnings stored for the task; a failed call is logged, not raised."""
    restServer = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2, logger=self.logger)
    request = {'subresource': 'deletewarnings', 'workflow': taskname}
    try:
        restServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(request))
    except HTTPException as hte:
        self.logger.error("Error deleting warnings: %s", str(hte))
        self.logger.warning("Can not delete warnings from REST interface.")
def sendScheddToREST(self, task, schedd):
    """Record the chosen schedd for this task in the Oracle database via REST.

    Raises TaskWorkerException in case of failure (after 20 retries).
    """
    task['tm_schedd'] = schedd
    restServer = HTTPRequests(self.server['host'], task['user_proxy'], task['user_proxy'],
                              retry=20, logger=self.logger)
    request = {'workflow': task['tm_taskname'], 'subresource': 'updateschedd', 'scheddname': schedd}
    try:
        restServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(request))
    except HTTPException as hte:
        msg = "Unable to contact cmsweb and update scheduler on which task will be submitted. Error msg: %s" % hte.headers
        self.logger.warning(msg)
        time.sleep(20)
        raise TaskWorkerException(msg)  #we already tried 20 times, give up
def uploadWarning(self, warning, userProxy, taskname):
    """Upload a warning for the task, truncated to the REST size limit.

    When self.server is None (unit tests) the warning is only logged locally.
    """
    if not self.server: # When testing, the server can be None
        self.logger.warning(warning)
        return
    truncWarning = truncateError(warning)
    restServer = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2, logger=self.logger)
    request = {'subresource': 'addwarning',
               'workflow': taskname,
               'warning': b64encode(truncWarning)}
    try:
        restServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(request))
    except HTTPException as hte:
        self.logger.error("Error uploading warning: %s", str(hte))
        self.logger.warning("Cannot add a warning to REST interface. Warning message: %s", warning)
def updatewebdir(ad):
    """Publish the task's user web directory URL to the REST 'task' resource.

    :param ad: HTCondor classad-like mapping with CRAB_RestHost, CRAB_RestURInoAPI,
               CRAB_ReqName, CRAB_UserWebDir and X509UserProxy entries.
    :return: 0 on success, 1 on any failure (the traceback is printed, never raised).
    """
    data = {'subresource': 'addwebdir'}
    host = ad['CRAB_RestHost']
    uri = ad['CRAB_RestURInoAPI'] + '/task'
    data['workflow'] = ad['CRAB_ReqName']
    data['webdirurl'] = ad['CRAB_UserWebDir']
    cert = ad['X509UserProxy']
    try:
        from RESTInteractions import HTTPRequests
        import urllib
        server = HTTPRequests(host, cert, cert)
        server.post(uri, data=urllib.urlencode(data))
        return 0
    except Exception:
        # BUG FIX: was a bare 'except:' that also trapped SystemExit/KeyboardInterrupt;
        # the unused 'from httplib import HTTPException' import was dropped as well.
        import traceback
        print(traceback.format_exc())
        return 1
def updatewebdir(ad):
    """Post the task's user web directory URL to the REST 'task' resource.

    :param ad: classad-like mapping with CRAB_RestHost, CRAB_RestURInoAPI,
               CRAB_ReqName, CRAB_UserWebDir and X509UserProxy entries.
    :return: 0 on success, 1 on any failure (the traceback is printed, never raised).
    """
    data = {'subresource': 'addwebdir'}
    host = ad['CRAB_RestHost']
    uri = ad['CRAB_RestURInoAPI'] + '/task'
    data['workflow'] = ad['CRAB_ReqName']
    data['webdirurl'] = ad['CRAB_UserWebDir']
    cert = ad['X509UserProxy']
    try:
        from RESTInteractions import HTTPRequests
        import urllib
        server = HTTPRequests(host, cert, cert)
        server.post(uri, data=urllib.urlencode(data))
        return 0
    except Exception:
        # BUG FIX: was a bare 'except:' (catches SystemExit/KeyboardInterrupt too);
        # the unused 'from httplib import HTTPException' import was removed.
        import traceback
        print(traceback.format_exc())
        return 1
def updateWebDir(ad):
    """Send the task's user web directory URL to the REST 'task' resource.

    :param ad: classad-like mapping with the CRAB_* and X509UserProxy entries.
    :return: 0 on success, 1 when the POST fails with an HTTPException.
    """
    host = ad['CRAB_RestHost']
    uri = ad['CRAB_RestURInoAPI'] + '/task'
    workflow = ad['CRAB_ReqName']
    webdir = ad['CRAB_UserWebDir']
    cert = ad['X509UserProxy']
    payload = {'subresource': 'addwebdir', 'workflow': workflow, 'webdirurl': webdir}
    try:
        server = HTTPRequests(host, cert, cert)
        server.post(uri, data=urllib.urlencode(payload))
        return 0
    except HTTPException as hte:
        printLog(traceback.format_exc())
        printLog(hte.headers)
        printLog(hte.result)
        return 1
def updateWebDir(ad):
    """Publish ad['CRAB_UserWebDir'] as the task's web directory via the REST interface.

    Returns 0 if the POST succeeds, 1 on HTTPException (details go through printLog).
    """
    restHost = ad['CRAB_RestHost']
    taskUri = ad['CRAB_RestURInoAPI'] + '/task'
    request = dict(subresource='addwebdir', workflow=ad['CRAB_ReqName'], webdirurl=ad['CRAB_UserWebDir'])
    proxy = ad['X509UserProxy']
    try:
        server = HTTPRequests(restHost, proxy, proxy)
        server.post(taskUri, data=urllib.urlencode(request))
        return 0
    except HTTPException as hte:
        printLog(traceback.format_exc())
        printLog(hte.headers)
        printLog(hte.result)
        return 1
def updateWebDir(ad):
    """Upload the user web directory URL of this task to the REST interface.

    :return: 0 on success, 1 when the POST raises HTTPException.
    """
    host = ad["CRAB_RestHost"]
    uri = ad["CRAB_RestURInoAPI"] + "/task"
    data = {"subresource": "addwebdir",
            "workflow": ad["CRAB_ReqName"],
            "webdirurl": ad["CRAB_UserWebDir"]}
    cert = ad["X509UserProxy"]
    try:
        server = HTTPRequests(host, cert, cert)
        server.post(uri, data=urllib.urlencode(data))
        return 0
    except HTTPException as hte:
        printLog(traceback.format_exc())
        printLog(hte.headers)
        printLog(hte.result)
        return 1
def uploadWarning(self, warning, userProxy, taskname):
    """Truncate and upload a warning for the task; log-only when no server is configured."""
    if not self.server:
        # When testing, the server can be None
        self.logger.warning(warning)
        return
    payload = {'subresource': 'addwarning',
               'workflow': taskname,
               'warning': b64encode(truncateError(warning))}
    try:
        restServer = HTTPRequests(self.server['host'], userProxy, userProxy, retry=2, logger=self.logger)
        restServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(payload))
    except HTTPException as hte:
        self.logger.error("Error uploading warning: %s", str(hte))
        self.logger.warning("Cannot add a warning to REST interface. Warning message: %s", warning)
def __call__(self):
    """Send the resubmit request for the failed jobs of the cached task.

    :raises RESTCommunicationException: when the server answers with a non-200 status.
    """
    ## retrieving output files location from the server
    server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    self.logger.debug('Requesting resubmission for failed jobs in task %s' % self.cachedinfo['RequestName'])
    # Body is the workflow name plus the pre-encoded site white/black lists and job ids.
    data = urlencode({'workflow': self.cachedinfo['RequestName']}) + \
           self.sitewhitelist + self.siteblacklist + '&' + urlencode(self.jobids)
    dictresult, status, reason = server.post(self.uri, data=data)
    self.logger.debug("Result: %s" % dictresult)
    if status != 200:
        # BUG FIX: the original formatted 'inputdict', a name whose assignment was
        # commented out, so any non-200 answer raised NameError instead of the
        # intended exception. Report the data actually sent instead.
        msg = "Problem retrieving resubmitting the task to the server:\ninput:%s\noutput:%s\nreason:%s" % (str(data), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)
    self.logger.info("Resubmit request successfully sent")
    if dictresult['result'][0]['result'] != 'ok':
        self.logger.info(dictresult['result'][0]['result'])
def __call__(self):
    """Send the resubmit request, collecting any overridden resubmit parameters.

    :raises RESTCommunicationException: when the server answers with a non-200 status.
    """
    server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    self.logger.debug('Requesting resubmission for failed jobs in task %s' % self.cachedinfo['RequestName'])
    configreq = {'workflow': self.cachedinfo['RequestName']}
    for attr in ['maxmemory', 'maxjobruntime', 'numcores', 'priority']:
        val = getattr(self, attr, None)
        if val:
            configreq[attr] = val
    # NOTE(review): configreq collects the resubmit parameters but is not part of the
    # POST body below; it looks like it was meant to be urlencoded into 'data'.
    # Left unchanged to preserve the current wire format - confirm with the REST schema.
    dictresult, status, reason = server.post(self.uri, data=urlencode({'workflow': self.cachedinfo['RequestName']}) + \
                                             self.sitewhitelist + self.siteblacklist + '&' + urlencode(self.jobids))
    self.logger.debug("Result: %s" % dictresult)
    if status != 200:
        # BUG FIX: the original formatted the undefined name 'inputdict', so any
        # non-200 answer raised NameError instead of the intended exception.
        msg = "Problem retrieving resubmitting the task to the server:\ninput:%s\noutput:%s\nreason:%s" % (str(configreq), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)
    self.logger.info("Resubmit request successfully sent")
    if dictresult['result'][0]['result'] != 'ok':
        self.logger.info(dictresult['result'][0]['result'])
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer
        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def getLogging(quiet, debug):
            """Retrieves a logger and set the proper level
            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            # NOTE: reads self.TEST from the enclosing scope; it is assigned
            # before getLogging is called below.
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            # debug wins over quiet when both flags are set
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        # Resolve the REST host and base URI from the configured mode.
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        # Import TaskWorker.Actions.Recurring.<actionName> and instantiate the
        # class of the same name found inside that module.
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if the server endpoint is not available.
        Prints a log entry if answer is greater then 400:
         * the server call succeeded or
         * the server could not find anything to update or
         * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException, hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception, exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
class RetryManagerDaemon(BaseDaemon):
    """
    _RetryManagerPoller_

    Polls for Files in CoolOff State and attempts to retry them
    based on the requirements in the selected plugin
    """
    def __init__(self, config):
        """
        Initialise class members: the Oracle REST client (or couch connection)
        and the retry-algorithm plugin.
        """
        BaseDaemon.__init__(self, config, 'RetryManager')
        if self.config.isOracle:
            self.oracleDB = HTTPRequests(self.config.oracleDB,
                                         self.config.opsProxy,
                                         self.config.opsProxy)
        else:
            try:
                server = CouchServer(dburl=self.config.couch_instance,
                                     ckey=self.config.opsProxy,
                                     cert=self.config.opsProxy)
                self.db = server.connectDatabase(self.config.files_database)
            except Exception as e:
                self.logger.exception('A problem occured when connecting to couchDB: %s' % e)
                raise
            self.logger.debug('Connected to files DB')
        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.retryAlgoDir, namespace=self.config.retryAlgoDir)
        try:
            self.plugin = self.factory.loadObject(self.config.algoName, self.config,
                                                  getFromCache=False, listFlag=True)
        except Exception as ex:
            msg = "Error loading plugin %s on path %s\n" % (self.config.algoName, self.config.retryAlgoDir)
            msg += str(ex)
            self.logger.error(msg)
            raise RetryManagerException(msg)
        self.cooloffTime = self.config.cooloffTime

    def terminate(self, params):
        """
        Run one more time through, then terminate
        """
        logging.debug("Terminating. doing one more pass before we die")
        self.algorithm(params)

    def algorithm(self, parameters=None):
        """
        Performs the doRetries method, loading the appropriate
        plugin for each job and handling it.
        """
        logging.debug("Running retryManager algorithm")
        if self.config.isOracle:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'retryTransfers'
            fileDoc['time_to'] = self.cooloffTime
            self.logger.debug('fileDoc: %s' % fileDoc)
            try:
                results = self.oracleDB.post(self.config.oracleFileTrans,
                                             data=encodeRequest(fileDoc))
            except Exception:
                self.logger.exception("Failed to get retry transfers in oracleDB: %s")
                # BUG FIX: the original fell through and referenced the unbound
                # name 'results' below, raising NameError on every failed POST.
                return
            logging.info("Retried files in cooloff: %s" % str(results))
        else:
            self.doRetries()

    def processRetries(self, files):
        """
        _processRetries_

        Actually does the dirty work of figuring out what to do with jobs
        """
        if len(files) < 1:
            # We got no files?
            return
        propList = []
        fileList = self.loadFilesFromList(recList=files)
        logging.debug("Files in cooloff %s" % fileList)
        # Now we should have the files
        propList = self.selectFilesToRetry(fileList)
        logging.debug("Files to retry %s" % propList)
        now = str(datetime.datetime.now())
        for fileInfo in propList:  # renamed from 'file' to avoid shadowing the builtin
            # update couch
            self.logger.debug("Trying to resubmit %s" % fileInfo['id'])
            try:
                document = self.db.document(fileInfo['id'])
            except Exception as ex:
                msg = "Error loading document from couch"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if document['state'] != 'killed':
                data = dict()
                data['state'] = 'new'
                data['last_update'] = time.time()
                data['retry'] = now
                updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + fileInfo['id']
                updateUri += "?" + urllib.urlencode(data)
                try:
                    self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                except Exception as ex:
                    msg = "Error updating document in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                self.logger.debug("%s resubmitted" % fileInfo['id'])
            else:
                continue
        return

    def loadFilesFromList(self, recList):
        """
        _loadFilesFromList_

        Load jobs in bulk
        """
        all_files = []
        index = 0
        for record in recList:
            all_files.append({})
            all_files[index]['id'] = record['key']
            all_files[index]['state_time'] = record['value']
            index += 1
        return all_files

    def selectFilesToRetry(self, fileList):
        """
        _selectFilesToRetry_

        Select files to retry
        """
        result = []
        if len(fileList) == 0:
            return result
        for fileInfo in fileList:  # renamed from 'file' to avoid shadowing the builtin
            logging.debug("Current file %s" % fileInfo)
            try:
                if self.plugin.isReady(file=fileInfo, cooloffTime=self.cooloffTime):
                    result.append(fileInfo)
            except Exception as ex:
                msg = "Exception while checking for cooloff timeout for file %s\n" % fileInfo
                msg += str(ex)
                logging.error(msg)
                logging.debug("File: %s\n" % fileInfo)
                raise RetryManagerException(msg)
        return result

    def doRetries(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Discover files that are in cooloff
        query = {'stale': 'ok'}
        try:
            files = self.db.loadView('AsyncTransfer', 'getFilesToRetry', query)['rows']
        except Exception as e:
            self.logger.exception('A problem occured when contacting \
                couchDB to retrieve LFNs: %s' % e)
            return
        logging.info("Found %s files in cooloff" % len(files))
        self.processRetries(files)
def processWorker(inputs, results, resthost, resturi, procnum):
    """Wait for an reference to appear in the input queue, call the referenced object
    and write the output in the output queue.

    :arg Queue inputs: the queue where the inputs are shared by the master
    :arg Queue results: the queue where this method writes the output
    :return: default returning zero, but not really needed."""
    logger = setProcessLogger(str(procnum))
    logger.info("Process %s is starting. PID %s", procnum, os.getpid())
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        # 'STOP' is the sentinel the master puts in the queue to shut slaves down.
        if work == 'STOP':
            break
        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" % (procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            # Best effort: if the work failed, report the failure message to the REST
            # so the task is flagged with 'failstatus'; never let this upload kill the slave.
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s" % msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry=2)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': failstatus,
                                 'subresource': 'failure',
                                 #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                                 'failure': b64encode(truncMsg)}
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follows:" % task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s" % (task['tm_taskname'], exc))
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s" % (procName, task['tm_taskname'], t1 - t0, outputs))
        results.put({'workid': workid, 'out': outputs})
    logger.debug("Slave %s exiting." % procnum)
    return 0
def executeInternal(self, *args, **kwargs):
    """Discover the task input (and optional secondary) dataset through DBS.

    Looks up blocks and their disk locations, triggers a DDM tape recall when the
    data only has TAPE replicas, matches secondary-dataset files by lumi overlap,
    and returns the wmcore-formatted file/location structures.

    :return: Result wrapping the formatted file details.
    :raises TaskWorkerException: on DBS/PhEDEx errors or when no usable file is found.
    :raises TapeDatasetException: when a tape recall was successfully requested.
    """
    self.logger.info("Data discovery with DBS") ## to be changed into debug
    dbsurl = self.config.Services.DBSUrl
    if kwargs['task']['tm_dbs_url']:
        dbsurl = kwargs['task']['tm_dbs_url']
    self.dbs = DBSReader(dbsurl)
    self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]
    taskName = kwargs['task']['tm_taskname']
    self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)
    inputDataset = kwargs['task']['tm_input_dataset']
    secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)
    self.checkDatasetStatus(inputDataset, kwargs)
    if secondaryDataset:
        self.checkDatasetStatus(secondaryDataset, kwargs)
    try:
        # Get the list of blocks for the locations and then call dls.
        # The WMCore DBS3 implementation makes one call to dls for each block
        # with locations = True so we are using locations=False and looking up location later
        blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
        if secondaryDataset:
            secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
    except DBSReaderError as dbsexc:
        #dataset not found in DBS is a known use case
        # BUG FIX: str.find() returns -1 (truthy) when the text is absent, so the old
        # condition was true for almost every DBSReaderError; use membership instead.
        if 'No matching data' in str(dbsexc):
            # BUG FIX: the format string has two placeholders but the original passed
            # only inputDataset to '%' (TypeError) and dbsurl as a stray second
            # argument to the exception; format both values into the message.
            raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
        raise
    ## Create a map for block's locations: for each block get the list of locations.
    ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
    ## locations are found it gets the original locations from DBS. So it should
    ## never be the case at this point that some blocks have no locations.
    ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
    ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
    ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'}
    try:
        dbsOnly = self.dbsInstance.split('/')[1] != 'global'
        locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=dbsOnly)
    except Exception as ex: #TODO should we catch HttpException instead?
        self.logger.exception(ex)
        raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs or phedex.\n"+\
                                  "This is could be a temporary phedex/dbs glitch, please try to submit a new task (resubmit will not work)"+\
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    self.keepOnlyDisks(locationsMap)
    if not locationsMap:
        msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
        if self.otherLocations:
            msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.otherLocations))
            # submit request to DDM
            ddmRequest = blocksRequest(blocks, self.config.TaskWorker.DDMServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, verbose=False)
            self.logger.info("Contacted %s using %s and %s, got:\n%s", self.config.TaskWorker.DDMServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, ddmRequest)
            # The query above returns a JSON with a format {"result": "OK", "message": "Copy requested", "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37", "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
            if ddmRequest["result"] == "OK":
                msg += "\nA disk replica has been requested on %s" % ddmRequest["data"][0]["first_request"]
                # set status to TAPERECALL
                tapeRecallStatus = 'TAPERECALL'
                ddmReqId = ddmRequest["data"][0]["request_id"]
                server = HTTPRequests(url=self.config.TaskWorker.resturl, localcert=kwargs['task']['user_proxy'], localkey=kwargs['task']['user_proxy'], verbose=False)
                configreq = {'workflow': taskName,
                             'taskstatus': tapeRecallStatus,
                             'ddmreqid': ddmReqId,
                             'subresource': 'addddmreqid'}
                try:
                    # NOTE(review): other call sites join restURInoAPI and 'task' with a '/';
                    # confirm restURInoAPI ends with a slash here or this URI is malformed.
                    tapeRecallStatusSet = server.post(self.config.TaskWorker.restURInoAPI + 'task', data=urllib.urlencode(configreq))
                except HTTPException as hte:
                    msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (self.config.TaskWorker.resturl, str(hte))
                    msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (tapeRecallStatus, ddmReqId, taskName)
                    msg += "\nHTTP Headers are: %s" % hte.headers
                    raise TaskWorkerException(msg, retry=True)
                if tapeRecallStatusSet[2] == "OK":
                    self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus)
                    msg += " and the task will be submitted as soon as it is completed."
                    self.uploadWarning(msg, kwargs['task']['user_proxy'], taskName)
                    raise TapeDatasetException(msg)
            else:
                msg += ", please try again in two days."
        msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
        msg += " You might want to contact your physics group if you need a disk replica."
        raise TaskWorkerException(msg)
    if len(blocks) != len(locationsMap):
        self.logger.warning("The locations of some blocks have not been found: %s", set(blocks) - set(locationsMap))
    # will not need lumi info if user has asked for split by file with no run/lumi mask
    splitAlgo = kwargs['task']['tm_split_algo']
    lumiMask = kwargs['task']['tm_split_args']['lumis']
    runRange = kwargs['task']['tm_split_args']['runs']
    needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
    # secondary dataset access relies on run/lumi info
    if secondaryDataset:
        needLumiInfo = True
    if needLumiInfo:
        self.checkBlocksSize(blocks)
        if secondaryDataset:
            self.checkBlocksSize(secondaryBlocks)
    try:
        filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True, getLumis=needLumiInfo, validFileOnly=0)
        if secondaryDataset:
            moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False, getLumis=needLumiInfo, validFileOnly=0)
            # Precompute each secondary file's LumiList once, then use lumi overlap
            # to declare it a 'parent' of the primary files it shares lumis with.
            for secfilename, secinfos in moredetails.items():
                secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])
            self.logger.info("Beginning to match files from secondary dataset")
            for dummyFilename, infos in filedetails.items():
                infos['Parents'] = []
                lumis = LumiList(runsAndLumis=infos['Lumis'])
                for secfilename, secinfos in moredetails.items():
                    if (lumis & secinfos['lumiobj']):
                        infos['Parents'].append(secfilename)
            self.logger.info("Done matching files from secondary dataset")
            kwargs['task']['tm_use_parent'] = 1
    except Exception as ex: #TODO should we catch HttpException instead?
        self.logger.exception(ex)
        raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\
                                  "This is could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\
                                  " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    #TODO addo the nodes phedex so the user can check themselves
    if not filedetails:
        raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n"
                                   "Aborting submission. Resubmitting your task will not help.") %
                                  ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") % (self.dbsInstance, inputDataset))
    ## Format the output creating the data structures required by wmcore. Filters out invalid files,
    ## files whose block has no location, and figures out the PSN
    result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                               datasetfiles=filedetails, locations=locationsMap,
                               tempDir=kwargs['tempDir'])
    if not result.result:
        raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n"
                                   "Aborting submission. Resubmitting your task will not help.") %
                                  ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") % (self.dbsInstance, inputDataset))
    self.logger.debug("Got %s files", len(result.result.getFiles()))
    return result
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer
        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            # Log directories must exist before any file handler is attached.
            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                # Midnight-rotated, multiprocess-aware file handler for the daemon.
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                # debug wins over quiet when both flags are set
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False  # set by quit(); makes algorithm() leave its main loop
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        # Resolve REST host and base URI from the configured mode; MODEURL maps
        # mode name -> {'host': ..., 'instance': ...} (defined elsewhere in the module).
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException(
                "No mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            # 'host' is None for this mode: the host comes from the config instead.
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException(
                "No correct mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        # Certificate-authenticated client used for all server calls below.
        self.server = HTTPRequests(resthost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %
                          (str(self.config.TaskWorker.cmscert),
                           str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            # Default backoff schedule: 0, 40, 80, ... seconds, one entry per retry.
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        # Slave pool: a fake in-process pool under test, real worker processes otherwise.
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost,
                                     self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost,
                                 self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
        # Import TaskWorker.Actions.Recurring.<actionName> and instantiate the
        # class of the same name that the module defines.
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            self.server.post(self.restURInoAPI + '/workflowdb',
                             data=urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                str(hte), configreq)
            # "Required object is missing" with HTTP 400 just means no task was
            # available to acquire; anything else deserves the detailed report.
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                # No structured error header at all: dump the full traceback.
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
        # Deliberately always True: a failed lock must not kill the worker.
        return True

    def _getWork(self, limit, getstatus):
        # Fetch up to `limit` tasks in state `getstatus` for this worker.
        # Best effort: returns an empty list on any server error.
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb',
                                          data=configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" % (hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" % (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" % (getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" % (str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork

    def quit(self, code, traceback_):
        # Signal-handler entry point: request a graceful stop; algorithm()
        # finishes the current cycle and then exits its loop.
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True

    def updateWork(self, taskname, status):
        # Push a new task status to the server. Retries (after a randomized
        # 40-60s sleep) only on HTTP 503; every other failure is logged once
        # and gives up.
        configreq = {
            'workflow': taskname,
            'status': status,
            'subresource': 'state'
        }
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb',
                                 data=urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                    str(hte), configreq)
                msg += "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" % (hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
                msg += "%s \n" % (str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info("Sleeping %s seconds and will try to update again." % str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                    str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distribuiting it to the slave processes."""
        self.logger.debug("Starting")
        while (not self.STOP):
            for status, worktype, failstatus in states():
                # Acquire at most as many tasks as the slave pool can queue.
                limit = self.slaves.queueableTasks()
                if not self._lockWork(limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" % (len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" % (str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None) for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" % (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' % self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' % self.slaves.pendingTasks())
            time.sleep(self.config.TaskWorker.polling)
            finished = self.slaves.checkFinished()  # NOTE(review): return value is unused here
        self.logger.debug("Master Worker Exiting Main Cycle")
def publishInDBS3(config, taskname, verbose):
    """
    Publish output from one task in DBS

    :param config: Publisher configuration object (General / TaskPublisher sections)
    :param taskname: full CRAB task name; the username is parsed out of it
    :param verbose: bool, enable DEBUG logging and extra diagnostics
    :return: a status string ("EMPTY", "FAILED", "NOTHING TO DO") or 0 on completion
    """

    def mark_good(files, crabServer, logger):
        """
        Mark the list of files as tranferred
        """
        msg = "Marking %s file(s) as published." % len(files)
        logger.info(msg)
        nMarked = 0
        for lfn in files:
            data = {}
            source_lfn = lfn
            # the Oracle document id is a hash of the source LFN
            docId = getHashLfn(source_lfn)
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = docId
            data['list_of_publication_state'] = 'DONE'
            data['list_of_retry_value'] = 1
            data['list_of_failure_reason'] = ''
            if dryRun:  # dryRun is captured from the enclosing function
                logger.info("DryRun: skip marking good file")
            else:
                try:
                    result = crabServer.post(REST_filetransfers,
                                             data=encodeRequest(data))
                    logger.debug("updated DocumentId: %s lfn: %s Result %s",
                                 docId, source_lfn, result)
                except Exception as ex:
                    # best effort: log and continue with the next file
                    logger.error(
                        "Error updating status for DocumentId: %s lfn: %s",
                        docId, source_lfn)
                    logger.error("Error reason: %s", ex)
            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def mark_failed(files, crabServer, logger, failure_reason=""):
        """
        Something failed for these files so increment the retry count
        """
        msg = "Marking %s file(s) as failed" % len(files)
        logger.info(msg)
        nMarked = 0
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data = dict()
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = docId
            data['list_of_publication_state'] = 'FAILED'
            data['list_of_retry_value'] = 1
            data['list_of_failure_reason'] = failure_reason
            logger.debug("data: %s ", data)
            if dryRun:
                logger.debug("DryRun: skip marking failes files")
            else:
                try:
                    result = crabServer.post(REST_filetransfers,
                                             data=encodeRequest(data))
                    logger.debug("updated DocumentId: %s lfn: %s Result %s",
                                 docId, source_lfn, result)
                except Exception as ex:
                    logger.error(
                        "Error updating status for DocumentId: %s lfn: %s",
                        docId, source_lfn)
                    logger.error("Error reason: %s", ex)
            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def createLogdir(dirname):
        """ Create the directory dirname ignoring erors in case it exists. Exit if
            the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17: #ignore the "Directory already exists error"
                print(str(ose))
                print("The task worker need to access the '%s' directory" % dirname)
                sys.exit(1)

    taskFilesDir = config.General.taskFilesDir
    dryRun = config.TaskPublisher.dryRun
    # taskname format: <timestamp>:<username>_<...>; extract the username
    username = taskname.split(':')[1].split('_')[0]
    logdir = config.General.logsDir + '/tasks/' + username
    logfile = logdir + '/' + taskname + '.log'
    createLogdir(logdir)
    # one log file per task
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename=logfile,
                        level=logging.INFO,
                        format=config.TaskPublisher.logMsgFormat)
    if verbose:
        logger.setLevel(logging.DEBUG)
    logger.info("Getting files to publish")
    toPublish = []
    # TODO move from new to done when processed
    fname = taskFilesDir + taskname + ".json"
    with open(fname) as f:
        toPublish = json.load(f)
    if not toPublish:
        logger.info("Empty data file %s", fname)
        return "EMPTY"
    # destination storage site, taken from the first file to publish
    pnn = toPublish[0]["Destination"]
    # CRABServer REST API's (see CRABInterface)
    try:
        instance = config.General.instance
    except:
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)
    if instance in SERVICE_INSTANCES:
        logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        # 'other' means the endpoints are spelled out explicitly in the config
        logger.info('Will use restHost and dbInstance from config file')
        try:
            restHost = config.General.restHost
            dbInstance = config.General.dbInstance
        except:
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)
    restURInoAPI = '/crabserver/' + dbInstance
    logger.info('Will connect to CRAB Data Base via URL: https://%s/%s',
                restHost, restURInoAPI)
    # CRAB REST API's
    REST_filetransfers = restURInoAPI + '/filetransfers'
    REST_task = restURInoAPI + '/task'
    crabServer = HTTPRequests(url=restHost,
                              localcert=config.General.serviceCert,
                              localkey=config.General.serviceKey,
                              retry=3)
    # look up the task row to get input dataset and the DBS URLs
    data = dict()
    data['subresource'] = 'search'
    data['workflow'] = taskname
    try:
        results = crabServer.get(REST_task, data=encodeRequest(data))
    except Exception as ex:
        logger.error(
            "Failed to get acquired publications from oracleDB for %s: %s",
            taskname, ex)
        return "FAILED"
    if verbose:
        logger.info(results[0]['desc']['columns'])
    try:
        # results[0] holds parallel 'columns'/'result' lists; pick fields by name
        inputDatasetIndex = results[0]['desc']['columns'].index(
            "tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index(
            "tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]
        if not sourceURL.endswith("/DBSReader") and not sourceURL.endswith(
                "/DBSReader/"):
            sourceURL += "/DBSReader"
    except Exception:
        # NOTE(review): only logs; the variables above stay undefined on failure
        # and later use would raise NameError — consider returning "FAILED" here.
        logger.exception("ERROR")
    # When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')
    # DBS client relies on X509 env. vars
    os.environ['X509_USER_CERT'] = config.General.serviceCert
    os.environ['X509_USER_KEY'] = config.General.serviceKey
    logger.info("Source API URL: %s", sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL)
    logger.info("Global API URL: %s", globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL)
    # derive the read and migrate endpoints from the writer endpoint
    if publish_dbs_url.endswith('/DBSWriter'):
        publish_read_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSReader'
        publish_migrate_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSMigrate'
    else:
        publish_migrate_url = publish_dbs_url + '/DBSMigrate'
        publish_read_url = publish_dbs_url + '/DBSReader'
        publish_dbs_url += '/DBSWriter'
    try:
        logger.info("Destination API URL: %s", publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url)
        logger.info("Destination read API URL: %s", publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url)
        logger.info("Migration API URL: %s", publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url)
    except Exception:
        logger.exception('Wrong DBS URL %s', publish_dbs_url)
        return "FAILED"
    logger.info("inputDataset: %s", inputDataset)
    # fewer than 3 '/'-separated parts means no real input dataset (private MC)
    noInput = len(inputDataset.split("/")) <= 3
    # TODO: fix dbs dep
    if not noInput:
        try:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset,
                                                       detail=True,
                                                       dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(
                dataset=inputDataset)
        except Exception:
            logger.exception('Wrong DBS URL %s', publish_dbs_url)
            return "FAILED"
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (
                inputDataset)
            logger.error(msg)
            global_tag = 'crab3_tag'
        else:
            global_tag = existing_output[0]['global_tag']
    else:
        msg = "This publication appears to be for private MC."
        logger.info(msg)
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'
    acquisition_era_name = "CRAB"
    processing_era_config = {
        'processing_version': 1,
        'description': 'CRAB3_processing_era'
    }
    # dataset/block metadata taken from the first file to publish
    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    try:
        if toPublish[0]['acquisitionera'] and not toPublish[0][
                'acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
        else:
            acquisitionera = acquisition_era_name
    except Exception:
        acquisitionera = acquisition_era_name
    # outdataset is '/<primary>/<processed>/<tier>'
    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')
    primds_config = {
        'primary_ds_name': primName,
        'primary_ds_type': primary_ds_type
    }
    msg = "About to insert primary dataset: %s" % (str(primds_config))
    logger.debug(msg)
    if dryRun:
        logger.info("DryRun: skip insertPrimaryDataset")
    else:
        destApi.insertPrimaryDataset(primds_config)
        msg = "Successfully inserted primary dataset %s." % (primName)
        logger.info(msg)
    # bookkeeping for the final summary and the status updates at the end
    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []
    dataset = toPublish[0]['outdataset']
    # Find all (valid) files already published in this dataset.
    try:
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [
            f['logical_file_name'] for f in existingDBSFiles
            if f['is_file_valid']
        ]
        msg = "Dataset %s already contains %d files" % (dataset,
                                                        len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid),
                                             len(existingFiles) - len(existingFilesValid))
        logger.info(msg)
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        logger.error(msg)
        return "FAILED"
    # check if actions are needed
    workToDo = False
    for fileTo in toPublish:
        #print(existingFilesValid)
        if fileTo['lfn'] not in existingFilesValid:
            workToDo = True
            break
    if not workToDo:
        msg = "Nothing uploaded, %s has these files already or not enough files." % (
            dataset)
        logger.info(msg)
        return "NOTHING TO DO"
    acquisition_era_config = {
        'acquisition_era_name': acquisitionera,
        'start_date': 0
    }
    output_config = {
        'release_version': appVer,
        'pset_hash': pset_hash,
        'app_name': appName,
        'output_module_label': 'o',
        'global_tag': global_tag,
    }
    msg = "Published output config."
    logger.info(msg)
    dataset_config = {
        'dataset': dataset,
        'processed_ds_name': procName,
        'data_tier_name': tier,
        'dataset_access_type': 'VALID',
        'physics_group_name': 'CRAB3',
        'last_modification_date': int(time.time()),
    }
    msg = "About to insert dataset: %s" % (str(dataset_config))
    logger.info(msg)
    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()
    # Loop over all files to publish.
    for file_ in toPublish:
        if verbose:
            logger.info(file_)
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    parentFiles.add(parentFile)
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(
                        logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(
                            logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            localParentBlocks.add(blocksDict[0]['block_name'])
                        else:
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(
                                logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                                globalParentBlocks.add(
                                    blocksDict[0]['block_name'])
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                        parentsToSkip.add(parentFile)
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (
                        parentFile)
                    logger.info(msg)
                    if parentFile in file_['parents']:
                        file_['parents'].remove(parentFile)
            # Add this file to the list of files to be published.
            dbsFiles.append(format_file_3(file_))
            dbsFiles_f.append(file_)
            #print file
            published.append(file_['SourceLFN'])
            #published.append(file_['lfn'].replace("/store","/store/temp"))
    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (
        len(dbsFiles))
    logger.info(msg)
    # If there are no files to publish, continue with the next dataset.
    if not dbsFiles_f:
        msg = "Nothing to do for this dataset."
        logger.info(msg)
        return "NOTHING TO DO"
    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (
            sourceApi.url, localParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(
                taskname, migrateApi, destReadApi, sourceApi, inputDataset,
                localParentBlocks, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                failed.extend([f['SourceLFN'] for f in dbsFiles_f])
                #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
                failure_reason = failureMsg
                # NOTE(review): published/failed are lists; indexing them with the
                # `dataset` string would raise TypeError — likely leftover from a
                # dict-keyed version; should probably be `published`/`failed`.
                published = [
                    x for x in published[dataset] if x not in failed[dataset]
                ]
                return "NOTHING TO DO"
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (
            globalApi.url, globalParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(
                taskname, migrateApi, destReadApi, globalApi, inputDataset,
                globalParentBlocks, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                failed.extend([f['SourceLFN'] for f in dbsFiles_f])
                #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
                failure_reason = failureMsg
                # NOTE(review): same list-indexed-by-string issue as above.
                published = [
                    x for x in published[dataset] if x not in failed[dataset]
                ]
                return "NOTHING TO DO"
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are less than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to published.
    block_count = 0
    count = 0
    max_files_per_block = config.General.max_files_per_block
    while True:
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count + max_files_per_block]
        try:
            block_config = {
                'block_name': block_name,
                'origin_site_name': pnn,
                'open_for_writing': 0
            }
            if verbose:
                msg = "Inserting files %s into block %s." % (
                    [f['logical_file_name'] for f in files_to_publish],
                    block_name)
                logger.info(msg)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config,
                                        files_to_publish)
            #logger.debug("Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))
            if dryRun:
                logger.info("DryRun: skip insertBulkBlock")
            else:
                destApi.insertBulkBlock(blockDump)
            block_count += 1
        except Exception as ex:
            # a failed block fails every file of this task's publication attempt
            #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            logger.error("Error for files: %s", [f['lfn'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(msg)
            failure_reason = str(ex)
            # dump the offending block for post-mortem
            # NOTE(review): this rebinds `fname` (the task JSON path above);
            # and if createBulkBlock itself raised, `blockDump` is unbound here.
            fname = '/tmp/failed-block-at-%s.txt' % time.time()
            with open(fname, 'w') as fd:
                fd.write(blockDump)
            # NOTE(review): logs the builtin `file`, not the saved path — should be `fname`.
            logger.error("FAILING BLOCK SAVED AS %s", file)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count + max_files_per_block]
        # stop once the remaining tail is smaller than a full block; those
        # files are deferred to the next Publisher iteration
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend(
                [f["SourceLFN"] for f in files_to_publish_next])
            #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next])
            break
    published = [
        x for x in published if x not in failed + publish_in_next_iteration
    ]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(
        publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status for dataset %s:" % (dataset)
    msg += " failed %s" % len(failed)
    if verbose:
        msg += ": %s" % failed
    msg += ", published %s" % len(published)
    if verbose:
        msg += ": %s" % published
    msg += ", publish_in_next_iteration %s" % len(publish_in_next_iteration)
    if verbose:
        msg += ": %s" % publish_in_next_iteration
    msg += ", results %s" % (final)
    logger.info(msg)
    # Propagate the outcome back to the CRAB server database.
    try:
        if published:
            mark_good(published, crabServer, logger)
            data['workflow'] = taskname
            data['subresource'] = 'updatepublicationtime'
            crabServer.post(REST_task, data=encodeRequest(data))
        if failed:
            logger.debug("Failed files: %s ", failed)
            # failure_reason is only bound if some failure branch above ran;
            # that is the case whenever `failed` is non-empty
            mark_failed(failed, crabServer, logger, failure_reason)
    except Exception as ex:
        logger.exception("Status update failed: %s", ex)
    return 0
    def execute(self, *args, **kwargs):
        """Run the WMCore job splitting for one task.

        :arg args: args[0] is the Fileset of input files produced by data discovery
        :arg kwargs: kwargs['task'] is the task dictionary (tm_* fields)
        :return: Result wrapping (jobgroup factory, input data)
        :raises TaskWorkerException: when splitting yields 0 jobs, too many jobs,
            or the automatic-splitting runtime target is below the minimum."""
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])
        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
        data = args[0]
        # tm_split_args is used directly as the splitter parameter dict,
        # so the mutations below also update the task dictionary in place
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            totalUnits = kwargs['task']['tm_totalunits']
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                if totalUnits < 1.0:
                    # fractional totalUnits means "this fraction of the input", rounded
                    totalUnits = int(totalUnits * len(data.getFiles()) + 0.5)
                splitparam['total_files'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                if totalUnits < 1.0:
                    totalUnits = int(totalUnits * sum(
                        len(run.lumis) for f in data.getFiles()
                        for run in f['runs']) + 0.5)
                splitparam['total_lumis'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
                if totalUnits < 1.0:
                    totalUnits = int(totalUnits *
                                     sum(f['events']
                                         for f in data.getFiles()) + 0.5)
                splitparam['total_events'] = totalUnits
            elif kwargs['task']['tm_split_algo'] == 'Automatic':
                # REST backwards compatibility fix
                if 'seconds_per_job' in kwargs['task']['tm_split_args']:
                    kwargs['task']['tm_split_args'][
                        'minutes_per_job'] = kwargs['task'][
                            'tm_split_args'].pop('seconds_per_job')
                # the probe stage of automatic splitting is run as FileBased,
                # spreading the input evenly over numProbes probe jobs
                splitparam['algorithm'] = 'FileBased'
                splitparam['total_files'] = len(data.getFiles())
                numProbes = getattr(self.config.TaskWorker,
                                    'numAutomaticProbes', 5)
                splitparam['files_per_job'] = (len(data.getFiles()) +
                                               numProbes - 1) // numProbes
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task'][
                    'tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task'][
                    'tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task'][
                    'tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        wmsubs = Subscription(
            fileset=data,
            workflow=wmwork,
            split_algo=splitparam['algorithm'],
            type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        try:
            splitter = SplitterFactory()
            jobfactory = splitter(subscription=wmsubs)
            factory = jobfactory(**splitparam)
            numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
        except RuntimeError:
            # assumes the splitter raises RuntimeError only on the job-limit
            # overflow — any other RuntimeError is mis-reported with this message
            msg = "The splitting on your task generated more than {0} jobs (the maximum).".format(
                maxJobs)
            raise TaskWorkerException(msg)
        if numJobs == 0:
            msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
            msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
            if kwargs['task']['tm_input_dataset']:
                msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
            msg += " with %s method does not generate any job" % (
                kwargs['task']['tm_split_algo'])
            raise TaskWorkerException(msg)
        elif numJobs > maxJobs:
            raise TaskWorkerException(
                "The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s"
                % (numJobs, maxJobs))
        minRuntime = getattr(self.config.TaskWorker, 'minAutomaticRuntimeMins',
                             180)
        # 'tm_split_algo' is still 'Automatic' here even though splitparam was
        # switched to FileBased above; minutes_per_job was normalized earlier
        if kwargs['task']['tm_split_algo'] == 'Automatic' and \
           kwargs['task']['tm_split_args']['minutes_per_job'] < minRuntime:
            msg = "Minimum runtime requirement for automatic splitting is {0} minutes.".format(
                minRuntime)
            raise TaskWorkerException(msg)
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s",
                lumiChecker.splitLumiFiles.keys())
            #TODO use self.uploadWarning
            try:
                # best-effort warning upload with the user's proxy; failures
                # must not abort the splitting result
                userServer = HTTPRequests(self.server['host'],
                                          kwargs['task']['user_proxy'],
                                          kwargs['task']['user_proxy'],
                                          retry=2,
                                          logger=self.logger)
                configreq = {
                    'subresource':
                        'addwarning',
                    'workflow':
                        kwargs['task']['tm_taskname'],
                    'warning':
                        b64encode(
                            'The CRAB3 server backend detected lumis split across files in the input dataset.'
                            ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.'
                        )
                }
                userServer.post(self.restURInoAPI + '/task',
                                data=urllib.urlencode(configreq))
            except HTTPException as hte:
                self.logger.error(hte.headers)
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")
        return Result(task=kwargs['task'], result=(factory, args[0]))
class FileTransfersTest(unittest.TestCase):
    """
    _DashboardAPITest_

    Integration tests for the FileTransfers REST API.

    Requires a live CRAB REST server reachable at ``SERVER_HOST`` and a valid
    proxy at ``X509_USER_PROXY`` (both read from the environment); these are
    not unit tests in the isolated sense.
    """
    def setUp(self):
        """
        Setup for unit tests.

        Builds an HTTP client against the test server and a template file
        document (``self.fileDoc``) whose 'OVERWRITE' fields are filled in
        per-file inside the test.
        """
        self.server = HTTPRequests(os.environ['SERVER_HOST'], os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'])
        # LFN template: user, workflow, job index, random suffix
        self.lfnBase = '/store/temp/user/%s/my_cool_dataset-%s/file-%s-%s.root'
        self.fileDoc = {'id': 'OVERWRITE',
                        'username': '******',
                        'taskname': 'OVERWRITE',
                        'start_time': 0,
                        'destination': 'T2_CH_CERN',
                        'destination_lfn': 'OVERWRITE',
                        'source': 'T2_US_Caltech',
                        'source_lfn': 'OVERWRITE',
                        'filesize': random.randint(1, 9999),
                        'publish': 1,
                        'transfer_state': 'OVERWRITE',
                        'publication_state': 'OVERWRITE',
                        'job_id': 1,
                        'job_retry_count': 0,
                        'type': 'log',
                        'rest_host': 'cmsweb.cern.ch',
                        'rest_uri': '/crabserver/prod/'}
        self.ids = []
        self.users = ['jbalcas', 'mmascher', 'dciangot', 'riahi', 'erupeika', 'sbelforte']  # just random users for tests
        # per-user bookkeeping filled during the PUT phase, keyed by username
        self.tasks = {}
        self.totalFiles = 10

    def testFileTransferPUT(self):
        """
        _testFileTransferPUT_

        Just test simple testFileTransferPUT with fake data:
        insert files for several users, query their status, then exercise the
        three kill paths (whole task, one id at a time, list of ids).
        """
        # We just sent fake data which is not monitored by dashboard.
        # Also only the first time to decide is publication ON or NOT
        for user in self.users:
            timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime())
            for i in range(self.totalFiles):
                now = int(time.time())
                # Generate a taskname; the workflow name and publication state
                # are chosen once per user (first iteration) and reused after.
                workflowName = ""
                taskname = ""
                if user not in self.tasks:
                    workflowName = "".join([random.choice(string.ascii_lowercase) for _ in range(20)]) + "_" + str(now)
                    publicationState = random.choice(['NEW', 'NOT_REQUIRED'])
                else:
                    workflowName = self.tasks[user]['workflowName']
                    publicationState = self.tasks[user]['publication']
                transferState = random.choice(['NEW', 'DONE'])
                taskname = generateTaskName(user, workflowName, timestamp)
                finalLfn = self.lfnBase % (user, workflowName, i, random.randint(1, 9999))
                idHash = getHashLfn(finalLfn)
                self.fileDoc['id'] = idHash
                self.fileDoc['job_id'] = i
                self.fileDoc['username'] = user
                self.fileDoc['taskname'] = taskname
                self.fileDoc['start_time'] = int(time.time())
                self.fileDoc['source_lfn'] = finalLfn
                self.fileDoc['destination_lfn'] = finalLfn
                self.fileDoc['transfer_state'] = transferState
                self.fileDoc['publication_state'] = publicationState
                print(self.fileDoc)
                self.server.put('/crabserver/dev/fileusertransfers', data=encodeRequest(self.fileDoc))
                # if I will put the same doc twice, it should raise an error.
                # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc))
                # This tasks are for the future and next calls
                if user not in self.tasks:
                    self.tasks[user] = {'workflowName': workflowName, 'taskname': taskname, 'listOfIds': [],
                                        'publication': publicationState, 'toTransfer': 0, 'toPublish': 0, 'total': self.totalFiles}
                if self.tasks[user]['publication'] == 'NEW':
                    self.tasks[user]['toPublish'] += 1
                if transferState == 'NEW':
                    self.tasks[user]['toTransfer'] += 1
                self.tasks[user]['listOfIds'].append(idHash)
        # This should raise an error
        for username in self.tasks:
            taskname = self.tasks[username]['taskname']
            for query in ['getTransferStatus', 'getPublicationStatus']:
                result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': query,
                                                                                                 'username': username,
                                                                                                 'taskname': taskname}))
                print(result)
                print(result[0]['result'])
                taskInfoDict = oracleOutputMapping(result, 'id')
                print(taskInfoDict)
                for key, docDict in taskInfoDict.items():
                    # fetch each document individually by its id
                    result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'getById', 'id': key}))
        randomUsers = random.sample(set(self.users), 3)  # Take half of the users and kill their transfers for specific task
        for username in randomUsers:
            taskname = self.tasks[username]['taskname']
            result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfers',
                                                                                              'username': username,
                                                                                              'taskname': taskname}))
            print(result)
        # oneUser is left for killing a list of IDs
        # leftUsers will be killing transfers one by one for specific id.
        leftUsers = list(set(self.users) - set(randomUsers))
        oneUser = random.sample(set(leftUsers), 1)
        leftUsers = list(set(leftUsers) - set(oneUser))
        for username in leftUsers:
            # First get all left ids for this users
            result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'getTransferStatus',
                                                                                             'username': username,
                                                                                             'taskname': self.tasks[username]['taskname']}))
            resultOut = oracleOutputMapping(result, None)
            print("**"*50)
            for outDict in resultOut:
                print(outDict)
                result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfersById',
                                                                                                  'username': username,
                                                                                                  'listOfIds': outDict['id']}))
                print(result)
            print(resultOut)
            print(result)
        for username in oneUser:
            # kill all remaining ids for this user in a single call; the extra
            # ['listOfIds'] argument tells encodeRequest to encode that key as a list
            result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfersById',
                                                                                              'username': username,
                                                                                              'listOfIds': self.tasks[username]['listOfIds']}, ['listOfIds']))
            # As it asks to kill all which are in new, need to double check what we submitted before and if the output of killed is correct
            print(result)
            print(self.tasks[username])
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def getLogging(quiet, debug):
            """Retrieves a logger and set the proper level

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                # rotate the TaskWorker log at midnight
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            # debug wins over quiet when both flags are set
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        restinstance = None
        # default REST endpoint; 'prod' is replaced below by the configured instance
        self.resturl = '/crabserver/prod/workflowdb'
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            restinstance = MODEURL[self.config.TaskWorker.mode]['host']
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        else:
            # host not fixed by the mode: take it from the configuration
            restinstance = self.config.TaskWorker.resturl
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        if self.resturl is None or restinstance is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        # REST client authenticated with the TaskWorker host certificate/key
        self.server = HTTPRequests(restinstance, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey)
        self.logger.debug("Hostcert: %s, hostkey: %s" %(str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            # default backoff: 0, 40, 80, ... seconds per retry
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, restinstance, self.resturl)
        else:
            self.slaves = Worker(self.config, restinstance, self.resturl)
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        """Import TaskWorker.Actions.Recurring.<actionName> and return an
        instance of the class of the same name defined in that module."""
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the
        worker to die if the server endpoint is not available.
        Prints a log entry if answer is greater than 400:
         * the server call succeeded or
         * the server could not find anything to update or
         * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.resturl, data = urllib.urlencode(configreq))
        except HTTPException, hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            # "object missing" + 400 just means there was no task to acquire
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" %(str(traceback.format_exc()))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception, exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    """Worker-process main loop: pull (workid, work, task, failstatus, inputargs)
    tuples from the ``inputs`` queue, execute ``work`` and put the outcome on the
    ``results`` queue. A literal 'STOP' work item (or EOF/IO error on the queue)
    terminates the loop. On failure the error message is uploaded to the REST."""
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            # route this task's log lines to a per-task log file
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            # msg stays None on success; a non-None msg triggers the REST upload below
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                # best effort: report the failure to the REST, never crash the loop
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry = 20, logger = logger)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': failstatus,
                                 'subresource': 'failure',
                                 #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                                 'failure': b64encode(truncMsg)}
                    server.post(resturi, data = urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follows:", task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc: #pylint: disable=broad-except
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s", task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName, task['tm_taskname'], t1-t0, outputs)
        try:
            # log this process' resident memory (RSS, MB) after each task
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except:
            logger.exception("Problem getting worker RSS:")
        removeTaskLogHandler(logger, taskhandler)
        results.put({
            'workid': workid,
            'out' : outputs
        })
def monitor(user, taskname, log):
    """ function monitoring the Rucio replica locks of a rule
    and updating db statuses accordingly

    :param user: user HN name
    :type user: str
    :param taskname: CRAB taskname
    :type taskname: str
    :param log: log object
    :type log: logging
    """
    os.environ["X509_CERT_DIR"] = os.getcwd()

    # REST endpoint and proxy location are dropped by the first post-job
    proxy = None
    if os.path.exists('task_process/rest_filetransfers.txt'):
        with open("task_process/rest_filetransfers.txt", "r") as _rest:
            rest_filetransfers = _rest.readline().split('\n')[0]
            proxy = os.getcwd() + "/" + _rest.readline()
            log.info("Proxy: %s", proxy)
            os.environ["X509_USER_PROXY"] = proxy

    if not proxy:
        log.info('No proxy available yet - waiting for first post-job')
        return None

    # Prepare user and task info for monitoring
    scope = "user." + user
    name = taskname

    log.info("Initializing Monitor Rucio client for %s", taskname)
    crabInj = CRABDataInjector("", "", scope=scope, account=user, auth_type='x509_proxy')

    id_map = {}
    lfn_map = {}
    source_rse = {}

    # create maps for lfn --> oracle id, source rse
    if os.path.exists('task_process/transfers.txt'):
        with open('task_process/transfers.txt', 'r') as _list:
            for _data in _list.readlines():
                try:
                    doc = json.loads(_data)
                    id_map.update({doc['destination_lfn']: doc['id']})
                    lfn_map.update({doc['id']: doc['destination_lfn']})
                    source_rse.update({doc['destination_lfn']: doc['source'] + "_Temp"})
                except Exception:
                    # skip malformed lines
                    continue
    if os.path.exists('task_process/transfers_direct.txt'):
        with open('task_process/transfers_direct.txt', 'r') as _list:
            for _data in _list.readlines():
                try:
                    doc = json.loads(_data)
                    id_map.update({doc['destination_lfn']: doc['id']})
                    lfn_map.update({doc['id']: doc['destination_lfn']})
                except Exception:
                    continue

    # get the rule for this rucio dataset
    try:
        rules_ = crabInj.cli.list_did_rules(scope, name)
        list_good = []
        list_failed = []
        list_failed_tmp = []
        list_stuck = []
        list_update = []
        # only one rule is expected for the task dataset
        rules = next(rules_)
        log.debug("RULES %s", rules)
    except Exception:
        log.exception("Failed to retrieve rule information")
        return

    locks_generator = None

    # get replica locks and monitor status
    try:
        locks_generator = crabInj.cli.list_replica_locks(rules['id'])
    except Exception:
        if rules['state'] == 'STUCK':
            # no per-replica locks available: mark every transfer of the rule as stuck
            transfers = crabInj.cli.examine_replication_rule(rules['id'])['transfers']
            for lfn in transfers:
                list_stuck.append((lfn['name'], 'Rule STUCK.'))
        else:
            log.exception('Unable to get replica locks')
            return

    # analyze replica locks info for each file
    # NOTE: locks_generator may still be None here (STUCK-rule branch above);
    # iterate an empty sequence in that case instead of raising TypeError.
    sitename = None
    # TODO: should we split in threads ?
    for file_ in (locks_generator or []):
        log.debug("LOCK %s", file_)
        filename = file_['name']
        status = file_['state']
        log.info("state %s", status)
        sitename = file_['rse']
        if status == "OK":
            list_good.append(filename)
        if status == "STUCK":
            list_failed_tmp.append((filename, "Transfer Stuck", sitename))
        if status == "REPLICATING":
            try:
                ftsJobID = crabInj.cli.list_request_by_did(filename, sitename, scope)["external_id"]
                if ftsJobID:
                    list_update.append((filename, ftsJobID))
            except Exception:
                log.exception("Replica lock not found")

    # Expose FTS job ID in case of failure (if available)
    # NOTE(review): sitename here is the rse of the *last* lock iterated,
    # not necessarily the rse of each stuck file — confirm intended.
    for name_ in [x[0] for x in list_failed_tmp]:
        try:
            ftsJobID = crabInj.cli.list_request_by_did(name_, sitename, scope)["external_id"]
            if ftsJobID:
                list_failed.append((name_, "FTS job ID: %s" % ftsJobID))
            else:
                list_failed.append((name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job."))
        except Exception:
            log.error("No FTS job ID available for stuck transfer %s. Rucio could have failed to submit FTS job." % name_)
            list_failed.append((name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job."))

    # Filter out files already staged directly from the wn
    direct_files = []
    if os.path.exists('task_process/transfers/registered_direct_files.txt'):
        with open("task_process/transfers/registered_direct_files.txt", "r") as list_file:
            direct_files = [x.split('\n')[0] for x in list_file.readlines()]
            log.debug("Checking if some failed files were directly staged from wn: {0}".format(str(direct_files)))
            list_failed = [x for x in list_failed if x[0] not in direct_files]
            log.debug("{0} files to be marked as failed.".format(str(len(list_failed))))

    try:
        oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy)
    except Exception:
        log.exception("Failed to set connection to oracleDB")
        return

    # Mark FAILED files on the DB and remove them from dataset and rucio replicas
    try:
        if len(list_failed) > 0:
            list_failed_name = [{'scope': scope, 'name': x[0]} for x in list_failed]
            log.debug("Detaching %s" % list_failed_name)
            crabInj.cli.detach_dids(scope, name, list_failed_name)
            sources = list(set([source_rse[x['name']] for x in list_failed_name]))
            for source in sources:
                to_delete = [x for x in list_failed_name if source_rse[x['name']] == source]
                log.debug("Deleting %s from %s" % (to_delete, source))
                crabInj.delete_replicas(source, to_delete)
            mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], oracleDB)
    except ReplicaNotFound:
        # replicas already gone: still record the failure on the DB
        try:
            mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], oracleDB)
        except Exception:
            log.exception("Failed to update status for failed files")
    except Exception:
        log.exception("Failed to update status for failed files")

    # Mark files of STUCK rules on the DB and remove them from dataset and rucio replicas
    try:
        if len(list_stuck) > 0:
            list_stuck_name = [{'scope': scope, 'name': x[0]} for x in list_stuck]
            log.debug("Detaching %s" % list_stuck_name)
            crabInj.cli.detach_dids(scope, name, list_stuck_name)
            sources = list(set([source_rse[x['name']] for x in list_stuck_name]))
            for source in sources:
                to_delete = [x for x in list_stuck_name if source_rse[x['name']] == source]
                log.debug("Deleting %s from %s" % (to_delete, source))
                crabInj.delete_replicas(source, to_delete)
            mark_failed([id_map[x[0]] for x in list_stuck], [x[1] for x in list_stuck], oracleDB)
    except ReplicaNotFound:
        # replicas already gone: still record the failure on the DB
        # (fixed: this fallback previously used list_failed instead of list_stuck)
        try:
            mark_failed([id_map[x[0]] for x in list_stuck], [x[1] for x in list_stuck], oracleDB)
        except Exception:
            log.exception("Failed to update status for failed files")
    except Exception:
        log.exception("Failed to update status for stuck rule")

    # Mark successful transfers as done on oracle DB
    try:
        mark_transferred([id_map[x] for x in list_good], oracleDB)
    except Exception:
        log.exception("Failed to update status for transferred files")

    try:
        already_list = []
        list_update_filt = []

        # Keep track of what has been already marked. Avoiding double updates at next iteration
        if os.path.exists("task_process/transfers/submitted_files.txt"):
            with open("task_process/transfers/submitted_files.txt", "r") as list_file:
                for _data in list_file.readlines():
                    already_list.append(_data.split("\n")[0])

        list_update_filt = [x for x in list_update if x not in already_list and x[0] not in direct_files]

        # Insert FTS job ID in oracle DB for all the available tranfers
        if len(list_update_filt) > 0:
            list_update = list_update_filt
            fileDoc = dict()
            fileDoc['asoworker'] = 'rucio'
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = [id_map[x[0]] for x in list_update]
            fileDoc['list_of_transfer_state'] = ["SUBMITTED" for _ in list_update]
            fileDoc['list_of_fts_instance'] = ['https://fts3.cern.ch:8446/' for _ in list_update]
            fileDoc['list_of_fts_id'] = [x[1] for x in list_update]

            oracleDB.post('/filetransfers', data=encodeRequest(fileDoc))
            log.debug("Marked submitted %s" % [id_map[x[0]] for x in list_update])

            with open("task_process/transfers/submitted_files.txt", "a+") as list_file:
                for update in list_update:
                    log.debug("{0}\n".format(str(update)))
                    list_file.write("{0}\n".format(str(update)))
        else:
            log.info("Nothing to update (fts job ID)")
    except Exception:
        log.exception('Failed to update file status for FTSJobID inclusion.')
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def getLogging(quiet, debug):
            """Retrieves a logger and set the proper level

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                # rotate the TaskWorker log at midnight
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            # debug wins over quiet when both flags are set
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        restinstance = None
        # default REST endpoint; 'prod' is replaced below by the configured instance
        self.resturl = '/crabserver/prod/workflowdb'
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            restinstance = MODEURL[self.config.TaskWorker.mode]['host']
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        else:
            # host not fixed by the mode: take it from the configuration
            restinstance = self.config.TaskWorker.resturl
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        if self.resturl is None or restinstance is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        # REST client authenticated with the TaskWorker host certificate/key
        self.server = HTTPRequests(restinstance, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, version=__version__)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        if self.TEST:
            self.slaves = TestWorker(self.config, restinstance, self.resturl)
        else:
            self.slaves = Worker(self.config, restinstance, self.resturl)
        self.slaves.begin()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the
        worker to die if the server endpoint is not available.
        Prints a log entry if answer is greater than 400:
         * the server call succeeded or
         * the server could not find anything to update or
         * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.resturl, data = urllib.urlencode(configreq))
        except HTTPException, hte:
            # "object missing" + 400 just means there was no task to acquire
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                self.logger.error("Server could not acquire any work from the server: \n" +
                                  "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +
                                  "\treason: %s" %(hte.headers.get('X-Error-Detail', 'unknown')))
                self.logger.error("Probably no task to be processed")
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " %(str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %(getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %(getattr(hte, 'result', 'unknown')))
        except Exception, exc:
            self.logger.error("Server could not process the request: %s" %(str(exc)))
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    """Worker-process main loop: pull (workid, work, task, failstatus, inputargs)
    tuples from the ``inputs`` queue, execute ``work`` and put the outcome on the
    ``results`` queue. A literal 'STOP' work item (or EOF/IO error on the queue)
    terminates the loop. On failure the error message is uploaded to the REST;
    a TapeDatasetException is recorded in the Result but not uploaded."""
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            # route this task's log lines to a per-task log file
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            # msg stays None on success; a non-None msg triggers the REST upload below
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            # expected condition: no error upload, just report it in the Result
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                # best effort: report the failure to the REST, never crash the loop
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry=20, logger=logger)
                    truncMsg = truncateError(msg)
                    configreq = {
                        'workflow': task['tm_taskname'],
                        'status': failstatus,
                        'subresource': 'failure',
                        #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                        'failure': b64encode(truncMsg)
                    }
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follows:", task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc: #pylint: disable=broad-except
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s", task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName, task['tm_taskname'], t1 - t0, outputs)
        try:
            # log this process' resident memory (RSS, MB) after each task
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except:
            logger.exception("Problem getting worker RSS:")
        removeTaskLogHandler(logger, taskhandler)
        results.put({'workid': workid, 'out': outputs})
class TransferWorker:
    """
    Submit user transfers to FTS.

    One instance handles a single (user, group, role) triplet: it acquires the
    user's transfer documents (from Oracle or CouchDB depending on
    config.isOracle), groups them per source/destination link, submits one FTS
    job per link and records the submission state plus a Dashboard report.
    """

    def __init__(self, user, tfc_map, config):
        """
        Store the user transfer info and retrieve the user proxy.

        :param user: (username, group, role) triplet
        :param tfc_map: dict mapping a CMS site name to its TFC object
                        (used by apply_tfc_to_lfn to turn LFNs into PFNs)
        :param config: ASO component configuration
        """
        self.user = user[0]
        self.group = user[1]
        self.role = user[2]
        self.tfc_map = tfc_map
        self.config = config
        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.logger = logging.getLogger('AsyncTransfer-Worker-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.pfn_to_lfn_mapping = {}
        self.max_retry = config.max_retry
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.submission_command = getattr(self.config, 'submission_command', 'glite-transfer-submit')
        self.cleanEnvironment = ''
        self.userDN = ''
        # self.init is the "constructor succeeded" flag checked by the caller.
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        self.logger.debug("Trying to get DN for %s" % self.user)
        try:
            self.userDN = getDNFromUserName(self.user, self.logger,
                                            ckey=self.config.opsProxy, cert=self.config.opsProxy)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return

        defaultDelegation = {'logger': self.logger,
                             'credServerPath': self.config.credentialDir,
                             # It will be moved to be taken from couchDB
                             'myProxySvr': 'myproxy.cern.ch',
                             'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                             'serverDN': self.config.serverDN,
                             'uisource': self.uiSetupScript,
                             'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}

        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace=self.config.pluginDir)
        self.commandTimeout = 1200
        try:
            if self.config.isOracle:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             self.config.opsProxy,
                                             self.config.opsProxy)
            else:
                server = CouchServer(dburl=self.config.couch_instance,
                                     ckey=self.config.opsProxy,
                                     cert=self.config.opsProxy)
                self.db = server.connectDatabase(self.config.files_database)
                config_server = CouchServer(dburl=self.config.config_couch_instance,
                                            ckey=self.config.opsProxy,
                                            cert=self.config.opsProxy)
                self.config_db = config_server.connectDatabase(self.config.config_database)
                self.fts_server_for_transfer = getFTServer("T1_UK_RAL", 'getRunningFTSserver',
                                                           self.config_db, self.logger)
        except Exception:
            self.logger.exception('Failed to contact DB')

        self.cache_area = ""
        if hasattr(self.config, "cache_area"):
            self.cache_area = self.config.cache_area
        if not self.config.isOracle:
            # In Couch mode the per-user cache area overrides the configured one.
            query = {'key': self.user}
            try:
                self.user_cache_area = self.db.loadView('DBSPublisher', 'cache_area', query)['rows']
                self.cache_area = "https://" + self.user_cache_area[0]['value'][0] + \
                                  self.user_cache_area[0]['value'][1] + "/filemetadata"
            except Exception:
                self.logger.exception("Error getting user cache_area.")
        try:
            defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(self.cache_area)[0]
        except IndexError:
            self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey

        # Fall back on the operator proxy until a valid user proxy is retrieved.
        self.valid_proxy = False
        self.user_proxy = self.config.opsProxy
        try:
            defaultDelegation['userDN'] = self.userDN
            defaultDelegation['group'] = self.group
            defaultDelegation['role'] = self.role
            self.logger.debug('delegation: %s' % defaultDelegation)
            self.valid_proxy, self.user_proxy = getProxy(defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
        self.context = dict()

    def __call__(self):
        """
        a. makes the REST FTS job
        b. submits to FTS
        c. updates status and creates the dropbox json
        """
        # BUG FIX: `jobs` was previously unbound if delegation raised before
        # files_for_transfer() returned, causing a NameError below.
        jobs, jobs_lfn, jobs_pfn, jobs_report = {}, {}, {}, {}
        if self.user_proxy:
            try:
                self.context = fts3.Context(self.fts_server_for_transfer,
                                            self.user_proxy, self.user_proxy, verify=True)
                self.logger.debug(fts3.delegate(self.context, lifetime=timedelta(hours=48), force=False))
                jobs, jobs_lfn, jobs_pfn, jobs_report = self.files_for_transfer()
            except Exception:
                self.logger.exception('delegation failed')
            self.logger.debug("Processing files for %s " % self.user_proxy)
            if jobs:
                self.command(jobs, jobs_lfn, jobs_pfn, jobs_report)
        else:
            self.logger.debug("User proxy of %s could not be delagated! Trying next time." % self.user)
        self.logger.info('Transfers completed')
        return

    def source_destinations_by_user(self):
        """
        Get all the (source, destination) links with files to transfer for this user.

        :return: Oracle mode: (links, docs) where links is a deduplicated sorted
                 list of [source, destination] pairs and docs the raw acquired
                 transfer documents; Couch mode: just the list of pairs.
        """
        if self.config.isOracle:
            self.logger.debug('Running acquiredTransfers query... ' + self.user)
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquiredTransfers'
            fileDoc['grouping'] = 1
            fileDoc['username'] = self.user
            # BUG FIX: the original only assigned group/role when they were the
            # empty string, leaving them unbound (NameError) otherwise.
            group = None if self.group == '' else self.group
            role = None if self.role == '' else self.role
            fileDoc['vogroup'] = group
            fileDoc['vorole'] = role
            fileDoc['limit'] = self.config.max_files_per_transfer
            result = []
            self.logger.debug('Request: ' + str(fileDoc))
            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                result = oracleOutputMapping(results)
                res = [[x['source'], x['destination']] for x in result]
                res.sort()
                # Deduplicate consecutive identical pairs (list is sorted).
                res = list(k for k, _ in itertools.groupby(res))
            except Exception as ex:
                self.logger.error("Failed to get acquired transfers from oracleDB: %s" % ex)
                return [], {}
            return res, result
        else:
            query = {'group': True,
                     'startkey': [self.user, self.group, self.role],
                     'endkey': [self.user, self.group, self.role, {}, {}]}
            try:
                sites = self.db.loadView(self.config.ftscp_design, 'ftscp_all', query)
            except Exception:
                return []
            return [[x[4], x[3]] for x in sites['rows']]

    def files_for_transfer(self):
        """
        Process a queue of work per transfer source:destination for a user.
        Return one job per source:destination link plus the matching LFN/PFN
        lists and Dashboard report tuples.
        """
        docs = []
        if self.config.isOracle:
            source_dests, docs = self.source_destinations_by_user()
        else:
            source_dests = self.source_destinations_by_user()
        jobs = {}
        jobs_lfn = {}
        jobs_pfn = {}
        jobs_report = {}
        self.logger.info('%s has %s links to transfer on: %s' % (self.user, len(source_dests), str(source_dests)))
        try:
            count = 0
            for (source, destination) in source_dests:
                count += 1
                if self.config.isOracle:
                    # BUG FIX: this log line dereferenced docs[0] even in Couch
                    # mode (where docs is undefined) and with an empty docs list.
                    if docs:
                        self.logger.info('dest1: %s source: %s' % (docs[0]['destination'], source))
                    group = self.group if self.group != '' else None
                    role = self.role if self.role != '' else None
                    active_docs = [x for x in docs
                                   if x['destination'] == destination
                                   and x['source'] == source
                                   and x['username'] == self.user
                                   and x['user_group'] == group
                                   and x['user_role'] == role]

                    def map_active(inputdoc):
                        """Reshape an Oracle doc into the Couch-view row format
                        ({'key': [...], 'value': [src_lfn, dst_lfn]}) used below."""
                        outDict = dict()
                        outDict['key'] = [inputdoc['username'],
                                          inputdoc['user_group'],
                                          inputdoc['user_role'],
                                          inputdoc['destination'],
                                          inputdoc['source'],
                                          inputdoc['id']]
                        outDict['value'] = [inputdoc['source_lfn'], inputdoc['destination_lfn']]
                        return outDict

                    active_files = [map_active(x) for x in active_docs]
                    self.logger.debug('%s - %s has %s files to transfer from %s to %s'
                                      % (count, self.user, len(active_files), source, destination))
                else:
                    query = {'reduce': False,
                             'limit': self.config.max_files_per_transfer,
                             'key': [self.user, self.group, self.role, destination, source],
                             'stale': 'ok'}
                    try:
                        active_files = self.db.loadView(self.config.ftscp_design, 'ftscp_all', query)['rows']
                    except Exception:
                        continue
                    self.logger.debug('%s has %s files to transfer from %s to %s'
                                      % (self.user, len(active_files), source, destination))

                new_job = []
                lfn_list = []
                pfn_list = []
                dash_report = []

                # Take these active files and make a copyjob entry per file.
                def add_to_job(item):
                    """Resolve PFNs for one file, mark it acquired and append it
                    to the per-link job lists (closure over the lists above)."""
                    self.logger.debug('Preparing PFNs...')
                    source_pfn = self.apply_tfc_to_lfn('%s:%s' % (source, item['value'][0]))
                    destination_pfn = self.apply_tfc_to_lfn('%s:%s' % (destination, item['value'][1]))
                    self.logger.debug('PFNs prepared... %s %s' % (destination_pfn, source_pfn))
                    if source_pfn and destination_pfn and self.valid_proxy:
                        try:
                            acquired_file, dashboard_report = self.mark_acquired([item])
                            self.logger.debug('Files have been marked acquired')
                        except Exception as ex:
                            self.logger.error("%s" % ex)
                            raise
                        if acquired_file:
                            self.logger.debug('Starting FTS Job creation...')
                            # Prepare Monitor metadata
                            lfn_list.append(item['value'][0])
                            pfn_list.append(source_pfn)
                            # Prepare FTS Dashboard metadata
                            dash_report.append(dashboard_report)
                            new_job.append('%s %s' % (source_pfn, destination_pfn))
                            self.logger.debug('FTS job created...')
                    else:
                        self.mark_failed([item])

                # BUG FIX / py3-safety: explicit loop instead of map(), whose
                # laziness in Python 3 would silently skip all the work.
                for item in active_files:
                    add_to_job(item)
                self.logger.debug('Job prepared...')
                if new_job:
                    jobs[(source, destination)] = new_job
                    jobs_lfn[(source, destination)] = lfn_list
                    jobs_pfn[(source, destination)] = pfn_list
                    jobs_report[(source, destination)] = dash_report
                    self.logger.debug('FTS job ready for submission over %s ---> %s ...going to next job'
                                      % (source, destination))
            self.logger.debug('ftscp input created for %s (%s jobs)' % (self.user, len(jobs.keys())))
            return jobs, jobs_lfn, jobs_pfn, jobs_report
        except Exception:
            self.logger.exception("fail")
            return jobs, jobs_lfn, jobs_pfn, jobs_report

    def apply_tfc_to_lfn(self, file):
        """
        Take a 'CMS_NAME:lfn' string and make a PFN via the site's TFC.
        Updates the pfn_to_lfn_mapping dictionary as a side effect.

        :param file: 'SITE:lfn' string (parameter name kept for callers,
                     although it shadows the builtin)
        :return: the PFN string, or None if the site is unknown, the string is
                 not 'site:lfn' shaped, or the TFC result looks broken
        """
        try:
            site, lfn = tuple(file.split(':'))
        except Exception:
            self.logger.error('it does not seem to be an lfn %s' % file.split(':'))
            return None
        if site not in self.tfc_map:
            self.logger.error('Wrong site %s!' % site)
            return None
        pfn = self.tfc_map[site].matchLFN('srmv2', lfn)
        # TODO: improve fix for wrong tfc on sites
        try:
            if pfn.find("\\") != -1:
                pfn = pfn.replace("\\", "")
            if len(pfn.split(':')) == 1:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
        except (IndexError, AttributeError):
            # matchLFN returned None or something non-string-like.
            self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
            return None
        # Add the pfn key into pfn-to-lfn mapping
        if pfn not in self.pfn_to_lfn_mapping:
            self.pfn_to_lfn_mapping[pfn] = lfn
        return pfn

    def command(self, jobs, jobs_lfn, jobs_pfn, jobs_report):
        """
        For each link the worker has to complete:
            Validate the copyjob
            Submit the copyjob to the appropriate FTS server over REST
            Retrieve the per-file FTS ids
            Dump the monitor json to the dropbox and the Dashboard reports
        On any submission problem the link's files are marked failed (soft).
        """
        failure_reasons = []
        for link, copyjob in jobs.items():
            submission_error = False
            fts_job = {}
            # Validate copyjob file before doing anything
            self.logger.debug("Valid %s" % self.validate_copyjob(copyjob))
            if not self.validate_copyjob(copyjob):
                continue

            rest_copyjob = {
                "params": {"bring_online": None,
                           "verify_checksum": False,
                           "copy_pin_lifetime": -1,
                           "max_time_in_queue": self.config.max_h_in_queue,
                           "job_metadata": {"issuer": "ASO"},
                           "spacetoken": None,
                           "source_spacetoken": None,
                           "fail_nearline": False,
                           "overwrite": True,
                           "gridftp": None},
                "files": []
            }
            for SrcDest in copyjob:
                rest_copyjob["files"].append({"sources": [SrcDest.split(" ")[0]],
                                              "metadata": None,
                                              "destinations": [SrcDest.split(" ")[1]]})

            self.logger.debug("Subbmitting this REST copyjob %s" % rest_copyjob)
            url = self.fts_server_for_transfer + '/jobs'
            self.logger.debug("Running FTS submission command")
            self.logger.debug("FTS server: %s" % self.fts_server_for_transfer)
            self.logger.debug("link: %s -> %s" % link)
            # BUG FIX: the header key had a trailing space ("Content-Type ").
            heade = {"Content-Type": "application/json"}

            datares = None
            try:
                connection = RequestHandler(config={'timeout': 300, 'connecttimeout': 300})
            except Exception as ex:
                msg = str(ex)
                msg += str(traceback.format_exc())
                self.logger.debug(msg)
            try:
                response, datares = connection.request(url, rest_copyjob, heade, verb='POST',
                                                       doseq=True, ckey=self.user_proxy,
                                                       cert=self.user_proxy,
                                                       capath='/etc/grid-security/certificates',
                                                       cainfo=self.user_proxy, verbose=True)
                self.logger.debug("Submission done")
                self.logger.debug('Submission header status: %s' % response.status)
                self.logger.debug('Submission header reason: %s' % response.reason)
                self.logger.debug('Submission result %s' % datares)
            except Exception as ex:
                msg = "Error submitting to FTS: %s " % url
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.debug(msg)
                failure_reasons.append(msg)
                submission_error = True

            job_id = None
            fileId_list = []
            if not submission_error:
                res = {}
                try:
                    res = json.loads(datares)
                except Exception as ex:
                    msg = "Couldn't load submission acknowledgment from FTS"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.debug(msg)
                    submission_error = True
                    failure_reasons.append(msg)
                if 'job_id' in res:
                    files_res = []
                    files_ = {}
                    job_id = res['job_id']
                    file_url = self.fts_server_for_transfer + '/jobs/' + job_id + '/files'
                    self.logger.debug("Submitting to %s" % file_url)
                    try:
                        response, files_ = connection.request(file_url, {}, heade, doseq=True,
                                                              ckey=self.user_proxy,
                                                              cert=self.user_proxy,
                                                              capath='/etc/grid-security/certificates',
                                                              cainfo=self.user_proxy, verbose=True)
                        files_res = json.loads(files_)
                    except Exception as ex:
                        msg = "Error contacting FTS to retrieve file: %s " % file_url
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.debug(msg)
                        submission_error = True
                        failure_reasons.append(msg)
                    for file_in_job in files_res:
                        if 'file_id' in file_in_job:
                            fileId_list.append(file_in_job['file_id'])
                        else:
                            msg = "Could not load submitted file %s from FTS" % file_url
                            self.logger.debug(msg)
                            submission_error = True
                            failure_reasons.append(msg)
                    self.logger.debug("File id list %s" % fileId_list)
                elif not submission_error:
                    # BUG FIX: the original fell through with job_id/fileId_list
                    # unbound and crashed with a NameError below.
                    msg = "No job_id in FTS submission acknowledgment"
                    self.logger.debug(msg)
                    submission_error = True
                    failure_reasons.append(msg)

            if submission_error:
                self.logger.debug("Submission failed")
                self.logger.info("Mark failed %s files" % len(jobs_lfn[link]))
                self.logger.debug("Mark failed %s files" % jobs_lfn[link])
                failed_files = self.mark_failed(jobs_lfn[link], force_fail=False, submission_error=True)
                self.logger.info("Marked failed %s" % len(failed_files))
                continue

            # Dump the monitor json to the dropbox for the Monitor component.
            fts_job['userProxyPath'] = self.user_proxy
            fts_job['LFNs'] = jobs_lfn[link]
            fts_job['PFNs'] = jobs_pfn[link]
            fts_job['FTSJobid'] = job_id
            fts_job['files_id'] = fileId_list
            fts_job['username'] = self.user
            self.logger.debug("Creating json file %s in %s" % (fts_job, self.dropbox_dir))
            with open('%s/Monitor.%s.json' % (self.dropbox_dir, fts_job['FTSJobid']), 'w') as ftsjob_file:
                ftsjob_file.write(json.dumps(fts_job))
            self.logger.debug("%s ready." % fts_job)

            # Prepare Dashboard report, one json per LFN.
            # enumerate() instead of list.index(): O(n) and correct with duplicate LFNs.
            for index, lfn in enumerate(fts_job['LFNs']):
                lfn_report = dict()
                lfn_report['FTSJobid'] = fts_job['FTSJobid']
                lfn_report['PFN'] = fts_job['PFNs'][index]
                lfn_report['FTSFileid'] = fts_job['files_id'][index]
                lfn_report['Workflow'] = jobs_report[link][index][2]
                lfn_report['JobVersion'] = jobs_report[link][index][1]
                job_id = '%d_https://glidein.cern.ch/%d/%s_%s' % (int(jobs_report[link][index][0]),
                                                                  int(jobs_report[link][index][0]),
                                                                  lfn_report['Workflow'].replace("_", ":"),
                                                                  lfn_report['JobVersion'])
                lfn_report['JobId'] = job_id
                lfn_report['URL'] = self.fts_server_for_transfer
                self.logger.debug("Creating json file %s in %s for FTS3 Dashboard" % (lfn_report, self.dropbox_dir))
                with open('/tmp/DashboardReport/Dashboard.%s.json' % getHashLfn(lfn_report['PFN']), 'w') as dash_job_file:
                    dash_job_file.write(json.dumps(lfn_report))
                self.logger.debug("%s ready for FTS Dashboard report." % lfn_report)
        return

    def validate_copyjob(self, copyjob):
        """
        A copyjob is valid when no source or destination PFN is the string
        'None' (apply_tfc_to_lfn failures are rendered as 'None').
        """
        for task in copyjob:
            if task.split()[0] == 'None' or task.split()[1] == 'None':
                return False
        return True

    def mark_acquired(self, files=None):
        """
        Mark the list of files as being in transfer (SUBMITTED / acquired).

        :param files: list of Couch-view-shaped docs ({'key': [...], 'value': [...]});
                      in practice called with a single-element list
        :return: (list of docs actually put in transfer, dashboard report tuple)
        """
        # BUG FIX: avoid the shared mutable default argument.
        files = files or []
        lfn_in_transfer = []
        dash_rep = ()
        if self.config.isOracle:
            for lfn in files:
                # Only temp-area LFNs ('/store/temp/...': 'temp' at offset 7) are transferred.
                if lfn['value'][0].find('temp') == 7:
                    docId = lfn['key'][5]
                    self.logger.debug("Marking acquired %s" % docId)
                    try:
                        docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers',
                                                                                       'fileusertransfers'),
                                                    data=encodeRequest({'subresource': 'getById', 'id': docId}))
                        document = oracleOutputMapping(docbyId, None)[0]
                    except Exception as ex:
                        self.logger.error("Error during dashboard report update: %s" % ex)
                        # BUG FIX: the original fell through here with
                        # `document` unbound and crashed building dash_rep.
                        continue
                    lfn_in_transfer.append(lfn)
                    dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])
            if files:
                try:
                    fileDoc = dict()
                    fileDoc['asoworker'] = self.config.asoworker
                    fileDoc['subresource'] = 'updateTransfers'
                    # NOTE: only the first file's id is sent; callers pass
                    # single-element lists so this is currently sufficient.
                    fileDoc['list_of_ids'] = files[0]['key'][5]
                    fileDoc['list_of_transfer_state'] = "SUBMITTED"
                    self.logger.debug("Marking acquired %s" % (fileDoc))
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(fileDoc))
                    self.logger.debug("Marked acquired %s of %s" % (fileDoc, result))
                except Exception as ex:
                    self.logger.error("Error during status update: %s" % ex)
            # TODO: no need of mark good right? the postjob should update the
            # status in case of direct stageout I think
            return lfn_in_transfer, dash_rep
        else:
            for lfn in files:
                if lfn['value'][0].find('temp') == 7:
                    docId = getHashLfn(lfn['value'][0])
                    self.logger.debug("Marking acquired %s" % docId)
                    # Load document to get the retry_count
                    try:
                        document = self.db.document(docId)
                    except Exception as ex:
                        msg = "Error loading document from couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    if document['state'] == 'new' or document['state'] == 'retry':
                        data = dict()
                        data['state'] = 'acquired'
                        data['last_update'] = time.time()
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        try:
                            self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        except Exception as ex:
                            msg = "Error updating document in couch"
                            msg += str(ex)
                            msg += str(traceback.format_exc())
                            self.logger.error(msg)
                            continue
                        self.logger.debug("Marked acquired %s of %s" % (docId, lfn))
                        lfn_in_transfer.append(lfn)
                        dash_rep = (document['jobid'], document['job_retry_count'], document['workflow'])
                else:
                    # Non-temp LFN: direct stageout already happened, mark it good.
                    good_lfn = lfn['value'][0].replace('store', 'store/temp', 1)
                    self.mark_good([good_lfn])
            return lfn_in_transfer, dash_rep

    def mark_good(self, files=None):
        """
        Mark the list of files as transferred (Couch mode only).

        :param files: list of temp-area LFN strings
        """
        files = files or []
        for lfn in files:
            try:
                document = self.db.document(getHashLfn(lfn))
            except Exception as ex:
                msg = "Error loading document from couch"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                try:
                    data = dict()
                    data['end_time'] = str(datetime.datetime.now())
                    data['state'] = 'done'
                    data['lfn'] = outputLfn
                    data['last_update'] = time.time()
                    updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn)
                    updateUri += "?" + urllib.urlencode(data)
                    self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                except Exception as ex:
                    msg = "Error updating document in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                try:
                    self.db.commit()
                except Exception as ex:
                    msg = "Error commiting documents in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
        self.logger.debug("transferred file updated")

    def mark_failed(self, files=None, force_fail=False, submission_error=False):
        """
        Something failed for these files: increment the retry count and either
        retry or permanently fail them.

        :param files: list of LFN strings or Couch-view-shaped docs
        :param force_fail: if True, fail permanently regardless of retry count
        :param submission_error: if True, record an FTS-submission failure reason
        :return: list of updated document ids
        """
        files = files or []
        updated_lfn = []
        for lfn in files:
            data = {}
            # Normalize either representation to the temp-area LFN.
            if not isinstance(lfn, dict):
                temp_lfn = lfn if 'temp' in lfn else lfn.replace('store', 'store/temp', 1)
            else:
                src_lfn = lfn['value'][0]
                temp_lfn = src_lfn if 'temp' in src_lfn else src_lfn.replace('store', 'store/temp', 1)
            # Load document and get the retry_count
            if self.config.isOracle:
                docId = getHashLfn(temp_lfn)
                self.logger.debug("Marking failed %s" % docId)
                try:
                    docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers',
                                                                                   'fileusertransfers'),
                                                data=encodeRequest({'subresource': 'getById', 'id': docId}))
                except Exception as ex:
                    self.logger.error("Error updating failed docs: %s" % ex)
                    continue
                document = oracleOutputMapping(docbyId, None)[0]
                self.logger.debug("Document: %s" % document)
                fileDoc = dict()
                fileDoc['asoworker'] = self.config.asoworker
                fileDoc['subresource'] = 'updateTransfers'
                fileDoc['list_of_ids'] = docId
                if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                    fileDoc['list_of_transfer_state'] = 'FAILED'
                    fileDoc['list_of_retry_value'] = 1
                else:
                    fileDoc['list_of_transfer_state'] = 'RETRY'
                if submission_error:
                    fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                    fileDoc['list_of_retry_value'] = 1
                elif not self.valid_proxy:
                    fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: user's proxy expired"
                    fileDoc['list_of_retry_value'] = 1
                else:
                    fileDoc['list_of_failure_reason'] = "Site config problem."
                    fileDoc['list_of_retry_value'] = 1
                self.logger.debug("update: %s" % fileDoc)
                try:
                    updated_lfn.append(docId)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(fileDoc))
                except Exception as ex:
                    msg = "Error updating document"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
            else:
                docId = getHashLfn(temp_lfn)
                try:
                    document = self.db.document(docId)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                    now = str(datetime.datetime.now())
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                    else:
                        data['state'] = 'retry'
                    if submission_error:
                        data['failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                    elif not self.valid_proxy:
                        data['failure_reason'] = "Job could not be submitted to FTS: user's proxy expired"
                    else:
                        data['failure_reason'] = "Site config problem."
                    data['last_update'] = time.time()
                    data['retry'] = now
                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
        self.logger.debug("failed file updated")
        return updated_lfn

    def mark_incomplete(self):
        """
        Should never be called: transfers are marked acquired/good/failed only.
        """
        # BUG FIX: the original called self.logger(...) — Logger objects are
        # not callable, so this raised TypeError instead of logging.
        self.logger.error('Something called mark_incomplete which should never be called')
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
            the directory cannot be created. """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        # Resolve the REST host / URI from the configured mode; 'private' modes
        # take the host from config.TaskWorker.resturl instead of MODEURL.
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        # TestWorker runs everything in the master process; Worker forks slaves.
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        # Dynamically import TaskWorker.Actions.Recurring.<actionName> and
        # instantiate the class of the same name defined in it.
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False
        return True

    def _getWork(self, limit, getstatus):
        # Fetch up to `limit` tasks in state `getstatus` assigned to this worker;
        # returns an empty list on any server error.
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during _getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _getWork request (prameters are %s)", configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        # Signal handler: only set the flag; algorithm() exits at the next loop check.
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def updateWork(self, taskname, command, status):
        # Best-effort update of a task's state on the server side; errors are
        # logged but deliberately not propagated.
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)

    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self._getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""
        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Starting main loop.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            # Lock NEW tasks as HOLDING for this worker before fetching them.
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                continue
            pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))
            toInject = []
            for task in pendingwork:
                worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                toInject.append((worktype, task, failstatus, None))
            # Mark tasks QUEUED before handing them over to the slaves.
            for task in pendingwork:
                self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED')
            self.slaves.injectWorks(toInject)
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())
            time.sleep(self.config.TaskWorker.polling)
            dummyFinished = self.slaves.checkFinished()
        self.logger.debug("Master Worker Exiting Main Cycle")
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer
        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
            the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        # Resolve the REST host/URI either from the MODEURL table or from the config
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %(str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        """Import and instantiate the recurring-action class named actionName
           from TaskWorker.Actions.Recurring."""
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                       "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                       "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" %(str(traceback.format_exc()))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())
        # always True: a failed lock must not kill the master loop
        return True

    def _getWork(self, limit, getstatus):
        """Fetch up to `limit` tasks in state `getstatus` for this worker from the REST.
           Returns an empty list on any server error (errors are only logged)."""
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" %(hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " %(str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %(getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %(getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" %(str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork

    def quit(self, code, traceback_):
        """Signal handler: raise the STOP flag so the main loop exits."""
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True

    def updateWork(self, taskname, status):
        """Set `status` for `taskname` on the server; retries indefinitely
           (with a randomized 40-60s back-off) while the server answers 503."""
        configreq = {'workflow': taskname, 'status': status, 'subresource': 'state'}
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
                msg += "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
                msg += "%s \n" %(str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info("Sleeping %s seconds and will try to update again." % str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work and distribuiting it to the slave processes."""
        self.logger.debug("Starting")
        while(not self.STOP):
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" %(len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" %(str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None) for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" %(str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' % self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' % self.slaves.pendingTasks())
            time.sleep(self.config.TaskWorker.polling)
            finished = self.slaves.checkFinished()
        self.logger.debug("Master Worker Exiting Main Cycle")
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    """Main loop of one slave process.

    Pulls (workid, work, task, failstatus, inputargs) tuples from the `inputs`
    queue, executes `work` on the task, and puts {'workid': ..., 'out': ...}
    on the `results` queue. On failure the error message is uploaded to the
    REST so the task can be marked with `failstatus`. The loop ends when the
    sentinel work item "STOP" is received or the queue hits EOF/IO errors.

    :arg inputs: multiprocessing queue of work tuples
    :arg results: multiprocessing queue for outcome dictionaries
    :arg str resthost: REST server hostname
    :arg str resturi: REST URI to post status/failures to
    :arg int procnum: slave index, used for naming and logging
    :arg logger: logger to which a per-task handler is attached/removed
    """
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            taskhandler = addTaskLogHandler(logger, task["tm_username"], task["tm_taskname"])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == "STOP":
            # sentinel: parent asked this slave to terminate
            break
        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task["tm_taskname"])
        try:
            # msg stays None on success; any exception fills it and triggers
            # the REST upload in the finally block below
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: # pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task["tm_taskname"])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(
                        resthost,
                        WORKER_CONFIG.TaskWorker.cmscert,
                        WORKER_CONFIG.TaskWorker.cmskey,
                        retry=20,
                        logger=logger,
                    )
                    truncMsg = truncateError(msg)
                    configreq = {
                        "workflow": task["tm_taskname"],
                        "status": failstatus,
                        "subresource": "failure",
                        # limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                        "failure": b64encode(truncMsg),
                    }
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s. HTTP headers follows:",
                        task["tm_taskname"],
                    )
                    logger.error(hte.headers)
                except Exception as exc: # pylint: disable=broad-except
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s.\nReason: %s",
                        task["tm_taskname"],
                        exc,
                    )
                    logger.exception("Traceback follows:")
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName, task["tm_taskname"], t1 - t0, outputs)
        removeTaskLogHandler(logger, taskhandler)
        results.put({"workid": workid, "out": outputs})
def submit(trans_tuple, job_data, log, direct=False):
    """Manage threads for transfers submission through Rucio

    :param trans_tuple: ordered list of needed xfer info (transfers, to_submit_columns)
    :type trans_tuple: tuple
    :param job_data: general CRAB job metadata
    :type job_data: dict
    :param log: log object
    :type log: logging
    :param direct: job output stored on temp or directly, defaults to False
    :param direct: bool, optional
    :return: None on PhEDEx connection failure, False if nothing was submitted,
             True otherwise
    """
    threadLock = threading.Lock()
    threads = []
    to_update = []  # filled by submit_thread workers under threadLock

    toTrans = trans_tuple[0]
    columns = trans_tuple[1]
    proxy = job_data['proxy']
    rest_filetransfers = job_data['rest']
    user = job_data['username']
    destination = job_data['destination']
    taskname = job_data['taskname']

    try:
        phedex = PhEDEx(responseType='xml',
                        httpDict={'key': proxy, 'cert': proxy, 'pycurl': True})
    except Exception:
        log.exception('PhEDEx exception.')
        return

    # Split threads by source RSEs
    sources = list(set([x[columns.index('source')] for x in toTrans]))

    os.environ["X509_CERT_DIR"] = os.getcwd()
    log.info("Connection to %s with proxy in:\n %s" % (rest_filetransfers,proxy))
    oracleDB = HTTPRequests(rest_filetransfers,
                            proxy,
                            proxy)
                            #verbose=True)

    # mapping lfn <--> pfn
    for source in sources:
        ids = [x[columns.index('id')] for x in toTrans if x[columns.index('source')] == source]
        src_lfns = [x[columns.index('source_lfn')] for x in toTrans if x[columns.index('source')] == source]
        dst_lfns = [x[columns.index('destination_lfn')] for x in toTrans if x[columns.index('source')] == source]

        sorted_source_pfns = []
        sorted_dest_lfns = []
        sorted_dest_pfns = []

        # workaround for phedex.getPFN issue --> shuffling output order w.r.t. the list in input
        try:
            for chunk in chunks(src_lfns, 10):
                unsorted_source_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(source, chunk).items()]
                for order_lfn in chunk:
                    for lfn, pfn in unsorted_source_pfns:
                        if order_lfn == lfn:
                            sorted_source_pfns.append(pfn)
                            break

            for chunk in chunks(dst_lfns, 10):
                unsorted_dest_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(toTrans[0][4], chunk).items()]
                for order_lfn in chunk:
                    for lfn, pfn in unsorted_dest_pfns:
                        if order_lfn == lfn:
                            sorted_dest_pfns.append(pfn)
                            sorted_dest_lfns.append(lfn)
                            break
        except Exception as ex:
            log.error("Failed to map lfns to pfns: %s", ex)
            # BUGFIX: mark_failed() takes (ids, failures_reasons); the previous
            # extra oracleDB argument raised TypeError inside this error path.
            mark_failed(ids, ["Failed to map lfn to pfn: " + str(ex) for _ in ids])
            # BUGFIX: skip this source entirely; the pfn lists may be partially
            # filled and would be zipped out of sync with ids/checksums below.
            continue

        source_pfns = sorted_source_pfns
        dest_lfns = sorted_dest_lfns

        # saving file sizes and checksums
        filesizes = [x[columns.index('filesize')] for x in toTrans if x[columns.index('source')] == source]
        checksums = [x[columns.index('checksums')] for x in toTrans if x[columns.index('source')] == source]
        pubnames = [x[columns.index('publishname')] for x in toTrans if x[columns.index('source')] == source]

        # ordered list of replicas information
        jobs = zip(source_pfns, dest_lfns, ids, checksums, filesizes, pubnames)
        job_columns = ['source_pfns', 'dest_lfns', 'ids', 'checksums', 'filesizes', 'pubnames']

        # ordered list of transfers details
        tx_from_source = [[job, source, taskname, user, destination] for job in jobs]
        tx_columns = ['job', 'source', 'taskname', 'user', 'destination']

        # split submission process in chunks of max 200 files
        for files in chunks(tx_from_source, 200):
            if not direct:
                log.info("Submitting: %s", files)
                thread = submit_thread(threadLock,
                                       log,
                                       (files, tx_columns),
                                       job_columns,
                                       proxy,
                                       to_update)
                thread.start()
                threads.append(thread)
            elif direct:
                log.info("Registering direct stageout: %s", files)
                thread = submit_thread(threadLock,
                                       log,
                                       (files, tx_columns),
                                       job_columns,
                                       proxy,
                                       to_update,
                                       direct=True)
                thread.start()
                threads.append(thread)

    for t in threads:
        t.join()

    if len(to_update) == 0:
        return False

    # update statuses in oracle table as per threads result
    for fileDoc in to_update:
        try:
            log.debug("%s/filetransfers?%s" % (rest_filetransfers, encodeRequest(fileDoc)))
            oracleDB.post('/filetransfers',
                          data=encodeRequest(fileDoc))
            log.info("Marked submitted %s files" % (fileDoc['list_of_ids']))
        except Exception:
            log.exception('Failed to mark files as submitted on DBs')

    return True
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, logWarning, logDebug, sequential=False, console=False, name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if needs a verbose logger
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
        :arg bool console: it tells if to log to console.
        :arg string name: defines a name for the log of this master process"""

        def createLogdir(dirname):
            """ Create the directory dirname ignoring errors in case it exists. Exit if
            the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # it can be named with the time stamp a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container process logs will start with same
            # process numbers, i.e. same name, at any container restart.
            # to avoid clashes and confusion, we will put away all previous processes
            # logs when a TW instance starts. To this goal each TW which runs
            # creates a directory where new containers will move its logs, so
            # identify LastLogs_timestamp directory
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(reverse=True)  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print("LatestLogDir not found in logs/processes, create a dummy dir to store old files")
                # NOTE(review): 'Unknwown' typo kept on purpose — it is a runtime
                # directory name; fixing the spelling would change the path on disk
                oldLogsDir = currentProcessesDir + 'OldLog-Unknwown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)
            # create a new LateastLogs directory where to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single logging instruction
               is propagated to it (not really nice to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if needs a verbose logger
            :arg bool console: it tells if to log to console
            :arg string name: define a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only done for real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        try:
            instance = self.config.TaskWorker.instance
        except AttributeError:
            # BUGFIX: was a bare `except:` which would also swallow
            # KeyboardInterrupt/SystemExit; missing config attribute raises AttributeError
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        # BUGFIX: was `if instance is 'other':` — identity comparison against a
        # string literal is implementation-dependent and a SyntaxWarning since
        # Python 3.8; equality is what is meant here
        if instance == 'other':
            self.logger.info('Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except AttributeError:
                # BUGFIX: narrowed from bare `except:` (see above)
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        self.restURInoAPI = '/crabserver/' + dbInstance
        self.logger.info('Will connect via URL: https://%s/%s', self.restHost, self.restURInoAPI)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(self.restHost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey,
                                   retry=20, logger=self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry * 20 * 2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT=self.config.TaskWorker.cmscert,
                                                         X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, self.restHost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        """Import and instantiate the recurring-action class named actionName
           from TaskWorker.Actions.Recurring, passing it the logs directory."""
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)(self.config.TaskWorker.logsDir)

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name,
                     'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False
        return True

    def getWork(self, limit, getstatus, ignoreTWName=False):
        """Fetch up to `limit` tasks in state `getstatus`; with ignoreTWName the
           worker-name filter is wildcarded. Returns [] on any server error."""
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        if ignoreTWName:
            configreq['workername'] = '%'
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the getWork request (prameters are %s)", configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        """Signal handler: raise the STOP flag so the main loop exits."""
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeded, False otherwise
        """
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)
        else:
            return True #success
        return False #failure

    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)

    def failBannedTask(self, task):
        """ This method is used at the TW startup and it fails NEW tasks which I do not like
        The method put those task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
        the command field.
        Initial implementation bans based on a list of usernames, other task attributes can
        be checked if needed by adding a bit of code
        Returns:
            True : if the task was declared bad and was failed
            False: for normal (good) tasks
        """
        bannedUsernames = getattr(self.config.TaskWorker, 'bannedUsernames', [])
        if task['tm_username'] in bannedUsernames:
            self.logger.debug("Forcefully failing task %s", task['tm_taskname'])
            if task['tm_task_command']:
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
            else:
                failstatus = 'FAILED'
            self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            # TODO look into logging a message for the user
            return True
        return False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue
            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')
            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))
            toInject = []
            for task in pendingwork:
                if self.failBannedTask(task):
                    continue
                if self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info("Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration",
                                     task['tm_taskname'])
            self.slaves.injectWorks(toInject)
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_username': '******', 'tm_taskname': action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())
            time.sleep(self.config.TaskWorker.polling)
            dummyFinished = self.slaves.checkFinished()
        self.logger.debug("Master Worker Exiting Main Cycle.")
from __future__ import print_function from __future__ import division from RESTInteractions import HTTPRequests from ServerUtilities import encodeRequest, oracleOutputMapping server = HTTPRequests('cmsweb-testbed.cern.ch', '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy', '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy') fileDoc = {} fileDoc['asoworker'] = 'asodciangot1' fileDoc['subresource'] = 'acquireTransfers' result = server.post('/crabserver/dev/filetransfers', data=encodeRequest(fileDoc)) print(result) """ fileDoc = {} fileDoc['asoworker'] = 'asodciangot1' fileDoc['subresource'] = 'acquiredTransfers' fileDoc['grouping'] = 0 result = server.get('/crabserver/dev/filetransfers', data=encodeRequest(fileDoc)) #print(oracleOutputMapping(result)) ids = [str(x['id']) for x in oracleOutputMapping(result)]
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""

        def getLogging(quiet, debug):
            """Retrieves a logger and set the proper level

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('twlog.log', when="midnight")
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = logging.getLogger()
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.TEST = test
        self.logger = getLogging(quiet, debug)
        self.config = config
        restinstance = None
        # default REST URL; the 'prod' segment is replaced below per configured mode
        self.resturl = '/crabserver/prod/workflowdb'
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            restinstance = MODEURL[self.config.TaskWorker.mode]['host']
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        else:
            restinstance = self.config.TaskWorker.resturl
            self.resturl = self.resturl.replace('prod', MODEURL[self.config.TaskWorker.mode]['instance'])
        if self.resturl is None or restinstance is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        self.server = HTTPRequests(restinstance, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, version=__version__)
        self.logger.debug("Hostcert: %s, hostkey: %s" % (str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        if self.TEST:
            self.slaves = TestWorker(self.config, restinstance, self.resturl)
        else:
            self.slaves = Worker(self.config, restinstance, self.resturl)
        self.slaves.begin()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is alays returning true, because we do not want the worker to day if the server endpoint is not avaialable.
        Prints a log entry if answer is greater then 400:
         * the server call succeeded or
         * the server could not find anything to update or
         * the server has an internal error"""
        # NOTE(review): legacy Python 2 `except X, y` syntax below — this chunk
        # predates the Python 3 port; left untouched on purpose.
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name,
                     'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.resturl, data=urllib.urlencode(configreq))
        except HTTPException, hte:
            # a 400 with "Required object is missing" just means there was no task
            # to acquire; anything else is worth a full log entry
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                self.logger.error("Server could not acquire any work from the server: \n" +
                                  "\tstatus: %s\n" % (hte.headers.get('X-Error-Http', 'unknown')) +
                                  "\treason: %s" % (hte.headers.get('X-Error-Detail', 'unknown')))
                self.logger.error("Probably no task to be processed")
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" % (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" % (getattr(hte, 'result', 'unknown')))
        except Exception, exc:
            self.logger.error("Server could not process the request: %s" % (str(exc)))
class RetryManagerDaemon(BaseDaemon):
    """
    _RetryManagerPoller_

    Polls for Files in CoolOff State and attempts to retry them
    based on the requirements in the selected plugin.
    """

    def __init__(self, config):
        """
        Initialise class members: backend handle (Oracle or CouchDB),
        the retry-algorithm plugin and the cooloff time.

        :param config: component configuration object.
        :raises RetryManagerException: when the retry plugin cannot be loaded.
        """
        BaseDaemon.__init__(self, config, 'RetryManager')
        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             self.config.opsProxy,
                                             self.config.opsProxy)
            except Exception:  # fix: was a bare `except:` (also caught SystemExit/KeyboardInterrupt)
                self.logger.exception('Failed to connect to Oracle')
        else:
            try:
                server = CouchServer(dburl=self.config.couch_instance,
                                     ckey=self.config.opsProxy,
                                     cert=self.config.opsProxy)
                self.db = server.connectDatabase(self.config.files_database)
            except Exception as e:
                self.logger.exception('A problem occured when connecting to couchDB: %s' % e)
                raise
            self.logger.debug('Connected to files DB')

        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.retryAlgoDir, namespace=self.config.retryAlgoDir)
        try:
            self.plugin = self.factory.loadObject(self.config.algoName, self.config,
                                                  getFromCache=False, listFlag=True)
        except Exception as ex:
            msg = "Error loading plugin %s on path %s\n" % (self.config.algoName,
                                                            self.config.retryAlgoDir)
            msg += str(ex)
            self.logger.error(msg)
            raise RetryManagerException(msg)
        self.cooloffTime = self.config.cooloffTime

    def terminate(self, params):
        """
        Run one more time through, then terminate.
        """
        logging.debug("Terminating. doing one more pass before we die")
        self.algorithm(params)

    def algorithm(self, parameters=None):
        """
        Performs the doRetries method, loading the appropriate
        plugin for each job and handling it.

        In Oracle mode: ask the server to move cooled-off transfers back to
        retry, then fetch the transfers flagged for killing and feed them to a
        pool of killThread workers grouped by user.
        """
        logging.debug("Running retryManager algorithm")
        if self.config.isOracle:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'retryTransfers'
            fileDoc['time_to'] = self.cooloffTime
            self.logger.debug('fileDoc: %s' % fileDoc)
            try:
                results = self.oracleDB.post(self.config.oracleFileTrans,
                                             data=encodeRequest(fileDoc))
            except Exception:
                self.logger.exception("Failed to get retry transfers in oracleDB: %s")
                return
            logging.info("Retried files in cooloff: %s,\n now getting transfers to kill" % str(results))

            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'getTransfersToKill'
            fileDoc['grouping'] = 0
            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                result = oracleOutputMapping(results)
            except Exception as ex:
                self.logger.error("Failed to get killed transfers \
                                  from oracleDB: %s" % ex)
                return

            usersToKill = list(set([(x['username'], x['user_group'], x['user_role']) for x in result]))
            self.logger.debug("Users with transfers to kill: %s" % usersToKill)
            transfers = Queue()

            for i in range(self.config.kill_threads):
                worker = Thread(target=self.killThread, args=(i, transfers,))
                worker.setDaemon(True)
                worker.start()

            # One queue item per user: all of that user's transfers together,
            # so a single proxy/FTS context can be used per item.
            for user in usersToKill:
                user_trans = [x for x in result if (x['username'], x['user_group'], x['user_role']) == user]
                self.logger.info("Inserting %s transfers of user %s in the killing queue" % (len(user_trans), user))
                transfers.put(user_trans)

            transfers.join()
            self.logger.info("Transfers killed.")
        else:
            self.doRetries()

    def killThread(self, thread_id, transfers):
        """This is the worker thread function for kill command.

        Consumes user-grouped transfer lists from the queue, delegates the
        user's proxy to FTS, cancels the matching FTS files and marks them
        KILLED in the database. Loops forever (daemon thread).
        """
        while True:
            transfer_list = transfers.get()
            self.logger.info("Starting thread %s" % (thread_id))
            user = transfer_list[0]['username']
            group = transfer_list[0]['user_group']
            role = transfer_list[0]['user_role']

            uiSetupScript = getattr(self.config, 'UISetupScript', None)

            self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy))
            try:
                userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            except Exception as ex:
                msg = "Error retrieving the user DN"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                # fix: task_done() was missing on this path, which would make
                # transfers.join() in algorithm() block forever.
                transfers.task_done()
                continue
            if not userDN:
                transfers.task_done()
                time.sleep(1)
                continue
            self.logger.debug("user DN: %s" % userDN)

            try:
                defaultDelegation = {'logger': self.logger,
                                     'credServerPath': self.config.credentialDir,
                                     'myProxySvr': 'myproxy.cern.ch',
                                     'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                     'serverDN': self.config.serverDN,
                                     'uisource': uiSetupScript,
                                     'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
                if hasattr(self.config, "cache_area"):
                    cache_area = self.config.cache_area
                    defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
                transfers.task_done()
                time.sleep(1)
                continue
            if getattr(self.config, 'serviceCert', None):
                defaultDelegation['server_cert'] = self.config.serviceCert
            if getattr(self.config, 'serviceKey', None):
                defaultDelegation['server_key'] = self.config.serviceKey
            try:
                defaultDelegation['userDN'] = userDN
                defaultDelegation['group'] = group if group else ''
                # NOTE(review): `role if group` conditions the role on the
                # *group* being set; `role if role` looks intended — confirm.
                defaultDelegation['role'] = role if group else ''
                self.logger.debug('delegation: %s' % defaultDelegation)
                valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger)
            except Exception as ex:
                msg = "Error getting the user proxy"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                transfers.task_done()
                time.sleep(1)
                continue

            # TODO: take server from db, right now, take only the first of the list and assuming it valid for all
            try:
                # TODO: debug u added during info upload. To be fixed soon! For now worked around
                fts_server = transfer_list[0]['fts_instance'].split('u')[1]
                self.logger.info("Delegating proxy to %s" % fts_server)
                context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True)
                self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

                self.logger.info("Proxy delegated. Grouping files by jobId")
                jobs = {}
                for fileToKill in transfer_list:
                    # TODO: debug u added during info upload. To be fixed soon! For now worked around
                    jid = str(fileToKill['fts_id']).split('u')[1]
                    if jid not in jobs:
                        jobs[jid] = []
                    jobs[jid].append(fileToKill)

                self.logger.info("Found %s jobIds", len(jobs.keys()))
                # fix: was `jobs.keys` (logged the bound method object, not the keys)
                self.logger.debug("jobIds: %s", jobs.keys())

                # list for files killed or failed to
                killed = []
                too_late = []
                for ftsJobId, files in jobs.iteritems():
                    self.logger.info("Cancelling tranfers in %s" % ftsJobId)

                    ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files]
                    source_lfns = [x['source_lfn'] for x in files]

                    job_list = fts3.get_job_status(context, ftsJobId, list_files=True)
                    tx = job_list['files']

                    # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db?
                    idListToKill = [x['file_id'] for x in tx
                                    if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    # needed for the state update
                    lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx
                                     if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill))
                    res = fts3.cancel(context, ftsJobId, idListToKill)
                    self.logger.debug('Kill command result: %s' % json.dumps(res))

                    if not isinstance(res, list):
                        res = [res]

                    # Verify if the kill command succeeded
                    for k, kill_res in enumerate(res):
                        indexToUpdate = lfnListToKill[k]
                        if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"):
                            # Transfer already reached a final state: too late to kill
                            self.logger.debug(source_lfns[indexToUpdate])
                            too_late.append(getHashLfn(source_lfns[indexToUpdate]))
                        else:
                            killed.append(getHashLfn(source_lfns[indexToUpdate]))

                # TODO: decide how to update status for too_late files
                killed += too_late
                self.logger.debug('Updating status of killed files: %s' % killed)

                if len(killed) > 0:
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = killed
                    data['list_of_transfer_state'] = ["KILLED" for _ in killed]
                    self.oracleDB.post(self.config.oracleFileTrans,
                                       data=encodeRequest(data))
                    self.logger.debug("Marked killed %s" % killed)
            except Exception:  # TODO: split and improve try/except (fix: was a bare `except:`)
                self.logger.exception('Kill command failed')

            transfers.task_done()

    def processRetries(self, files):
        """
        _processRetries_

        Actually does the dirty work of figuring out what to do with jobs.
        For each cooled-off file selected by the plugin, flip its couch
        document back to 'new' (unless it was killed meanwhile).
        """
        if len(files) < 1:
            # We got no files?
            return
        fileList = self.loadFilesFromList(recList=files)
        logging.debug("Files in cooloff %s" % fileList)
        # Now we should have the files
        propList = self.selectFilesToRetry(fileList)
        logging.debug("Files to retry %s" % propList)
        now = str(datetime.datetime.now())
        for fileInfo in propList:
            # update couch
            self.logger.debug("Trying to resubmit %s" % fileInfo['id'])
            try:
                document = self.db.document(fileInfo['id'])
            except Exception as ex:
                msg = "Error loading document from couch"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if document['state'] != 'killed':
                data = dict()
                data['state'] = 'new'
                data['last_update'] = time.time()
                data['retry'] = now
                updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + fileInfo['id']
                updateUri += "?" + urllib.urlencode(data)
                try:
                    self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                except Exception as ex:
                    msg = "Error updating document in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                self.logger.debug("%s resubmitted" % fileInfo['id'])
            else:
                continue
        return

    def loadFilesFromList(self, recList):
        """
        _loadFilesFromList_

        Load jobs in bulk: map couch view rows to {'id', 'state_time'} dicts.
        """
        all_files = []
        index = 0
        for record in recList:
            all_files.append({})
            all_files[index]['id'] = record['key']
            all_files[index]['state_time'] = record['value']
            index += 1
        return all_files

    def selectFilesToRetry(self, fileList):
        """
        _selectFilesToRetry_

        Select files whose cooloff time has expired according to the plugin.

        :raises RetryManagerException: when the plugin check itself fails.
        """
        result = []
        if len(fileList) == 0:
            return result
        for fileInfo in fileList:
            logging.debug("Current file %s" % fileInfo)
            try:
                if self.plugin.isReady(file=fileInfo, cooloffTime=self.cooloffTime):
                    result.append(fileInfo)
            except Exception as ex:
                msg = "Exception while checking for cooloff timeout for file %s\n" % fileInfo
                msg += str(ex)
                logging.error(msg)
                logging.debug("File: %s\n" % fileInfo)
                raise RetryManagerException(msg)
        return result

    def doRetries(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions.
        """
        # Discover files that are in cooloff
        query = {'stale': 'ok'}
        try:
            files = self.db.loadView('AsyncTransfer', 'getFilesToRetry', query)['rows']
        except Exception as e:
            self.logger.exception('A problem occured when contacting \
                                  couchDB to retrieve LFNs: %s' % e)
            return
        logging.info("Found %s files in cooloff" % len(files))
        self.processRetries(files)
    def execute(self, *args, **kw):
        """Submit the task, retrying over the list of good schedulers.

        Fetches the list of healthy schedds from crabserver (falling back to
        the one pre-chosen for the task), caps excessive runtime/memory
        requests (uploading a warning), then for each schedd updates the task
        record and retries executeInternal up to max_retry+1 times.

        :return: the result of executeInternal on first success.
        :raises TaskWorkerException: when every schedd/retry combination failed.
        """
        userServer = HTTPRequests(self.server['host'], kw['task']['user_proxy'], kw['task']['user_proxy'], retry=2)
        retryIssues = []
        retryIssuesBySchedd = {}
        goodSchedulers = []
        try:
            goodSchedulers = self.server.get(self.restURInoAPI + '/info', data={'subresource': 'backendurls'})[0]['result'][0]['htcondorSchedds']
        except HTTPException as hte:
            self.logger.error(hte.headers)
            self.logger.warning("Unable to contact cmsweb. Will use only on schedulers which was chosen by CRAB3 frontend.")
        self.logger.info("Good schedulers list got from crabserver: %s " % goodSchedulers)
        submissionFailure = False
        if kw['task']['tm_schedd'] not in goodSchedulers:
            self.logger.info("Scheduler which is chosen is not in crabserver output %s." % goodSchedulers)
            self.logger.info("No late binding of schedd. Will use %s for submission." % kw['task']['tm_schedd'])
            goodSchedulers = [kw['task']['tm_schedd']]
        else:
            #Make sure that first scheduler is used which is chosen by HTCondorLocator
            try:
                goodSchedulers.remove(kw['task']['tm_schedd'])
            except ValueError:
                pass
            goodSchedulers.insert(0, kw['task']['tm_schedd'])
        self.logger.info("Final good schedulers list after shuffle: %s " % goodSchedulers)

        #Check memory and walltime and if user requires too much:
        # upload warning back to crabserver
        # change walltime to max 47h Issue: #4742
        if kw['task']['tm_maxjobruntime'] > 2800:
            msg = "task requests %s minutes of runtime but only %s is guaranteed to be available. Jobs may not find a site where to run. CRAB3 have changed this value to %s minutes" % (kw['task']['tm_maxjobruntime'], '2800', '2800')
            self.logger.warning(msg)
            args[0][1]['tm_maxjobruntime'] = '2800'
            self.uploadWarning(msg, kw['task']['user_proxy'], kw['task']['tm_taskname'])
        if kw['task']['tm_maxmemory'] > 2500:
            msg = "task requests %s memory but only %s is guaranteed to be available. Jobs may not find a site where to run and stay idle forever" % (kw['task']['tm_maxmemory'], '2500')
            self.logger.warning(msg)
            self.uploadWarning(msg, kw['task']['user_proxy'], kw['task']['tm_taskname'])

        for schedd in goodSchedulers:
            #If submission failure is true, trying to change a scheduler
            configreq = {'workflow': kw['task']['tm_taskname'],
                         'subresource': 'updateschedd',
                         'scheddname': schedd}
            try:
                userServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
                kw['task']['tm_schedd'] = schedd
            except HTTPException as hte:
                msg = "Unable to contact cmsweb and update scheduler on which task will be submitted. Error msg: %s" % hte.headers
                self.logger.warning(msg)
                time.sleep(20)
                retryIssuesBySchedd.setdefault(schedd, []).append(msg)
                continue
            for retry in range(self.config.TaskWorker.max_retry + 1): #max_retry can be 0
                self.logger.debug("Trying to submit task %s %s time." % (kw['task']['tm_taskname'], str(retry)))
                submissionFailure = False
                execInt = ""
                try:
                    execInt = self.executeInternal(*args, **kw)
                    return execInt
                except Exception as e:
                    msg = "Failed to submit task %s; '%s'" % (kw['task']['tm_taskname'], str(e))
                    self.logger.error(msg)
                    # NOTE(review): retryIssues is never reset per schedd, so
                    # earlier schedds' failures are re-recorded for later ones.
                    retryIssues.append(msg)
                    if retry < self.config.TaskWorker.max_retry: #do not sleep on the last retry
                        self.logger.error("Will retry in %s seconds." % str(self.config.TaskWorker.retry_interval[retry]))
                        time.sleep(self.config.TaskWorker.retry_interval[retry])
                    submissionFailure = True
            if submissionFailure:
                msg = "Failed to submit task %s to %s with errors %s" % (kw['task']['tm_taskname'], schedd, retryIssues)
                retryIssuesBySchedd[schedd] = retryIssues

        # All schedulers exhausted without a successful submission
        msg = "The CRAB3 server backend could not submit your jobs to the Grid schedulers. This could be a temporary glitch, please retry again later and contact"+\
              " the experts if the error persist. The submission was retried %s times on %s schedulers, these are the failures: %s" \
              % (len(retryIssues), len(retryIssuesBySchedd), str(retryIssuesBySchedd))
        self.logger.error(msg)
        raise TaskWorkerException(msg)
class ReporterWorker:
    """Per-user worker that reads Reporter JSON dropbox files and updates
    transfer states (DONE/FAILED/RETRY) in Oracle or CouchDB."""

    def __init__(self, user, config):
        """
        store the user and tfc the worker

        :param user: username this worker processes files for.
        :param config: component configuration object.
        """
        self.user = user
        self.config = config
        self.dropbox_dir = '%s/dropbox/inputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.site_tfc_map = {}
        self.logger = logging.getLogger('AsyncTransfer-Reporter-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.cleanEnvironment = ''
        self.userDN = ''
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        # TODO: improve how the worker gets a log
        self.logger.debug("Trying to get DN")
        try:
            self.userDN = getDNFromUserName(self.user, self.logger)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return
        defaultDelegation = {'logger': self.logger,
                             'credServerPath': self.config.credentialDir,
                             # It will be moved to be getfrom couchDB
                             'myProxySvr': 'myproxy.cern.ch',
                             'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                             'serverDN': self.config.serverDN,
                             'uisource': self.uiSetupScript,
                             'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
        if hasattr(self.config, "cache_area"):
            try:
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(self.config.cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s' % self.config.cache_area)
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey
        self.valid = False
        try:
            self.valid, proxy = getProxy(self.userDN, "", "", defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
        if self.valid:
            self.userProxy = proxy
        else:
            # Use the operator's proxy when the user proxy in invalid.
            # This will be moved soon
            self.logger.error('Did not get valid proxy. Setting proxy to ops proxy')
            self.userProxy = config.opsProxy
        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             config.opsProxy,
                                             config.opsProxy)
            except Exception:
                # fix: logger.exception() was called with no message, which
                # itself raises TypeError and masks the original error.
                self.logger.exception('Failed to connect to Oracle')
                raise
        else:
            server = CouchServer(dburl=self.config.couch_instance,
                                 ckey=self.config.opsProxy,
                                 cert=self.config.opsProxy)
            self.db = server.connectDatabase(self.config.files_database)
        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace=self.config.pluginDir)
        self.commandTimeout = 1200
        self.max_retry = config.max_retry
        # Proxy management in Couch
        os.environ['X509_USER_PROXY'] = self.userProxy
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key': self.config.opsProxy,
                                       'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)

    def __call__(self):
        """
        a. makes the ftscp copyjob
        b. submits ftscp
        c. deletes successfully transferred files from the DB
        """
        self.logger.info("Retrieving files for %s" % self.user)
        files_to_update = self.files_for_update()
        self.logger.info("%s files to process" % len(files_to_update))
        self.logger.debug("%s files to process" % files_to_update)
        for input_file in files_to_update:
            failed_lfns = []
            failure_reason = []
            good_lfns = []
            self.logger.info("Updating %s" % input_file)
            if os.path.basename(input_file).startswith('Reporter'):
                try:
                    json_data = json.loads(open(input_file).read())
                except Exception as e:
                    # Covers malformed JSON (ValueError) and read errors alike;
                    # a broken file is dropped so it is not re-processed forever.
                    self.logger.error("Error loading %s" % e)
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)
                    continue
                if json_data:
                    self.logger.debug('Inputs: %s %s %s' % (json_data['LFNs'], json_data['transferStatus'], json_data['failure_reason']))

                    # fix: original tested `'FAILED' or 'abandoned' or ... in list`,
                    # which is always truthy ('FAILED' is a non-empty string) and
                    # never actually inspected the status list.
                    if any(status in json_data['transferStatus'] for status in ('FAILED', 'abandoned', 'CANCELED', 'lost')):
                        # Sort failed files
                        failed_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'FAILED' or x == 'CANCELED']
                        abandoned_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'abandoned']
                        failed_indexes.extend(abandoned_indexes)
                        self.logger.info('failed indexes %s' % len(failed_indexes))
                        self.logger.debug('failed indexes %s' % failed_indexes)
                        for i in failed_indexes:
                            failed_lfns.append(json_data['LFNs'][i])
                            failure_reason.append(json_data['failure_reason'][i])
                        self.logger.debug('Marking failed %s %s' % (failed_lfns, failure_reason))
                        updated_failed_lfns = self.mark_failed(failed_lfns, failure_reason)

                    # fix: same always-truthy pattern as above.
                    if any(status in json_data['transferStatus'] for status in ('Done', 'FINISHED', 'Finishing')):
                        # Sort good files
                        good_indexes = [i for i, x in enumerate(json_data['transferStatus'])
                                        if (x == 'Done' or x == 'FINISHED' or x == 'Finishing')]
                        self.logger.info('good indexes %s' % len(good_indexes))
                        self.logger.debug('good indexes %s' % good_indexes)
                        for i in good_indexes:
                            good_lfns.append(json_data['LFNs'][i])
                        self.logger.info('Marking good %s' % (good_lfns))
                        try:
                            updated_good_lfns = self.mark_good(good_lfns)
                        except Exception:  # fix: was a bare `except:`
                            self.logger.exception('Either no files to mark or failed to update state')
                    # Remove the json file
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)
                else:
                    self.logger.info('Empty file %s' % input_file)
                    continue
            else:
                self.logger.info('File not for the Reporter %s' % input_file)
                continue
        self.logger.info('Update completed')
        return

    def files_for_update(self):
        """
        Retrieve the list of files to update.
        """
        files_to_update = []
        user_dir = os.path.join(self.dropbox_dir, self.user)
        self.logger.info('Looking into %s' % user_dir)
        for user_file in os.listdir(user_dir):
            files_to_update.append(os.path.join(self.dropbox_dir, self.user, user_file))
        return files_to_update

    def mark_good(self, files):
        """
        Mark the list of files as tranferred.

        :param files: list of LFNs successfully transferred.
        :return: list of LFNs actually updated ({} on Oracle update failure).
        """
        updated_lfn = []
        good_ids = []
        if len(files) == 0:
            return updated_lfn
        for it, lfn in enumerate(files):
            hash_lfn = getHashLfn(lfn)
            self.logger.info("Marking good %s" % hash_lfn)
            self.logger.debug("Marking good %s" % lfn)
            if not self.config.isOracle:
                try:
                    document = self.db.document(hash_lfn)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                self.logger.info("Doc %s Loaded" % hash_lfn)
            try:
                now = str(datetime.datetime.now())
                last_update = time.time()
                if self.config.isOracle:
                    # Oracle updates are batched: collect the ids, POST below.
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                else:
                    if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                        outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                        data = dict()
                        data['end_time'] = now
                        data['state'] = 'done'
                        data['lfn'] = outputLfn
                        data['last_update'] = last_update
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn)
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(lfn)
                        self.logger.debug("Marked good %s" % lfn)
                    else:
                        updated_lfn.append(lfn)
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
            except Exception as ex:
                msg = "Error updating document"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
        if self.config.isOracle:
            try:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = good_ids
                data['list_of_transfer_state'] = ["DONE" for x in good_ids]
                result = self.oracleDB.post(self.config.oracleFileTrans,
                                            data=encodeRequest(data))
                self.logger.debug("Marked good %s" % good_ids)
            except Exception:
                self.logger.exception('Error updating document')
                return {}
            # NOTE(review): docId/lfn here refer to the *last* file of the loop
            # only — presumably source cleanup of the remaining files happens
            # elsewhere; confirm.
            self.logger.info("Transferred file %s updated, removing now source file" % docId)
            try:
                docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'),
                                            data=encodeRequest({'subresource': 'getById', 'id': docId}))
                document = oracleOutputMapping(docbyId, None)[0]
            except Exception:
                msg = "Error getting file from source"
                self.logger.exception(msg)
                return {}
            if document["source"] not in self.site_tfc_map:
                self.logger.debug("site not found... gathering info from phedex")
                self.site_tfc_map[document["source"]] = self.get_tfc_rules(document["source"])
            pfn = self.apply_tfc_to_lfn('%s:%s' % (document["source"], lfn))
            self.logger.debug("File has to be removed now from source site: %s" % pfn)
            self.remove_files(self.userProxy, pfn)
            self.logger.debug("Transferred file removed from source")
        return updated_lfn

    def remove_files(self, userProxy, pfn):
        """Delete a PFN from the source site with gfal-rm under the given proxy."""
        command = 'env -i X509_USER_PROXY=%s gfal-rm -v -t 180 %s' % \
                  (userProxy, pfn)
        logging.debug("Running remove command %s" % command)
        # fix: rc/stdout/stderr were unbound (NameError) when execute_command
        # raised; pre-initialise to a failure state.
        rc, stdout, stderr = 1, '', ''
        try:
            rc, stdout, stderr = execute_command(command, self.logger, 3600)
        except Exception as ex:
            self.logger.error(ex)
        if rc:
            logging.info("Deletion command failed with output %s and error %s" % (stdout, stderr))
        else:
            logging.info("File Deleted.")
        return

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception:
            self.logger.exception('A problem occured when getting the TFC regexp: %s')
            return None
        return readTFC(tfc_file)

    def apply_tfc_to_lfn(self, file):
        """
        Take a CMS_NAME:lfn string and make a pfn.
        Update pfn_to_lfn_mapping dictionary.

        :return: the pfn string, or None when the input/TFC is broken.
        """
        try:
            site, lfn = tuple(file.split(':'))
        except Exception:
            self.logger.exception('It does not seem to be an lfn %s' % file.split(':'))
            return None
        if site in self.site_tfc_map:
            pfn = self.site_tfc_map[site].matchLFN('srmv2', lfn)
            # TODO: improve fix for wrong tfc on sites
            try:
                if pfn.find("\\") != -1:
                    pfn = pfn.replace("\\", "")
                if pfn.split(':')[0] != 'srm' and pfn.split(':')[0] != 'gsiftp':
                    self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                    return None
            except IndexError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            except AttributeError:
                # matchLFN returned None
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            return pfn
        else:
            self.logger.error('Wrong site %s!' % site)
            return None

    def mark_failed(self, files=None, failures_reasons=None, force_fail=False):
        """
        Something failed for these files so increment the retry count.

        :param files: list of LFNs (or couch view rows) that failed.
        :param failures_reasons: list of failure messages, parallel to files.
        :param force_fail: when True, mark FAILED regardless of retry count.
        :return: list of LFNs/docIds actually moved to a final state.
        """
        # fix: mutable default arguments ([]) are shared across calls.
        files = files if files is not None else []
        failures_reasons = failures_reasons if failures_reasons is not None else []
        updated_lfn = []
        for lfn in files:
            data = {}
            self.logger.debug("Document: %s" % lfn)
            if not isinstance(lfn, dict):
                if 'temp' not in lfn:
                    temp_lfn = lfn.replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn
            else:
                if 'temp' not in lfn['value']:
                    temp_lfn = lfn['value'].replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn['value']
            docId = getHashLfn(temp_lfn)
            # Load document to get the retry_count
            if self.config.isOracle:
                try:
                    self.logger.debug("Document: %s" % docId)
                    docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'),
                                                data=encodeRequest({'subresource': 'getById', 'id': docId}))
                    document = oracleOutputMapping(docbyId)[0]
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = docId
                    if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                        data['list_of_transfer_state'] = 'FAILED'
                        data['list_of_retry_value'] = 0
                    else:
                        data['list_of_transfer_state'] = 'RETRY'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['list_of_transfer_state'] = 'FAILED'
                    data['list_of_failure_reason'] = failures_reasons[files.index(lfn)]
                    data['list_of_retry_value'] = 0
                    self.logger.debug("update: %s" % data)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(data))
                    if not data['list_of_transfer_state'] == 'RETRY':
                        updated_lfn.append(lfn)
                    self.logger.debug("Marked failed %s" % lfn)
                except Exception as ex:
                    self.logger.error("Error updating document status: %s" % ex)
                    continue
            else:
                try:
                    document = self.db.document(docId)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                        data['end_time'] = now
                    else:
                        data['state'] = 'retry'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['state'] = 'failed'
                            data['end_time'] = now
                    self.logger.debug("Failure list: %s" % failures_reasons)
                    self.logger.debug("Files: %s" % files)
                    self.logger.debug("LFN %s" % lfn)
                    data['failure_reason'] = failures_reasons[files.index(lfn)]
                    data['last_update'] = last_update
                    data['retry'] = now
                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                else:
                    updated_lfn.append(docId)
                    self.logger.debug("failed file updated")
        return updated_lfn

    def determine_fatal_error(self, failure=""):
        """
        Determine if transfer error is fatal or not.

        :param failure: failure message from the transfer tool.
        :return: True when the message matches a known permanent failure.
        """
        permanent_failure_reasons = [
            ".*canceled because it stayed in the queue for too long.*",
            ".*permission denied.*",
            ".*disk quota exceeded.*",
            ".*operation not permitted*",
            ".*mkdir\(\) fail.*",
            ".*open/create error.*",
            ".*mkdir\: cannot create directory.*",
            ".*does not have enough space.*"
        ]
        failure = str(failure).lower()
        for permanent_failure_reason in permanent_failure_reasons:
            if re.match(permanent_failure_reason, failure):
                return True
        return False

    def mark_incomplete(self, files=None):
        """
        Mark the list of files as acquired
        """
        # fix: original called self.logger(...) — a Logger is not callable and
        # that raised TypeError instead of logging the intended message.
        self.logger.error('Something called mark_incomplete which should never be called')
    def execute(self, *args, **kwargs):
        """Run the job splitting for the task and return the job factory.

        Builds a WMCore Subscription from the input fileset (args[0]) and the
        task's splitting algorithm/arguments, runs the splitter, enforces the
        0 < numJobs <= maxJobsPerTask bounds, and warns the user (via the
        crabserver REST) when duplicated lumis are found in the input.

        :raises TaskWorkerException: when splitting yields no jobs or too many.
        :return: Result wrapping the job factory.
        """
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])
        wmsubs = Subscription(fileset=args[0], workflow=wmwork,
                              split_algo=kwargs['task']['tm_split_algo'],
                              type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        # Map the task's total units onto the parameter name each algorithm expects
        if kwargs['task']['tm_job_type'] == 'Analysis':
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'EventAwareLumiBased':
                splitparam['total_events'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task']['tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task']['tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task']['tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        numJobs = sum([len(jobgroup.getJobs()) for jobgroup in factory])
        maxJobs = getattr(self.config.TaskWorker, 'maxJobsPerTask', 10000)
        if numJobs == 0:
            msg = "The CRAB3 server backend could not submit any job to the Grid scheduler:"
            msg += " Splitting task %s" % (kwargs['task']['tm_taskname'])
            if kwargs['task']['tm_input_dataset']:
                msg += " on dataset %s" % (kwargs['task']['tm_input_dataset'])
            msg += " with %s method does not generate any job" % (kwargs['task']['tm_split_algo'])
            raise TaskWorkerException(msg)
        elif numJobs > maxJobs:
            raise TaskWorkerException("The splitting on your task generated %s jobs. The maximum number of jobs in each task is %s" % (numJobs, maxJobs))
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning("The input dataset contains the following duplicated lumis %s" % lumiChecker.splitLumiFiles.keys())
            #TODO use self.uploadWarning
            try:
                userServer = HTTPRequests(self.server['host'], kwargs['task']['user_proxy'], kwargs['task']['user_proxy'])
                configreq = {'subresource': 'addwarning',
                             'workflow': kwargs['task']['tm_taskname'],
                             'warning': b64encode('The CRAB3 server backend detected lumis split across files in the input dataset.'
                                                  ' Will apply the necessary corrections in the splitting algorithms. You can ignore this message.')}
                userServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
            except HTTPException as hte:
                # Best effort: a failed warning upload must not fail the splitting
                self.logger.error(hte.headers)
                self.logger.warning("Cannot add warning to REST after finding duplicates")
        return Result(task=kwargs['task'], result=factory)
def processWorker(inputs, results, resthost, resturi, procnum):
    """Wait for an reference to appear in the input queue, call the referenced object
    and write the output in the output queue.

    :arg Queue inputs: the queue where the inputs are shared by the master
    :arg Queue results: the queue where this method writes the output
    :return: default returning zero, but not really needed."""
    logger = setProcessLogger(str(procnum))
    logger.info("Process %s is starting. PID %s", procnum, os.getpid())
    procName = "Process-%s" % procnum
    while True:
        try:
            workid, work, task, inputargs = inputs.get()
        except (EOFError, IOError):
            # queue endpoint went away: treat as a shutdown request
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break
        if work == 'STOP':
            # explicit stop sentinel sent by the master
            break
        outputs = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s" %(procName, str(work), task['tm_taskname']))
        try:
            msg = None
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            # best-effort: record the failure on the REST so the task is marked FAILED
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s" % msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry = 2)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': "FAILED",
                                 'subresource': 'failure',
                                 #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                                 'failure': b64encode(truncMsg)}
                    server.post(resturi, data = urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follows:" % task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc: #pylint: disable=broad-except
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s" % (task['tm_taskname'], exc))
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s" % (procName, task['tm_taskname'], t1-t0, outputs))
        results.put({'workid': workid, 'out' : outputs})
    logger.debug("Slave %s exiting." % procnum)
    return 0
def execute(self, *args, **kwargs):
    """Submit the task to one of the available schedds, retrying on failure.

    Builds a prioritized list of "good" schedulers (the schedd pre-chosen by
    the CRAB3 frontend first), clamps user-requested runtime/memory to the
    guaranteed limits (uploading a warning to crabserver when clamping), then
    tries each schedd up to max_retry + 1 times.

    :raises TaskWorkerException: when every retry on every schedd failed.
    """
    userServer = HTTPRequests(self.server['host'], kwargs['task']['user_proxy'], kwargs['task']['user_proxy'], retry=2, logger=self.logger)
    retryIssuesBySchedd = {}
    goodSchedulers = []
    try:
        goodSchedulers = self.server.get(self.restURInoAPI + '/info', data={'subresource': 'backendurls'})[0]['result'][0]['htcondorSchedds']
        goodSchedulers = list(set(goodSchedulers)) #we do not care about duplicates
    except HTTPException as hte:
        self.logger.error(hte.headers)
        self.logger.warning("Unable to contact cmsweb. Will use only on schedulers which was chosen by CRAB3 frontend.")
    self.logger.info("Good schedulers list got from crabserver: %s ", goodSchedulers)
    if kwargs['task']['tm_schedd'] not in goodSchedulers:
        self.logger.info("Scheduler which is chosen is not in crabserver output %s.", goodSchedulers)
        self.logger.info("No late binding of schedd. Will use %s for submission.", kwargs['task']['tm_schedd'])
        goodSchedulers = [kwargs['task']['tm_schedd']]
    else:
        #Make sure that first scheduler is used which is chosen by HTCondorLocator
        try:
            goodSchedulers.remove(kwargs['task']['tm_schedd'])
        except ValueError:
            pass
        goodSchedulers.insert(0, kwargs['task']['tm_schedd'])
    self.logger.info("Final good schedulers list after shuffle: %s ", goodSchedulers)
    #Check memory and walltime and if user requires too much:
    # upload warning back to crabserver
    # change walltime to max 47h Issue: #4742
    stdmaxjobruntime = 2750
    stdmaxmemory = 2500
    if kwargs['task']['tm_maxjobruntime'] > stdmaxjobruntime:
        msg = "Task requests %s minutes of runtime, but only %s minutes are guaranteed to be available." % (kwargs['task']['tm_maxjobruntime'], stdmaxjobruntime)
        msg += " Jobs may not find a site where to run."
        msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
        self.logger.warning(msg)
        args[0][1]['tm_maxjobruntime'] = str(stdmaxjobruntime)
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
    if kwargs['task']['tm_maxmemory'] > stdmaxmemory:
        msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (kwargs['task']['tm_maxmemory'], stdmaxmemory)
        msg += " Jobs may not find a site where to run and stay idle forever."
        self.logger.warning(msg)
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
    for schedd in goodSchedulers:
        #If submission failure is true, trying to change a scheduler
        configreq = {'workflow': kwargs['task']['tm_taskname'], 'subresource': 'updateschedd', 'scheddname': schedd}
        try:
            userServer.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
            kwargs['task']['tm_schedd'] = schedd
        except HTTPException as hte:
            msg = "Unable to contact cmsweb and update scheduler on which task will be submitted. Error msg: %s" % hte.headers
            self.logger.warning(msg)
            time.sleep(20)
            retryIssuesBySchedd[schedd] = [msg]
            continue
        retryIssues = []
        for retry in range(self.config.TaskWorker.max_retry + 1): #max_retry can be 0
            self.logger.debug("Trying to submit task %s %s time.", kwargs['task']['tm_taskname'], str(retry))
            try:
                execInt = self.executeInternal(*args, **kwargs)
                return execInt
            except Exception as ex: #pylint: disable=broad-except
                msg = "Failed to submit task %s; '%s'"% (kwargs['task']['tm_taskname'], str(ex))
                self.logger.exception(msg)
                retryIssues.append(msg)
                if retry < self.config.TaskWorker.max_retry: #do not sleep on the last retry
                    self.logger.error("Will retry in %s seconds.", self.config.TaskWorker.retry_interval[retry])
                    time.sleep(self.config.TaskWorker.retry_interval[retry])
        ## All the submission retries to the current schedd have failed. Record the
        ## failures.
        retryIssuesBySchedd[schedd] = retryIssues
    ## All the submission retries to all possible schedds have failed.
    msg = "The CRAB server backend was not able to submit the jobs to the Grid schedulers."
    msg += " This could be a temporary glitch. Please try again later."
    msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
    msg += " The submission was retried %s times on %s schedulers." % (sum(map(len, retryIssuesBySchedd.values())), len(retryIssuesBySchedd))
    msg += " These are the failures per Grid scheduler: %s" % (str(retryIssuesBySchedd))
    self.logger.error(msg)
    raise TaskWorkerException(msg)
except Exception, exc: outputs = Result(task=task, err=str(exc)) msg = "%s: I just had a failure for %s" % (procName, str(exc)) msg += "\n\tworkid=" + str(workid) msg += "\n\ttask=" + str(task['tm_taskname']) msg += "\n" + str(traceback.format_exc()) finally: if msg: try: server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey) configreq = { 'workflow': task['tm_taskname'], 'status': "FAILED", 'subresource': 'failure', 'failure': b64encode(msg)} server.post(resturi, data = urllib.urlencode(configreq)) logger.info("Error message successfully uploaded to the REST") except Exception, exc: logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s" % (task['tm_taskname'], exc)) t1 = time.time() logger.debug("%s: ...work on %s completed in %d seconds: %s" % (procName, task['tm_taskname'], t1-t0, outputs)) results.put({ 'workid': workid, 'out' : outputs }) logger.debug("Slave exiting.") return 0
def submit(phedex, ftsContext, toTrans): """ submit tranfer jobs - group files to be transferred by source site - prepare jobs chunks of max 200 transfers - submit fts job :param ftsContext: fts client ftsContext :param toTrans: [source pfn, destination pfn, oracle file id, source site] :return: list of jobids submitted """ threadLock = threading.Lock() threads = [] jobids = [] to_update = [] oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy) sources = list(set([x[3] for x in toTrans])) for source in sources: ids = [x[2] for x in toTrans if x[3] == source] username = toTrans[0][5] taskname = toTrans[0][6] src_lfns = [x[0] for x in toTrans if x[3] == source] dst_lfns = [x[1] for x in toTrans if x[3] == source] sorted_source_pfns = [] sorted_dest_pfns = [] try: for chunk in chunks(src_lfns, 10): unsorted_source_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(source, chunk).items()] for order_lfn in chunk: for lfn, pfn in unsorted_source_pfns: if order_lfn == lfn: sorted_source_pfns.append(pfn) break for chunk in chunks(dst_lfns, 10): unsorted_dest_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(toTrans[0][4], chunk).items()] for order_lfn in chunk: for lfn, pfn in unsorted_dest_pfns: if order_lfn == lfn: sorted_dest_pfns.append(pfn) break except Exception as ex: logging.error("Failed to map lfns to pfns: %s", ex) mark_failed(ids, ["Failed to map lfn to pfn: " + str(ex) for _ in ids]) source_pfns = sorted_source_pfns dest_pfns = sorted_dest_pfns tx_from_source = [[x[0], x[1], x[2], source, username, taskname] for x in zip(source_pfns, dest_pfns, ids)] for files in chunks(tx_from_source, 200): thread = submit_thread(threadLock, logging, ftsContext, files, source, jobids, to_update) thread.start() threads.append(thread) for t in threads: t.join() for fileDoc in to_update: _ = oracleDB.post('/filetransfers', data=encodeRequest(fileDoc)) logging.info("Marked submitted %s files", fileDoc['list_of_ids']) return jobids
class FileTransfersTest(unittest.TestCase):
    """
    _FileTransfersTest_

    Unit tests for the FileTransfers API
    """
    def setUp(self):
        """
        Setup for unit tests
        """
        # REST client; SERVER_HOST and X509_USER_PROXY must be set in the environment
        self.server = HTTPRequests(os.environ['SERVER_HOST'], os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'])
        self.lfnBase = '/store/temp/user/%s/my_cool_dataset-%s/file-%s-%s.root'
        # template document; the 'OVERWRITE' fields are filled per file in the tests
        self.fileDoc = {'id': 'OVERWRITE',
                        'username': '******',
                        'taskname': 'OVERWRITE',
                        'start_time': 0,
                        'destination': 'T2_CH_CERN',
                        'destination_lfn': 'OVERWRITE',
                        'source': 'T2_US_Caltech',
                        'source_lfn': 'OVERWRITE',
                        'filesize': random.randint(1, 9999),
                        'publish': 1,
                        'transfer_state': 'OVERWRITE',
                        'publication_state': 'OVERWRITE',
                        'job_id': 1,
                        'job_retry_count': 0,
                        'type': 'log',
                        'rest_host': 'cmsweb.cern.ch',
                        'rest_uri': '/crabserver/prod/'}
        self.ids = []
        self.users = ['jbalcas', 'mmascher', 'dciangot', 'riahi', 'erupeika', 'sbelforte']  # just random users for tests
        self.tasks = {}
        self.totalFiles = 10

    def testFileTransferPUT(self):
        """
        _testFileTransferPUT_

        Just test simple testFileTransferPUT with fake data
        """
        # We just sent fake data which is not monitored by dashboard.
        # Also only the first time to decide is publication ON or NOT
        for user in self.users:
            timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime())
            for i in range(self.totalFiles):
                now = int(time.time())
                # Generate a taskname
                workflowName = ""
                taskname = ""
                if user not in self.tasks:
                    workflowName = "".join([random.choice(string.ascii_lowercase) for _ in range(20)]) + "_" + str(now)
                    publicationState = random.choice(['NEW', 'NOT_REQUIRED'])
                else:
                    workflowName = self.tasks[user]['workflowName']
                    publicationState = self.tasks[user]['publication']
                transferState = random.choice(['NEW', 'DONE'])
                taskname = generateTaskName(user, workflowName, timestamp)
                finalLfn = self.lfnBase % (user, workflowName, i, random.randint(1, 9999))
                idHash = getHashLfn(finalLfn)
                self.fileDoc['id'] = idHash
                self.fileDoc['job_id'] = i
                self.fileDoc['username'] = user
                self.fileDoc['taskname'] = taskname
                self.fileDoc['start_time'] = int(time.time())
                self.fileDoc['source_lfn'] = finalLfn
                self.fileDoc['destination_lfn'] = finalLfn
                self.fileDoc['transfer_state'] = transferState
                self.fileDoc['publication_state'] = publicationState
                print(self.fileDoc)
                self.server.put('/crabserver/dev/fileusertransfers', data=encodeRequest(self.fileDoc))
                # if I will put the same doc twice, it should raise an error.
                # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc))
                # This tasks are for the future and next calls
                if user not in self.tasks:
                    self.tasks[user] = {'workflowName': workflowName, 'taskname': taskname, 'listOfIds': [],
                                        'publication': publicationState, 'toTransfer': 0, 'toPublish': 0,
                                        'total': self.totalFiles}
                if self.tasks[user]['publication'] == 'NEW':
                    self.tasks[user]['toPublish'] += 1
                if transferState == 'NEW':
                    self.tasks[user]['toTransfer'] += 1
                self.tasks[user]['listOfIds'].append(idHash)
        # This should raise an error
        for username in self.tasks:
            taskname = self.tasks[username]['taskname']
            for query in ['getTransferStatus', 'getPublicationStatus']:
                result = self.server.get('/crabserver/dev/fileusertransfers',
                                         data=encodeRequest({'subresource': query,
                                                             'username': username,
                                                             'taskname': taskname}))
                print(result)
                print(result[0]['result'])
                taskInfoDict = oracleOutputMapping(result, 'id')
                print(taskInfoDict)
                for key, docDict in taskInfoDict.items():
                    result = self.server.get('/crabserver/dev/fileusertransfers',
                                             data=encodeRequest({'subresource': 'getById', 'id': key}))
        randomUsers = random.sample(set(self.users), 3)  # Take half of the users and kill their transfers for specific task
        for username in randomUsers:
            taskname = self.tasks[username]['taskname']
            result = self.server.post('/crabserver/dev/fileusertransfers',
                                      data=encodeRequest({'subresource': 'killTransfers',
                                                          'username': username,
                                                          'taskname': taskname}))
            print(result)
        # oneUser is left for killing a list of IDs
        # leftUsers will be killing transfers one by one for specific id.
        leftUsers = list(set(self.users) - set(randomUsers))
        oneUser = random.sample(set(leftUsers), 1)
        leftUsers = list(set(leftUsers) - set(oneUser))
        for username in leftUsers:
            # First get all left ids for this users
            result = self.server.get('/crabserver/dev/fileusertransfers',
                                     data=encodeRequest({'subresource': 'getTransferStatus',
                                                         'username': username,
                                                         'taskname': self.tasks[username]['taskname']}))
            resultOut = oracleOutputMapping(result, None)
            print("**" * 50)
            for outDict in resultOut:
                print(outDict)
                result = self.server.post('/crabserver/dev/fileusertransfers',
                                          data=encodeRequest({'subresource': 'killTransfersById',
                                                              'username': username,
                                                              'listOfIds': outDict['id']}))
                print(result)
            print(resultOut)
            print(result)
        for username in oneUser:
            result = self.server.post('/crabserver/dev/fileusertransfers',
                                      data=encodeRequest({'subresource': 'killTransfersById',
                                                          'username': username,
                                                          'listOfIds': self.tasks[username]['listOfIds']},
                                                         ['listOfIds']))
            # As it asks to kill all which are in new, need to double check what we submitted before and if the output of killed is correct
            print(result)
            print(self.tasks[username])
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :arg bool test: it tells if to run in test (no subprocesses) mode."""

        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""
            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')
            if self.TEST: #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT = self.config.TaskWorker.cmscert,
                                                         X509_USER_KEY = self.config.TaskWorker.cmskey)
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]

    def getRecurringActionInst(self, actionName):
        # dynamically import the recurring action class named actionName and instantiate it
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False
        return True

    def getWork(self, limit, getstatus):
        # fetch up to `limit` tasks in state `getstatus` for this worker
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the getWork request (prameters are %s)", configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        # signal handler: only raise the STOP flag, the main loop exits on its own
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeded, False otherwise
        """
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)
        else:
            return True #success
        return False #failure

    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""
        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue
            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')
            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))
            toInject = []
            for task in pendingwork:
                if self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info("Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration", task['tm_taskname'])
            self.slaves.injectWorks(toInject)
            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_username': '******', 'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])
            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())
            time.sleep(self.config.TaskWorker.polling)
            dummyFinished = self.slaves.checkFinished()
        self.logger.debug("Master Worker Exiting Main Cycle.")
def execute(self, *args, **kwargs):
    """Submit the task to one of the available schedds, retrying on failure.

    Builds a prioritized list of "good" schedulers (the schedd pre-chosen by
    the CRAB3 frontend first), clamps user-requested runtime/memory to the
    guaranteed limits (uploading a warning to crabserver when clamping), then
    tries each schedd up to max_retry + 1 times, recording per-schedd
    success/failure via scheddStats.

    :raises TaskWorkerException: when every retry on every schedd failed.
    """
    userServer = HTTPRequests(
        self.server["host"],
        kwargs["task"]["user_proxy"],
        kwargs["task"]["user_proxy"],
        retry=20,
        logger=self.logger,
    )
    retryIssuesBySchedd = {}
    goodSchedulers = []
    try:
        goodSchedulers = self.server.get(self.restURInoAPI + "/info", data={"subresource": "backendurls"})[0][
            "result"
        ][0]["htcondorSchedds"]
        goodSchedulers = list(set(goodSchedulers))  # we do not care about duplicates
    except HTTPException as hte:
        self.logger.error(hte.headers)
        self.logger.warning(
            "Unable to contact cmsweb. Will use only on schedulers which was chosen by CRAB3 frontend."
        )
    self.logger.info("Good schedulers list got from crabserver: %s ", goodSchedulers)
    if kwargs["task"]["tm_schedd"] not in goodSchedulers:
        self.logger.info("Scheduler which is chosen is not in crabserver output %s.", goodSchedulers)
        self.logger.info("No late binding of schedd. Will use %s for submission.", kwargs["task"]["tm_schedd"])
        goodSchedulers = [kwargs["task"]["tm_schedd"]]
    else:
        # Make sure that first scheduler is used which is chosen by HTCondorLocator
        try:
            goodSchedulers.remove(kwargs["task"]["tm_schedd"])
        except ValueError:
            pass
        goodSchedulers.insert(0, kwargs["task"]["tm_schedd"])
    self.logger.info("Final good schedulers list after shuffle: %s ", goodSchedulers)
    # Check memory and walltime and if user requires too much:
    # upload warning back to crabserver
    # change walltime to max 47h Issue: #4742
    stdmaxjobruntime = 2750
    stdmaxmemory = 2500
    if kwargs["task"]["tm_maxjobruntime"] > stdmaxjobruntime:
        msg = "Task requests %s minutes of runtime, but only %s minutes are guaranteed to be available." % (
            kwargs["task"]["tm_maxjobruntime"],
            stdmaxjobruntime,
        )
        msg += " Jobs may not find a site where to run."
        msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
        self.logger.warning(msg)
        args[0][1]["tm_maxjobruntime"] = str(stdmaxjobruntime)
        self.uploadWarning(msg, kwargs["task"]["user_proxy"], kwargs["task"]["tm_taskname"])
    if kwargs["task"]["tm_maxmemory"] > stdmaxmemory:
        msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
            kwargs["task"]["tm_maxmemory"],
            stdmaxmemory,
        )
        msg += " Jobs may not find a site where to run and stay idle forever."
        self.logger.warning(msg)
        self.uploadWarning(msg, kwargs["task"]["user_proxy"], kwargs["task"]["tm_taskname"])
    for schedd in goodSchedulers:
        # If submission failure is true, trying to change a scheduler
        configreq = {"workflow": kwargs["task"]["tm_taskname"], "subresource": "updateschedd", "scheddname": schedd}
        try:
            userServer.post(self.restURInoAPI + "/task", data=urllib.urlencode(configreq))
            kwargs["task"]["tm_schedd"] = schedd
        except HTTPException as hte:
            msg = (
                "Unable to contact cmsweb and update scheduler on which task will be submitted. Error msg: %s"
                % hte.headers
            )
            self.logger.warning(msg)
            time.sleep(20)
            retryIssuesBySchedd[schedd] = [msg]
            continue
        retryIssues = []
        for retry in range(self.config.TaskWorker.max_retry + 1):  # max_retry can be 0
            self.logger.debug("Trying to submit task %s %s time.", kwargs["task"]["tm_taskname"], str(retry))
            try:
                execInt = self.executeInternal(*args, **kwargs)
                scheddStats.success(schedd, self.clusterId)
                return execInt
            except Exception as ex:  # pylint: disable=broad-except
                scheddStats.failure(schedd)
                msg = "Failed to submit task %s; '%s'" % (kwargs["task"]["tm_taskname"], str(ex))
                self.logger.exception(msg)
                retryIssues.append(msg)
                if retry < self.config.TaskWorker.max_retry:  # do not sleep on the last retry
                    self.logger.error("Will retry in %s seconds.", self.config.TaskWorker.retry_interval[retry])
                    time.sleep(self.config.TaskWorker.retry_interval[retry])
            finally:
                self.logger.info(scheddStats)
        ## All the submission retries to the current schedd have failed. Record the
        ## failures.
        retryIssuesBySchedd[schedd] = retryIssues
    ## All the submission retries to all possible schedds have failed.
    msg = "The CRAB server backend was not able to submit the jobs to the Grid schedulers."
    msg += " This could be a temporary glitch. Please try again later."
    msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
    msg += " The submission was retried %s times on %s schedulers." % (
        sum(map(len, retryIssuesBySchedd.values())),
        len(retryIssuesBySchedd),
    )
    msg += " These are the failures per Grid scheduler: %s" % (str(retryIssuesBySchedd))
    self.logger.error(msg)
    raise TaskWorkerException(msg)
class update(object):
    """
    Helper around the Oracle REST 'filetransfers'/'task' resources: acquires,
    queries and updates the state of transfer and publication documents on
    behalf of one asoworker.
    """

    def __init__(self, logger, config):
        """
        Initialize connection to the db and logging/config

        :param logger: pass the logging
        :param config: refer to the configuration file
        """
        self.oracleDB = HTTPRequests(config.oracleDB,
                                     config.opsProxy,
                                     config.opsProxy)
        self.config = config
        self.logger = logger

    def retry(self):
        """
        Retry documents older than self.config.cooloffTime

        :return: always 0 (failures are only logged)
        """
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'retryTransfers'
        fileDoc['time_to'] = self.config.cooloffTime
        self.logger.debug('fileDoc: %s' % fileDoc)
        results = dict()
        try:
            results = self.oracleDB.post(self.config.oracleFileTrans,
                                         data=encodeRequest(fileDoc))
        except Exception:
            # logger.exception appends the traceback itself; the original
            # message carried a dangling '%s' placeholder with no argument.
            self.logger.exception("Failed to get retry transfers in oracleDB")
        self.logger.info("Retried files in cooloff: %s" % str(results))
        return 0

    def acquire(self):
        """
        Get a number (1k for current oracle rest) of documents and bind them to this aso
        NEW -> ACQUIRED (asoworker NULL -> config.asoworker)

        :return: 1 if the active-user query fails, otherwise the list of
                 [username, user_group, user_role] triplets processed
                 (note: mixed return types are kept for caller compatibility)
        """
        self.logger.info('Retrieving users...')
        fileDoc = dict()
        fileDoc['subresource'] = 'activeUsers'
        fileDoc['grouping'] = 0
        fileDoc['asoworker'] = self.config.asoworker
        try:
            # BUG FIX: the query result was previously discarded, which made
            # the oracleOutputMapping(result) call below raise a NameError.
            result = self.oracleDB.get(self.config.oracleFileTrans,
                                       data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
            return 1
        users = list()
        try:
            docs = oracleOutputMapping(result)
            users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
            self.logger.info('Users to process: %s' % str(users))
        except Exception:
            # narrowed from a bare 'except:'
            self.logger.exception('User data malformed. ')
        for user in users:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquireTransfers'
            fileDoc['username'] = user[0]
            self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user)
            try:
                self.oracleDB.post(self.config.oracleFileTrans,
                                   data=encodeRequest(fileDoc))
            except Exception as ex:
                self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
        return users

    def getAcquired(self, users):
        """
        Get a number of documents to be submitted (in ACQUIRED status)
        and return results of the query for logs

        :param users: list of [username, group, role] triplets
        :return: list of transfer documents (dicts); users whose query
                 fails are skipped with an error log
        """
        documents = list()
        for user in users:
            username = user[0]
            group = user[1]
            role = user[2]
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquiredTransfers'
            fileDoc['grouping'] = 1
            fileDoc['username'] = username
            # empty strings mean "no group/role" and are sent as null
            if group == '':
                group = None
            if role == '':
                role = None
            fileDoc['vogroup'] = group
            fileDoc['vorole'] = role
            self.logger.debug("Retrieving users from oracleDB")
            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                documents += oracleOutputMapping(results)
            except Exception as ex:
                self.logger.error("Failed to get acquired transfers from oracleDB: %s" % ex)
        return documents

    def submitted(self, files):
        """
        Mark the list of files as submitted once the FTS submission succeeded
        ACQUIRED -> SUBMITTED
        Return the lfns updated successfully and report data for dashboard

        :param files: tuple (source_lfn, dest_lfn)
        :return: (list of lfns in transfer, dashboard report tuple)
        """
        lfn_in_transfer = []
        dash_rep = ()
        id_list = list()
        docId = ''
        for lfn in files:
            lfn = lfn[0]
            # presumably matches '/store/temp/...' LFNs ('temp' at index 7)
            # -- TODO confirm against callers
            if lfn.find('temp') == 7:
                self.logger.debug("Marking acquired %s" % lfn)
                docId = getHashLfn(lfn)
                self.logger.debug("Marking acquired %s" % docId)
                try:
                    id_list.append(docId)
                    lfn_in_transfer.append(lfn)
                except Exception as ex:
                    self.logger.error("Error getting id: %s" % ex)
                    raise
                # NOTE(review): this second append duplicates the one inside
                # the try above; looks unintentional -- confirm before removing
                lfn_in_transfer.append(lfn)
                # TODO: add dashboard stuff
                # dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])
        try:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = id_list
            fileDoc['list_of_transfer_state'] = ["SUBMITTED" for x in id_list]
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
            self.logger.debug("Marked acquired %s" % (id_list))
        except Exception as ex:
            self.logger.error("Error during status update: %s" % ex)
        return lfn_in_transfer, dash_rep

    def transferred(self, files):
        """
        Mark the list of files as tranferred
        SUBMITTED -> DONE

        :param files: tuple (source_lfn, dest_lfn)
        :return: 0 success, 1 failure
        """
        good_ids = list()
        updated_lfn = list()
        try:
            for lfn in files:
                lfn = lfn[0]
                if lfn.find('temp') == 7:
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                    self.logger.debug("Marking done %s" % lfn)
                    self.logger.debug("Marking done %s" % docId)
            data = dict()
            data['asoworker'] = self.config.asoworker
            data['subresource'] = 'updateTransfers'
            data['list_of_ids'] = good_ids
            data['list_of_transfer_state'] = ["DONE" for x in good_ids]
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(data))
            self.logger.debug("Marked good %s" % good_ids)
        except Exception:
            self.logger.exception("Error updating documents")
            return 1
        return 0

    def failed(self, files, failures_reasons=None, max_retry=3, force_fail=False, submission_error=False):
        """
        Mark a list of transfers as FAILED or RETRY depending on retry count.

        :param files: tuple (source_lfn, dest_lfn)
        :param failures_reasons: list(str) with reasons of failure
        :param max_retry: number of retry before giving up
        :param force_fail: flag for triggering failure without retry
        :param submission_error: error during fts submission
        :return: 0 success, 1 failure
        """
        # FIX: was a shared mutable default argument (failures_reasons=[])
        if failures_reasons is None:
            failures_reasons = []
        updated_lfn = []
        for Lfn in files:
            lfn = Lfn[0]
            # Load document and get the retry_count
            docId = getHashLfn(lfn)
            self.logger.debug("Marking failed %s" % docId)
            try:
                docbyId = self.oracleDB.get(self.config.oracleUserFileTrans.replace('filetransfer', 'fileusertransfers'),
                                            data=encodeRequest({'subresource': 'getById', 'id': docId}))
                document = oracleOutputMapping(docbyId, None)[0]
                self.logger.debug("Document: %s" % document)
            except Exception as ex:
                self.logger.error("Error updating failed docs: %s" % ex)
                return 1
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = docId
            if failures_reasons:
                try:
                    fileDoc['list_of_failure_reason'] = failures_reasons[files.index(Lfn)]
                except Exception:
                    # narrowed from a bare 'except:'
                    fileDoc['list_of_failure_reason'] = "unexcpected error, missing reasons"
                    self.logger.exception("missing reasons")
            if force_fail or document['transfer_retry_count'] + 1 > max_retry:
                fileDoc['list_of_transfer_state'] = 'FAILED'
                fileDoc['list_of_retry_value'] = 1
            else:
                fileDoc['list_of_transfer_state'] = 'RETRY'
                if submission_error:
                    fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                    fileDoc['list_of_retry_value'] = 1
                else:
                    fileDoc['list_of_retry_value'] = 1
            self.logger.debug("update: %s" % fileDoc)
            try:
                updated_lfn.append(docId)
                self.oracleDB.post(self.config.oracleFileTrans,
                                   data=encodeRequest(fileDoc))
            except Exception:
                self.logger.exception('ERROR updating failed documents')
                return 1
        self.logger.debug("failed file updated")
        return 0

    def acquirePub(self):
        """
        Bind NEW publication documents to this asoworker.

        :return: None; failures are only logged
        """
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquirePublication'
        self.logger.debug("Retrieving publications from oracleDB")
        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire publications from oracleDB: %s" % ex)

    def getPub(self):
        """
        Get the publication documents acquired by this asoworker.

        :return: list of publication documents (empty on failure)
        """
        to_pub_docs = list()
        filedoc = dict()
        filedoc['asoworker'] = self.config.asoworker
        filedoc['subresource'] = 'acquiredPublication'
        filedoc['grouping'] = 0
        try:
            results = self.oracleDB.get(self.config.oracleFileTrans,
                                        data=encodeRequest(filedoc))
            to_pub_docs = oracleOutputMapping(results)
        except Exception as ex:
            self.logger.error("Failed to get acquired publications from oracleDB: %s" % ex)
            return to_pub_docs
        return to_pub_docs

    def pubDone(self, workflow, files):
        """
        Mark publications as DONE.

        :param workflow: taskname, used only to prefix log messages
        :param files: list of source LFNs
        :return: None; failures are only logged
        """
        wfnamemsg = "%s: " % workflow
        data = dict()
        id_list = list()
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            id_list.append(docId)
            msg = "Marking file %s as published." % lfn
            msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
            self.logger.info(wfnamemsg + msg)
        data['asoworker'] = self.config.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = id_list
        data['list_of_publication_state'] = ['DONE' for x in id_list]
        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(data))
            self.logger.debug("updated done: %s " % id_list)
        except Exception as ex:
            self.logger.error("Error during status update for published docs: %s" % ex)

    def pubFailed(self, task, files, failure_reasons=None, force_failure=False):
        """
        Mark publications as FAILED.

        :param task: not read here; kept for interface compatibility
        :param files: tuple (source_lfn, dest_lfn)
        :param failure_reasons: list(str) with reasons of failure
        :param force_failure: not read here; kept for interface compatibility
        :return: None; failures are only logged
        """
        # FIX: was a shared mutable default argument (failure_reasons=list())
        if failure_reasons is None:
            failure_reasons = list()
        id_list = list()
        for Lfn in files:
            source_lfn = Lfn[0]
            docId = getHashLfn(source_lfn)
            id_list.append(docId)
            self.logger.debug("Marking failed %s" % docId)
        fileDoc = dict()
        # NOTE(review): hard-coded asoworker, unlike every other method which
        # uses self.config.asoworker -- looks like a debugging leftover, confirm
        fileDoc['asoworker'] = 'asodciangot1'
        fileDoc['subresource'] = 'updatePublication'
        fileDoc['list_of_ids'] = id_list
        fileDoc['list_of_publication_state'] = ['FAILED' for x in id_list]
        # TODO: implement retry, publish_retry_count missing from input?
        fileDoc['list_of_retry_value'] = [1 for x in id_list]
        fileDoc['list_of_failure_reason'] = failure_reasons
        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
            self.logger.debug("updated failed: %s " % id_list)
        except Exception:
            msg = "Error updating failed documents"
            self.logger.exception(msg)

    def lastPubTime(self, workflow):
        """
        Ask the REST to update the last publication time of a task.

        :param workflow: taskname
        :return: None; failures are only logged
        """
        data = dict()
        data['workflow'] = workflow
        data['subresource'] = 'updatepublicationtime'
        try:
            result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                       data=encodeRequest(data))
            self.logger.debug("%s last publication type update: %s " % (workflow, str(result)))
        except Exception:
            msg = "Error updating last publication time"
            self.logger.exception(msg)

    def searchTask(self, workflow):
        """
        Retrieve the task document for a workflow.

        :param workflow: taskname
        :return: mapped task document, or {} on error
        """
        data = dict()
        data['workflow'] = workflow
        data['subresource'] = 'search'
        try:
            result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                       data=encodeRequest(data))
            self.logger.debug("task: %s " % str(result[0]))
            self.logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication'))
        except Exception as ex:
            self.logger.error("Error during task doc retrieving: %s" % ex)
            return {}
        return oracleOutputMapping(result)