class WorkQueueReqMgrInterface(object):
    """Helper class for ReqMgr interaction.

    Synchronizes a local WorkQueue with the central request manager
    (ReqMgr2): pulls newly assigned work, feeds open (running-open)
    requests, reports element status back, and deletes finished work.
    Errors are recorded in a central LogDB so operators can see them.
    """

    def __init__(self, **kwargs):
        """Build the ReqMgr2 client and LogDB reporter.

        Recognized kwargs: ``logger``, ``reqmgr2_endpoint``,
        ``central_logdb_url``, ``log_reporter``.
        """
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))
        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")
        # Set the thread name before creating the LogDB, and only when it
        # hasn't been customized already: LogDB keys documents by thread name.
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)
        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager.

        Runs one full sync cycle; each phase is isolated so one failure
        does not stop the others. A summary is recorded as task activity.
        """
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join([x['RequestName'] for x in uptodate_elements])
        except Exception:
            self.logger.exception("Error caught during RequestManager update")
        else:
            # Only delete work when the report phase succeeded, so we never
            # delete elements whose status was not confirmed with ReqMgr.
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except Exception:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from ReqMgr and queue them to the workqueue.

        :param queue: local WorkQueue instance
        :return: number of work units queued (0 in drain mode or on
                 communication failure)
        """
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests()
        except Exception:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # Not a couch url; check it's not a local file before failing.
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl, request=reqName, team=team)
                # Successful queueing clears any previous error record for
                # this request in LogDB (this thread only).
                self.logdb.delete(reqName, "error", this_thread=True)
            except TERMINAL_EXCEPTIONS as ex:
                # fatal error - report back to ReqMgr
                self.logger.error('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            # Best effort: a failed state update is retried on the next cycle.
            try:
                self.reportRequestStatus(reqName, "acquired")
            except Exception as ex:
                self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
                self.logger.warning('Will try again later')

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) obtained from RequestManager" % work)
        return work

    def report(self, queue):
        """Report queue status to ReqMgr.

        Walks the inbox elements and reconciles their status with the
        request status in ReqMgr2 (finishing, cancelling or closing work
        as needed).

        :return: list of elements whose ReqMgr status is up to date.
                 (Always a list; previously an empty dict was returned
                 when the inbox was empty.)
        """
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            return uptodate_elements

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = 'Failed to get request "%s" from ReqMgr2. Will try again later.' % ele['RequestName']
                    self.logger.warning(msg)
                    continue
                request = request[0][ele['RequestName']]

                if request['RequestStatus'] in ('failed', 'completed', 'announced',
                                                'epic-FAILED', 'closed-out', 'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(WorkflowName=request['RequestName'])
                            if not any([x for x in request_elements
                                        if x['Status'] != 'Running' and not x.inEndState()]):
                                last_update = max([float(x.updatetime) for x in request_elements])
                                if (now - last_update) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info("Finishing request %s as it is done in reqmgr"
                                                     % request['RequestName'])
                                    queue.doneWork(WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] in ['aborted', 'force-complete']:
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get('OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'], 'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get('OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # we do not want to move the request to 'failed' status
                elif ele['Status'] == 'Failed':
                    continue
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(request['RequestStatus']):
                    self.reportElement(ele)

                uptodate_elements.append(ele)
            except Exception as ex:
                msg = 'Error talking to ReqMgr about request "%s": %s' % (ele['RequestName'], str(ex))
                self.logger.exception(msg)

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr.

        Only elements that are both in an end state locally and mapped to
        a terminal ReqMgr status are removed.
        """
        finished = []
        for element in elements:
            if element.inEndState() and \
                    self._workQueueToReqMgrStatus(element['Status']) in ('aborted', 'failed', 'completed',
                                                                         'announced', 'epic-FAILED',
                                                                         'closed-out', 'rejected'):
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)

    def getAvailableRequests(self):
        """
        Get available ("assigned") requests and sort by team and priority.

        :return: [(team, request_name, request_spec_url)]
        """
        tempResults = self.reqMgr2.getRequestByStatus("assigned")
        filteredResults = []
        for requests in tempResults:
            for request in requests.values():
                filteredResults.append(request)
        # Stable sorts: priority (descending) first, then group by team so
        # within each team the highest priority requests come first.
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Teams"][0])
        results = [(x["Teams"][0], x["RequestName"], x["RequestWorkflow"]) for x in filteredResults]
        return results

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager.

        Optionally, take a message to append to the request in LogDB.
        Re-raises on failure so callers can decide to retry later.
        """
        if message:
            self.logdb.post(request, str(message), 'info')
        reqmgrStatus = self._workQueueToReqMgrStatus(status)
        if reqmgrStatus:  # only send known states
            try:
                self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (request, str(ex))
                msg += traceback.format_exc()
                self.logdb.post(request, msg, 'warning')
                raise ex
        return

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue Status to that reported to ReqMgr.

        A known ReqMgr status is passed through unchanged; anything else
        yields None (and is then not sent to ReqMgr).
        """
        statusMapping = {'Acquired': 'acquired',
                         'Running': 'running-open',
                         'Failed': 'failed',
                         'Canceled': 'aborted',
                         'CancelRequested': 'aborted',
                         'Done': 'completed'
                         }
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element; not a 1-1 relation.

        Unknown statuses map to an empty list.
        """
        statusMapping = {'acquired': ['Acquired'],
                         'running': ['Running'],
                         'running-open': ['Running'],
                         'running-closed': ['Running'],
                         'failed': ['Failed'],
                         'aborted': ['Canceled', 'CancelRequested'],
                         'force-complete': ['Canceled', 'CancelRequested'],
                         'completed': ['Done']}
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []

    def reportElement(self, element):
        """Report a single element's status to ReqMgr."""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state.

        Only works by adding new blocks from the input dataset.

        :return: number of work units added (0 in drain mode or on
                 communication failure)
        """
        self.logger.info("Checking Request Manager for open requests and closing old ones")

        # First close any open inbox element which hasn't found anything new in a while
        queue.closeWork()
        self.report(queue)

        work = 0
        requests = []

        # Drain mode, don't pull any work into open requests. They will be
        # closed if the queue stays in drain long enough.
        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            requests = self.reqMgr2.getRequestByStatus("running-open", detail=False)
        except Exception:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                self.logdb.delete(reqName, 'error', True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isn't a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                traceMsg = traceback.format_exc()
                msg = "%s\n%s" % (msg, traceMsg)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s unit(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
class WorkQueueReqMgrInterface(object):
    """Helper class for ReqMgr interaction"""
    # NOTE(review): this variant pulls "staged" requests, feeds open ones and
    # cancels aborted/force-complete requests; unlike the sibling variant it
    # does not run report()/deleteFinishedWork() from __call__.

    def __init__(self, **kwargs):
        # Fall back to the logging module itself when no logger is supplied.
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))
        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")
        # set the thread name before creat the log db.
        # only sets that when it is not set already
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)
        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager"""
        # Each phase is isolated so one failure does not stop the others;
        # the accumulated summary is recorded as task activity at the end.
        msg = ''
        try:  # pull in new work
            self.logger.info("queueing new work")
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception as ex:
            errorMsg = "Error caught during RequestManager pull"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        try:  # get additional open-running work
            self.logger.info("adding new element to open requests")
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception as ex:
            errorMsg = "Error caught during RequestManager split"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        try:  # report back to ReqMgr
            self.logger.info("cancel aborted requests")
            count = self.cancelWork(queue)
            self.logger.info("finised canceling requests")
            msg += "Work canceled: %s " % count
        except Exception as ex:
            errorMsg = "Error caught during canceling the request"
            self.logger.exception("%s: %s", errorMsg, str(ex))

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from regMgr and queue to workqueue"""
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        try:
            workLoads = self.getAvailableRequests()
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # check its not a local file
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(
                            None, "Workflow url validation error: %s" % str(ex))
                        raise error

                self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl, request=reqName, team=team)
                # Clear any previous error record for this request in LogDB.
                self.logdb.delete(reqName, "error", this_thread=True, agent=False)
            except TERMINAL_EXCEPTIONS as ex:
                # fatal error - report back to ReqMgr
                self.logger.error(
                    'Permanent failure processing request "%s": %s' % (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) obtained from RequestManager" % work)
        return work

    def cancelWork(self, queue):
        """Cancel local work for requests that are 'aborted' or
        'force-complete' in ReqMgr2; returns the number of requests
        successfully cancelled. Failures are logged and skipped."""
        requests = self.reqMgr2.getRequestByStatus(
            ['aborted', 'force-complete'], detail=False)
        count = 0
        for req in requests:
            try:
                queue.cancelWork(req)
                count += 1
            except Exception as ex:
                msg = 'Error to cancel the request "%s": %s' % (req, str(ex))
                self.logger.exception(msg)
        return count

    def report(self, queue):
        """Report queue status to ReqMgr."""
        # NOTE(review): returns an empty dict when the inbox is empty but a
        # list otherwise; callers should treat the result as an iterable.
        new_state = {}
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            return new_state

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = 'Failed to get request "%s" from ReqMgr2. Will try again later.' % ele[
                        'RequestName']
                    self.logger.warning(msg)
                    continue
                request = request[0][ele['RequestName']]

                if request['RequestStatus'] in ('failed', 'completed', 'announced',
                                                'closed-out', 'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(
                                WorkflowName=request['RequestName'])
                            if not any([x for x in request_elements
                                        if x['Status'] != 'Running' and not x.inEndState()]):
                                last_update = max([float(x.updatetime)
                                                   for x in request_elements])
                                if (now - last_update) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info(
                                        "Finishing request %s as it is done in reqmgr"
                                        % request['RequestName'])
                                    queue.doneWork(
                                        WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] in ['aborted', 'force-complete']:
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get(
                        'OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'], 'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get(
                        'OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # we do not want to move the request to 'failed' status
                elif ele['Status'] == 'Failed':
                    continue
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(
                        request['RequestStatus']):
                    self.reportElement(ele)

                uptodate_elements.append(ele)
            except Exception as ex:
                msg = 'Error talking to ReqMgr about request "%s": %s' % (
                    ele['RequestName'], str(ex))
                self.logger.exception(msg)

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr"""
        finished = []
        for element in elements:
            if element.inEndState():
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)

    def getAvailableRequests(self):
        """
        Get available requests and sort by team and priority
        returns [(team, request_name, request_spec_url)]
        """
        tempResults = self.reqMgr2.getRequestByStatus("staged")
        filteredResults = []
        for requests in tempResults:
            for request in requests.values():
                filteredResults.append(request)
        # Stable sorts: priority descending first, then grouped by team.
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Team"])
        results = [(x["Team"], x["RequestName"], x["RequestWorkflow"])
                   for x in filteredResults]
        return results

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager
           Optionally, take a message to append to the request
        """
        if message:
            # Failed transitions are recorded as errors, everything else as info.
            logType = "error" if status == "Failed" else "info"
            self.logdb.post(request, str(message), logType)

        reqmgrStatus = self._workQueueToReqMgrStatus(status)

        if reqmgrStatus:  # only send known states
            try:
                self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (
                    request, str(ex))
                msg += traceback.format_exc()
                self.logdb.post(request, msg, 'warning')
                raise ex
        return

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue Status to that reported to ReqMgr"""
        statusMapping = {
            'Acquired': 'acquired',
            'Running': 'running-open',
            'Failed': 'failed',
            'Canceled': 'aborted',
            'CancelRequested': 'aborted',
            'Done': 'completed'
        }
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element, it is not a 1-1 relation"""
        statusMapping = {
            'acquired': ['Acquired'],
            'running': ['Running'],
            'running-open': ['Running'],
            'running-closed': ['Running'],
            'failed': ['Failed'],
            'aborted': ['Canceled', 'CancelRequested'],
            'force-complete': ['Canceled', 'CancelRequested'],
            'completed': ['Done']
        }
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []

    def reportElement(self, element):
        """Report element to ReqMgr"""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state, only works adding new blocks from the input dataset"""
        self.logger.info(
            "Checking Request Manager for open requests and closing old ones")
        work = 0
        requests = []

        try:
            requests = self.reqMgr2.getRequestByStatus("running-open", detail=False)
        except Exception as ex:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                # Clear any previous error record for this request in LogDB.
                self.logdb.delete(reqName, 'error', True, agent=False)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nError: "%s"' % str(ex)
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' % reqName
                msg += '\nSee log for details.\nError: "%s"' % str(ex)
                traceMsg = traceback.format_exc()
                msg = "%s\n%s" % (msg, traceMsg)
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
class LogDBTest(unittest.TestCase):
    """
    _LogDBTest_

    Unit tests for the LogDB API (post/get/delete/cleanup), its backend
    (prefix/docid/check/agent_update) and the LogDBReport formatter.
    Requires a local CouchDB at localhost:5984.
    NOTE(review): uses Python 2 print statements; py2-only as written.
    """

    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.
        """
        logging.basicConfig()
        self.logger = logging.getLogger('LogDBTest')
        # self.logger.setLevel(logging.DEBUG)
        url = 'http://localhost:5984/logdb_t'
        identifier = 'agentname'
        # Two agent instances sharing an identifier but different thread names,
        # plus a user (DN-identified) instance.
        self.agent_inst = LogDB(url, identifier, logger=self.logger, create=True)
        self.agent_inst2 = LogDB(url, identifier, logger=self.logger, create=True,
                                 thread_name="Test")
        identifier = '/DC=org/DC=doegrids/OU=People/CN=First Last 123'
        self.user_inst = LogDB(url, identifier, logger=self.logger, create=True)
        self.report = LogDBReport(self.agent_inst)

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.agent_inst.backend.deleteDatabase()

    def test_docid(self):
        "Test docid API"
        request = 'abs'
        mtype = self.agent_inst.backend.prefix('info')
        res = self.agent_inst.backend.docid(request, mtype)
        # docid is the hash of request--identifier--thread--mtype
        key = '--'.join([request, self.agent_inst.identifier,
                         self.agent_inst.thread_name, mtype])
        khash = gen_hash(key)
        self.assertEqual(res, khash)

    def test_prefix(self):
        "Test prefix LogDBBackend API"
        request = 'abc'
        # agent identifiers get an 'agent-' prefix on message types
        self.assertEqual(self.agent_inst.agent, 1)
        mtype = self.agent_inst.backend.prefix('msg')
        self.assertEqual(mtype, 'agent-msg')
        self.assertEqual(self.user_inst.agent, 0)
        mtype = self.user_inst.backend.prefix('msg')
        self.assertEqual(mtype, 'msg')

    def test_apis(self):
        "Test LogDB APIs"
        request = 'abc'
        # if we post messages for the same request only last one should survive
        self.agent_inst.post(request, 'msg1')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg2')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg3')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg4')
        # time.sleep(1)
        docs = self.agent_inst.get(request)
        self.assertEqual(len(docs), 1)
        self.assertEqual(docs[0]['msg'], 'msg4')
        # add message as user, user based messages will grow
        self.user_inst.post(request, 'doc1')
        # time.sleep(1)
        docs = self.user_inst.get(request)
        self.assertEqual(len(docs), 2)  # we have msg4 and doc1
        self.user_inst.post(request, 'doc1')  # we have msg2 and two doc1 messages
        # time.sleep(1)
        docs = self.user_inst.get(request)
        self.assertEqual(len(docs), 3)

    def test_cleanup(self):
        "Test clean-up LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'comment')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        all_docs = self.agent_inst.get(request)
        self.agent_inst.backend.cleanup(thr=10)  # look into past
        past_docs = self.agent_inst.get(request)
        self.assertEqual(len(all_docs), len(past_docs))
        self.agent_inst.backend.cleanup(thr=-10)  # look into future
        docs = self.agent_inst.get(request)
        self.assertEqual(len(docs), 0)

    def test_get_all_requests(self):
        "Test get_all_requests LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        sum_docs = self.agent_inst.get(request)
        self.assertEqual(len(sum_docs), 3)  # since we have two info records
        request = 'abc2'
        self.agent_inst.post(request, 'msg1', 'info')
        all_docs = self.agent_inst.get_all_requests()
        self.assertEqual(len(all_docs), 2)  # we only have two distinct request names

    def test_delete(self):
        "Test delete LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.delete(request)
        all_docs = self.agent_inst.get_all_requests()
        self.assertEqual(len(all_docs), 0)

    def test_get(self):
        "Test get LogDB API"
        request = 'abc'
        self.agent_inst2.post(request, 'msg1', 'info')  # doc with different thread name
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        docs = self.agent_inst.get(request)
        # there should be 4 docs one for Test thread and 3 for MainThread
        self.assertEqual(len(docs), 4)
        docs = self.agent_inst2.get(request)
        self.assertEqual(len(docs), 4)  # this should result the same as above.
        docs1 = self.agent_inst.get(request, 'info')
        docs2 = self.agent_inst.get(request, 'info')
        req1 = set([r['request'] for r in docs1])
        self.assertEqual(len(req1), 1)
        req2 = set([r['request'] for r in docs2])
        self.assertEqual(len(req2), 1)
        self.assertEqual(req1, req2)

    def test_thread_name(self):
        "Test thread name support by LogDB APIs"
        request = 'abc'
        self.agent_inst2.post(request, 'msg1', 'info')  # doc with different thread name
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        maindocs = self.agent_inst.get(request)
        self.assertEqual(len(maindocs), 4)  # since we have two info records in agent_inst
        thr_docs = self.agent_inst2.get(request)
        self.assertEqual(len(thr_docs), 4)  # number of docs in different thread
        # Both instances share the request name and identifier; only the
        # thread name differs between their documents.
        req1 = set([r['request'] for r in maindocs])
        self.assertEqual(len(req1), 1)
        req2 = set([r['request'] for r in thr_docs])
        self.assertEqual(len(req2), 1)
        self.assertEqual(req1, req2)
        worker1 = set([r['thr'] for r in maindocs])
        self.assertEqual(worker1, set(["Test", "MainThread"]))
        worker2 = set([r['thr'] for r in thr_docs])
        self.assertEqual(worker2, set(["Test", "MainThread"]))
        docs = set([r['identifier'] for r in maindocs])
        self.assertEqual(docs, set([self.agent_inst.identifier]))
        docs = set([r['identifier'] for r in thr_docs])
        self.assertEqual(docs, set([self.agent_inst.identifier]))

    def test_checks(self):
        "Tests LogDB check/mtype functionality"
        request = 'abc'
        # An invalid message type through the high-level API returns an
        # error marker instead of raising.
        res = self.agent_inst.post(request, 'msg', 'ingo')  # should be silent, i.o. no Exceptions
        self.assertEqual('post-error', res)
        # The backend APIs raise LogDBError for bad types / empty requests.
        args = (request, 'msg', 'ingo')  # wrong type
        self.assertRaises(LogDBError, self.agent_inst.backend.agent_update, *args)
        args = (request, 'ingo')  # wrong type
        self.assertRaises(LogDBError, self.agent_inst.backend.check, *args)
        args = ('', 'msg', 'info')  # empty request string
        self.assertRaises(LogDBError, self.agent_inst.backend.agent_update, *args)
        args = ('', 'info')  # empty request string
        self.assertRaises(LogDBError, self.agent_inst.backend.check, *args)

    def test_report(self):
        "Test LogDBReport APIs"
        request = 'abc'
        # sleep between posts so timestamps differ and ordering is testable
        self.agent_inst.post(request, 'msg1', 'info')
        time.sleep(1)
        self.agent_inst.post(request, 'msg2', 'comment')
        time.sleep(1)
        self.agent_inst.post(request, 'msg3', 'warning')
        time.sleep(1)
        self.agent_inst.post(request, 'msg4', 'error')
        time.sleep(1)
        self.report.to_stdout(request)
        out = self.report.to_txt(request)
        print "\n### txt report\n", out
        out = self.report.to_html(request)
        print "\n### html report\n", out
        docs = self.agent_inst.get(request)
        tdocs = self.report.orderby(docs, order='ts')
        messages = [d['msg'] for d in tdocs]
        self.assertEqual(messages, ['msg4', 'msg3', 'msg2', 'msg1'])
class WorkQueueReqMgrInterface(object):
    """Helper class for WorkQueue <-> ReqMgr interaction.

    Pulls newly assigned requests from ReqMgr into the WorkQueue, feeds
    extra blocks to running-open requests, reports element status back to
    ReqMgr and deletes work that ReqMgr considers finished. Errors per
    request are recorded in (and cleared from) central LogDB.
    """

    def __init__(self, **kwargs):
        """Set up ReqMgr clients and the LogDB reporter.

        Recognized kwargs: logger, reqmgr2_endpoint, central_logdb_url,
        log_reporter. Missing logger falls back to the logging module.
        """
        if not kwargs.get('logger'):
            import logging
            kwargs['logger'] = logging
        self.logger = kwargs['logger']
        # TODO: (reqmgr2Only - remove this line when reqmgr is replaced)
        self.reqMgr = RequestManager(kwargs)
        # this will break all in one test
        self.reqMgr2 = ReqMgr(kwargs.get("reqmgr2_endpoint", None))
        centralurl = kwargs.get("central_logdb_url", "")
        identifier = kwargs.get("log_reporter", "")
        # Set the thread name before creating the log db, but only when it
        # has not been customized already: LogDB records the thread name
        # on every document it posts.
        myThread = threading.currentThread()
        if myThread.getName() == "MainThread":
            myThread.setName(self.__class__.__name__)
        self.logdb = LogDB(centralurl, identifier, logger=self.logger)
        self.previous_state = {}

    def __call__(self, queue):
        """Synchronize WorkQueue and RequestManager.

        Each phase is independently guarded so a failure in one does not
        prevent the others; deletion only runs if reporting succeeded,
        since it consumes the reported elements.
        """
        msg = ''
        try:  # pull in new work
            work = self.queueNewRequests(queue)
            msg += "New Work: %d\n" % work
        except Exception:
            self.logger.exception("Error caught during RequestManager pull")

        try:  # get additional open-running work
            extraWork = self.addNewElementsToOpenRequests(queue)
            msg += "Work added: %d\n" % extraWork
        except Exception:
            self.logger.exception("Error caught during RequestManager split")

        try:  # report back to ReqMgr
            uptodate_elements = self.report(queue)
            msg += "Updated ReqMgr status for: %s\n" % ", ".join(
                [x['RequestName'] for x in uptodate_elements])
        except Exception:
            self.logger.exception("Error caught during RequestManager update")
        else:
            try:  # Delete finished requests from WorkQueue
                self.deleteFinishedWork(queue, uptodate_elements)
            except Exception:
                self.logger.exception("Error caught during work deletion")

        queue.backend.recordTaskActivity('reqmgr_sync', msg)

    def queueNewRequests(self, queue):
        """Get requests from ReqMgr and queue them to the workqueue.

        :param queue: the WorkQueue instance to fill
        :return: number of work units queued
        """
        self.logger.info("Contacting Request manager for more work")
        work = 0
        workLoads = []

        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            workLoads = self.getAvailableRequests(queue.params['Teams'])
        except Exception:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for team, reqName, workLoadUrl in workLoads:
            try:
                try:
                    Lexicon.couchurl(workLoadUrl)
                except Exception as ex:  # can throw many errors e.g. AttributeError, AssertionError etc.
                    # not a couch url - check it's not a local file either
                    if not os.path.exists(workLoadUrl):
                        error = WorkQueueWMSpecError(None, "Workflow url validation error: %s" % str(ex))
                        raise error
                self.logger.info("Processing request %s at %s" % (reqName, workLoadUrl))
                units = queue.queueWork(workLoadUrl, request=reqName, team=team)
                # success: clear any stale error records we posted for it
                self.logdb.delete(reqName, "error", this_thread=True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - report back to ReqMgr
                self.logger.info('Permanent failure processing request "%s": %s' % (reqName, str(ex)))
                self.logger.info("Marking request %s as failed in ReqMgr" % reqName)
                self.reportRequestStatus(reqName, 'Failed', message=str(ex))
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' \
                      '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' \
                      '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            try:
                self.reportRequestStatus(reqName, "acquired")
            except Exception as ex:
                self.logger.warning("Unable to update ReqMgr state: %s" % str(ex))
                self.logger.warning('Will try again later')

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) obtained from RequestManager" % work)
        return work

    def report(self, queue):
        """Report queue status to ReqMgr.

        Walks the inbox elements, reconciles their status with the request
        status known to ReqMgr2 and returns the list of elements that are
        now considered up to date.
        """
        uptodate_elements = []
        now = time.time()

        elements = queue.statusInbox(dictKey="RequestName")
        if not elements:
            # consistent empty return type with the populated path
            # (the original returned an empty dict here)
            return uptodate_elements

        for ele in elements:
            ele = elements[ele][0]  # 1 element tuple
            try:
                request = self.reqMgr2.getRequestByNames(ele['RequestName'])
                if not request:
                    msg = 'Failed to get request "%s" from ReqMgr2. Will try again later.' % ele['RequestName']
                    self.logger.warning(msg)
                    continue
                request = request[ele['RequestName']]
                if request['RequestStatus'] in ('failed', 'completed', 'announced',
                                                'epic-FAILED', 'closed-out', 'rejected'):
                    # requests can be done in reqmgr but running in workqueue
                    # if request has been closed but agent cleanup actions
                    # haven't been run (or agent has been retired)
                    # Prune out obviously too old ones to avoid build up
                    if queue.params.get('reqmgrCompleteGraceTime', -1) > 0:
                        if (now - float(ele.updatetime)) > queue.params['reqmgrCompleteGraceTime']:
                            # have to check all elements are at least running and are old enough
                            request_elements = queue.statusInbox(WorkflowName=request['RequestName'])
                            if not any([x for x in request_elements
                                        if x['Status'] != 'Running' and not x.inEndState()]):
                                last_update = max([float(x.updatetime) for x in request_elements])
                                if (now - last_update) > queue.params['reqmgrCompleteGraceTime']:
                                    self.logger.info("Finishing request %s as it is done in reqmgr"
                                                     % request['RequestName'])
                                    queue.doneWork(WorkflowName=request['RequestName'])
                                    continue
                    else:
                        pass  # assume workqueue status will catch up later
                elif request['RequestStatus'] == 'aborted' or request['RequestStatus'] == 'force-complete':
                    queue.cancelWork(WorkflowName=request['RequestName'])
                # Check consistency of running-open/closed and the element closure status
                elif request['RequestStatus'] == 'running-open' and not ele.get('OpenForNewData', False):
                    self.reportRequestStatus(ele['RequestName'], 'running-closed')
                elif request['RequestStatus'] == 'running-closed' and ele.get('OpenForNewData', False):
                    queue.closeWork(ele['RequestName'])
                # update request status if necessary
                elif ele['Status'] not in self._reqMgrToWorkQueueStatus(request['RequestStatus']):
                    self.reportElement(ele)
                uptodate_elements.append(ele)
            except Exception:
                msg = 'Error talking to ReqMgr about request "%s": %s'
                traceMsg = traceback.format_exc()
                self.logger.error(msg % (ele['RequestName'], traceMsg))

        return uptodate_elements

    def deleteFinishedWork(self, queue, elements):
        """Delete work from queue that is finished in ReqMgr."""
        finished = []
        for element in elements:
            if self._workQueueToReqMgrStatus(element['Status']) in ('aborted', 'failed', 'completed',
                                                                    'announced', 'epic-FAILED',
                                                                    'closed-out', 'rejected') \
                    and element.inEndState():
                finished.append(element['RequestName'])
        return queue.deleteWorkflows(*finished)

    def _getRequestsByTeamsAndStatus(self, status, teams=None):
        """Fetch requests in the given status, optionally filtered by team.

        TODO: now it assumes one team per requests - check whether this
        assumption is correct Check whether we actually use the team for
        this. Also switch to byteamandstatus couch call instead.

        :param status: ReqMgr request status to query
        :param teams: optional list of team names to keep (default: all)
        :return: dict of {requestName: requestDict}
        """
        # NOTE: default changed from a mutable [] to None (same semantics:
        # an empty/missing teams list means "no filtering")
        requests = self.reqMgr2.getRequestByStatus(status)
        # Then sort by Team name then sort by Priority
        # https://docs.python.org/2/howto/sorting.html
        if teams:
            results = {}
            for reqName, value in requests.items():
                if value["Teams"][0] in teams:
                    results[reqName] = value
            return results
        return requests

    def getAvailableRequests(self, teams):
        """
        Get available requests for the given teams and sort by team and priority
        returns [(team, request_name, request_spec_url)]
        """
        tempResults = self._getRequestsByTeamsAndStatus("assigned", teams).values()
        filteredResults = []
        for request in tempResults:
            # only requests assigned to exactly one team are usable here
            if "Teams" in request and len(request["Teams"]) == 1:
                filteredResults.append(request)
                self.logdb.delete(request["RequestName"], "error", this_thread=True)
            else:
                msg = "no team or more than one team (%s) are assigined: %s" % (
                    request.get("Teams", None), request["RequestName"])
                self.logger.error(msg)
                self.logdb.post(request["RequestName"], msg, 'error')
        # stable double sort: priority (descending) within team (ascending)
        filteredResults.sort(key=itemgetter('RequestPriority'), reverse=True)
        filteredResults.sort(key=lambda r: r["Teams"][0])
        results = [(x["Teams"][0], x["RequestName"], x["RequestWorkflow"]) for x in filteredResults]
        return results

    def reportRequestStatus(self, request, status, message=None):
        """Change state in RequestManager.

        Optionally, take a message to append to the request via LogDB.
        Unknown (non-mappable) states are silently ignored.
        """
        if message:
            self.logdb.post(request, str(message), 'info')
        reqmgrStatus = self._workQueueToReqMgrStatus(status)

        if reqmgrStatus:  # only send known states
            try:
                # TODO: try reqmgr1 call if it fails (reqmgr2Only - remove this line when reqmgr is replaced)
                self.reqMgr.reportRequestStatus(request, reqmgrStatus)
                # And replace with this (remove all Exceptins)
                # self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
            except HTTPException as ex:
                # If we get an HTTPException of 404 means reqmgr2 request
                if ex.status == 404:
                    # try reqmgr2 call
                    msg = "%s : reqmgr2 request: %s" % (request, str(ex))
                    self.logdb.post(request, msg, 'info')
                    self.reqMgr2.updateRequestStatus(request, reqmgrStatus)
                else:
                    msg = "%s : fail to update status with HTTP error: %s" % (request, str(ex))
                    self.logdb.post(request, msg, 'warning')
                    raise ex
            except Exception as ex:
                msg = "%s : fail to update status will try later: %s" % (request, str(ex))
                self.logdb.post(request, msg, 'warning')
                raise ex

    def markAcquired(self, request, url=None):
        """Mark request acquired in ReqMgr"""
        self.reqMgr.putWorkQueue(request, url)

    def _workQueueToReqMgrStatus(self, status):
        """Map WorkQueue status to that reported to ReqMgr.

        Returns the mapped ReqMgr status, the status itself if it already
        is a known ReqMgr status, or None for anything unknown.
        """
        statusMapping = {'Acquired': 'acquired',
                         'Running': 'running-open',
                         'Failed': 'failed',
                         'Canceled': 'aborted',
                         'CancelRequested': 'aborted',
                         'Done': 'completed'}
        if status in statusMapping:
            # if wq status passed convert to reqmgr status
            return statusMapping[status]
        elif status in REQUEST_STATE_LIST:
            # if reqmgr status passed return reqmgr status
            return status
        else:
            # unknown status
            return None

    def _reqMgrToWorkQueueStatus(self, status):
        """Map ReqMgr status to that in a WorkQueue element.

        It is not a 1-1 relation; unknown statuses map to an empty list.
        """
        statusMapping = {'acquired': ['Acquired'],
                         'running': ['Running'],
                         'running-open': ['Running'],
                         'running-closed': ['Running'],
                         'failed': ['Failed'],
                         'aborted': ['Canceled', 'CancelRequested'],
                         'force-complete': ['Canceled', 'CancelRequested'],
                         'completed': ['Done']}
        if status in statusMapping:
            return statusMapping[status]
        else:
            return []

    def reportElement(self, element):
        """Report element status to ReqMgr"""
        self.reportRequestStatus(element['RequestName'], element['Status'])

    def addNewElementsToOpenRequests(self, queue):
        """Add new elements to open requests which are in running-open state,
        only works adding new blocks from the input dataset.

        :param queue: the WorkQueue instance to extend
        :return: number of work units added
        """
        self.logger.info("Checking Request Manager for open requests and closing old ones")

        # First close any open inbox element which hasn't found anything new in a while
        queue.closeWork()
        self.report(queue)

        work = 0
        requests = []

        # Drain mode: don't pull any work into open requests. They will be
        # closed if the queue stays in drain long enough
        if queue.params['DrainMode']:
            self.logger.info('Draining queue: Skip requesting work from ReqMgr')
            return 0

        try:
            requests = self._getRequestsByTeamsAndStatus("running-open", queue.params['Teams']).keys()
        except Exception:
            traceMsg = traceback.format_exc()
            msg = "Error contacting RequestManager: %s" % traceMsg
            self.logger.warning(msg)
            return 0

        for reqName in requests:
            try:
                self.logger.info("Processing request %s" % (reqName))
                units = queue.addWork(requestName=reqName)
                # Bugfix: the original referenced the undefined name
                # `request` here (NameError); use the loop variable and the
                # same keyword form as in queueNewRequests().
                self.logdb.delete(reqName, 'error', this_thread=True)
            except (WorkQueueWMSpecError, WorkQueueNoWorkError) as ex:
                # fatal error - but at least it was split the first time. Log and skip.
                msg = 'Error adding further work to request "%s". Will try again later' \
                      '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except (IOError, socket.error, CouchError, CouchConnectionError) as ex:
                # temporary problem - try again later
                msg = 'Error processing request "%s": will try again later.' \
                      '\nError: "%s"' % (reqName, str(ex))
                self.logger.info(msg)
                self.logdb.post(reqName, msg, 'error')
                continue
            except Exception as ex:
                # Log exception as it isnt a communication problem
                msg = 'Error processing request "%s": will try again later.' \
                      '\nSee log for details.\nError: "%s"' % (reqName, str(ex))
                self.logger.exception('Unknown error processing %s' % reqName)
                self.logdb.post(reqName, msg, 'error')
                continue

            self.logger.info('%s units(s) queued for "%s"' % (units, reqName))
            work += units

        self.logger.info("%s element(s) added to open requests" % work)
        return work
class MSRuleCleaner(MSCore): """ MSRuleCleaner.py class provides the logic used to clean the Rucio block level data placement rules created by WMAgent. """ def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSRuleCleaner module :param msConfig: micro service configuration """ super(MSRuleCleaner, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") self.msConfig.setdefault('enableRealMode', False) self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode" self.curlMgr = RequestHandler() self.targetStatusRegex = re.compile(r'.*archived') self.logDB = LogDB(self.msConfig["logDBUrl"], self.msConfig["logDBReporter"], logger=self.logger) self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger) # Building all the Pipelines: pName = 'plineMSTrCont' self.plineMSTrCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineMSTrBlock' self.plineMSTrBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentCont' self.plineAgentCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentBlock' self.plineAgentBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']), 
Functor(self.cleanRucioRules)]) pName = 'plineArchive' self.plineArchive = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.findTargetStatus), Functor(self.setClean), Functor(self.setArchivalDelayExpired), Functor(self.setLogDBClean), Functor(self.archive)]) # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont, self.plineAgentBlock] # Building an auxiliary list of cleanup pipeline names only: self.cleanupPipeNames = [pline.name for pline in self.cleanuplines] # Building lists of pipelines related only to Agents or MStransferror self.agentlines = [self.plineAgentCont, self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, 'archived': {'normalArchived': 0, 'forceArchived': 0}} self.globalLocks = set() def getGlobalLocks(self): """ Fetches the list of 'globalLocks' from wmstats server and the list of 'parentLocks' from request manager. Stores/updates the unified set in the 'globalLocks' instance variable. Returns the resultant unified set. :return: A union set of the 'globalLocks' and the 'parentLocks' lists """ self.logger.info("Fetching globalLocks list from wmstats server.") try: globalLocks = set(self.wmstatsSvc.getGlobalLocks()) except Exception as ex: msg = "Failed to refresh global locks list for the current polling cycle. Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.logger.info("Fetching parentLocks list from reqmgr2 server.") try: parentLocks = set(self.reqmgr2.getParentLocks()) except Exception as ex: msg = "Failed to refresh parent locks list for the current poling cycle. 
Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.globalLocks = globalLocks | parentLocks def resetCounters(self): """ A simple function for zeroing the cleaned and archived counters. """ for pline in self.cleanuplines: self.wfCounters['cleaned'][pline.name] = 0 self.wfCounters['archived']['normalArchived'] = 0 self.wfCounters['archived']['forceArchived'] = 0 def execute(self, reqStatus): """ Executes the whole ruleCleaner logic :return: summary """ # start threads in MSManager which should call this method summary = dict(RULECLEANER_REPORT) self.currThread = current_thread() self.currThreadIdent = self.currThread.name self.updateReportDict(summary, "thread_id", self.currThreadIdent) self.resetCounters() self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) # Build the list of workflows to work on: try: requestRecords = {} for status in reqStatus: requestRecords.update(self.getRequestRecords(status)) except Exception as err: # general error msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords try: self.getGlobalLocks() totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords) msg = "\nNumber of processed workflows: %s." msg += "\nNumber of properly cleaned workflows: %s." msg += "\nNumber of normally archived workflows: %s." msg += "\nNumber of force archived workflows: %s." 
self.logger.info(msg, totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests) self.updateReportDict(summary, "total_num_requests", totalNumRequests) self.updateReportDict(summary, "clean_num_requests", cleanNumRequests) self.updateReportDict(summary, "normal_archived_num_requests", normalArchivedNumRequests) self.updateReportDict(summary, "force_archived_num_requests", forceArchivedNumRequests) except Exception as ex: msg = "Unknown exception while running MSRuleCleaner thread Error: %s" self.logger.exception(msg, str(ex)) self.updateReportDict(summary, "error", msg) return summary def _execute(self, reqRecords): """ Executes the MSRuleCleaner pipelines based on the workflow status :param reqList: A list of RequestRecords to work on :return: a tuple with: number of properly cleaned requests number of processed workflows number of archived workflows """ # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival # Pipelines are executed sequentially in the above order. 
# This way we assure ourselves that we archive only workflows # that have accomplished the needed cleanup cleanNumRequests = 0 totalNumRequests = 0 # Call the workflow dispatcher: for req in viewvalues(reqRecords): wflow = MSRuleCleanerWflow(req) self._dispatchWflow(wflow) msg = "\n----------------------------------------------------------" msg += "\nMSRuleCleanerWflow: %s" msg += "\n----------------------------------------------------------" self.logger.debug(msg, pformat(wflow)) totalNumRequests += 1 if self._checkClean(wflow): cleanNumRequests += 1 # Report the counters: for pline in self.cleanuplines: msg = "Workflows cleaned by pipeline: %s: %d" self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name]) normalArchivedNumRequests = self.wfCounters['archived']['normalArchived'] forceArchivedNumRequests = self.wfCounters['archived']['forceArchived'] self.logger.info("Workflows normally archived: %d", self.wfCounters['archived']['normalArchived']) self.logger.info("Workflows force archived: %d", self.wfCounters['archived']['forceArchived']) return totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests def _dispatchWflow(self, wflow): """ A function intended to dispatch a workflow (e.g based on its status) through one or more functional pipelines in case there is some more complicated logic involved in the order we execute them but not just a sequentially """ self.logger.debug("Dispatching workflow: %s", wflow['RequestName']) # NOTE: The following dispatch logic is a subject to be changed at any time # Resolve: # NOTE: First resolve any preliminary flags that will be needed further # in the logic of the _dispatcher() itself if wflow['RequestStatus'] == 'announced': self.getMSOutputTransferInfo(wflow) # Clean: # Do not clean any Resubmission, but still let them be archived if wflow['RequestType'] == 'Resubmission': wflow['ForceArchive'] = True msg = "Skipping cleanup step for workflow: %s - RequestType is %s." 
msg += " Will try to archive it directly." self.logger.info(msg, wflow['RequestName'], wflow['RequestType']) elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']: # NOTE: We do not check the ParentageResolved flag for these # workflows, but we do need to clean output data placement # rules from the agents for them for pline in self.agentlines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']: # NOTE: We skip workflows which are not having 'ParentageResolved' # flag, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']: # NOTE: We skip workflows which have not yet finalised their TransferStatus # in MSOutput, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferTape']: # NOTE: We skip workflows which have not yet finalised their tape transfers. # (i.e. even if a single output which is supposed to be covered # by a tape rule is in any of the following transient states: # {REPLICATING, STUCK, SUSPENDED, WAITING_APPROVAL}.) # We still need some proper logging for them. msg = "Skipping workflow: %s - tape transfers are not yet completed." msg += " Will retry again in the next cycle." 
self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced': for pline in self.cleanuplines: try: pline.run(wflow) except MSRuleCleanerResolveParentError as ex: msg = "%s: Parentage Resolve Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, pline.name, str(ex)) continue except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 else: # We shouldn't be here: msg = "Skipping workflow: %s - " msg += "Does not fall under any of the defined categories." self.logger.error(msg, wflow['RequestName']) # Archive: try: self.plineArchive.run(wflow) if wflow['ForceArchive']: self.wfCounters['archived']['forceArchived'] += 1 else: self.wfCounters['archived']['normalArchived'] += 1 except MSRuleCleanerArchivalSkip as ex: msg = "%s: Proper conditions not met: %s. " msg += "Skipping archival in the current cycle." self.logger.info(msg, wflow['PlineMarkers'][-1], str(ex)) except MSRuleCleanerArchivalError as ex: msg = "%s: Archival Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex)) except Exception as ex: msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex)) def setPlineMarker(self, wflow, pName): """ A function intended to mark which is the pipeline currently working on the workflow. It is supposed to be called always as a first function in the pipeline. 
:param wflow: A MSRuleCleaner workflow representation :param pName: The name of the functional pipeline :return: The workflow object """ # NOTE: The current functional pipeline MUST always be appended at the # end of the 'PlineMarkers' list # First get rid of the default: if not wflow['PlineMarkers']: wflow['PlineMarkers'] = [] # Then push our current value into the markers list: wflow['PlineMarkers'].append(pName) # Populate the list of flags to be used later: if pName not in wflow['RulesToClean']: if pName in self.cleanupPipeNames: wflow['RulesToClean'][pName] = [] if pName not in wflow['CleanupStatus']: if pName in self.cleanupPipeNames: wflow['CleanupStatus'][pName] = False return wflow def _checkClean(self, wflow): """ An auxiliary function used to only check the temporary cleanup status. It basically takes the pipelines registered in 'PlineMarkers' that have already worked on the workflow as a mask and applies this mask over the set of flags in the 'CleanupStatus' field and then reduces the result to a single bool value """ # NOTE: This is one of the few functions taking a workflow as an argument # but returning a bool, since it is an auxiliary function and is not # supposed to be called as a standalone function in a pipeline. 
# NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows # are also counted as properly cleaned and can trigger archival later # Build a list of bool flags based on the mask of PlineMarkers cleanFlagsList = [wflow['CleanupStatus'][key] for key in wflow['PlineMarkers'] if key in wflow['CleanupStatus']] # If no one have worked on the workflow set the clean status to false if not wflow['PlineMarkers']: cleanStatus = False # If we have a mask longer than the list of flags avoid false positives # because of the behavior explained above - `all([]) == True` elif not cleanFlagsList: cleanStatus = False # Figure out the final value else: cleanStatus = all(cleanFlagsList) return cleanStatus def setClean(self, wflow): """ A function to set the 'IsClean' flag based on the status from all the pipelines which have worked on the workflow (and have put their markers in the 'PlineMarkers' list) :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsClean'] = self._checkClean(wflow) return wflow def _checkLogDBClean(self, wflow): """ An auxiliary function used to only check the LogDB cleanup status. It makes a query to LogDB in order to verify there are no any records for the current workflow :param wflow: A MSRuleCleaner workflow representation :return: True if no records were found in LogDB about wflow """ cleanStatus = False logDBRecords = self.logDB.get(wflow['RequestName']) self.logger.debug("logDBRecords: %s", pformat(logDBRecords)) if not logDBRecords: cleanStatus = True return cleanStatus def setLogDBClean(self, wflow): """ A function to set the 'IsLogDBClean' flag based on the presence of any records in LogDB for the current workflow. 
:param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsLogDBClean'] = self._checkLogDBClean(wflow) if not wflow['IsLogDBClean'] and wflow['IsArchivalDelayExpired']: wflow['IsLogDBClean'] = self._cleanLogDB(wflow) return wflow def _cleanLogDB(self, wflow): """ A function to be used for cleaning all the records related to a workflow in logDB. :param wflow: A MSRuleCleaner workflow representation :return: True if NO errors were encountered while deleting records from LogDB """ cleanStatus = False try: if self.msConfig['enableRealMode']: self.logger.info("Deleting %s records from LogDB WMStats...", wflow['RequestName']) res = self.logDB.delete(wflow['RequestName'], agent=False) if res == 'delete-error': msg = "Failed to delete logDB docs for wflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) cleanStatus = True else: self.logger.info("DRY-RUN: NOT Deleting %s records from LogDB WMStats...", wflow['RequestName']) except Exception as ex: msg = "General Exception while cleaning LogDB records for wflow: %s : %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) return cleanStatus def findTargetStatus(self, wflow): """ Find the proper targeted archival status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Check the available status transitions before we decide the final status targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(wflow['RequestStatus'], []) for status in targetStatusList: if self.targetStatusRegex.match(status): wflow['TargetStatus'] = status self.logger.debug("TargetStatus: %s", wflow['TargetStatus']) return wflow def _checkArchDelayExpired(self, wflow): """ A function to check Archival Expiration Delay based on the information returned by WMStatsServer regarding the time of the last request status transition :param wflow: MSRuleCleaner workflow representation :return: True if the archival delay have been expired """ archDelayExpired = 
False currentTime = int(time.time()) threshold = self.msConfig['archiveDelayHours'] * 3600 try: lastTransitionTime = wflow['RequestTransition'][-1]['UpdateTime'] if lastTransitionTime and (currentTime - lastTransitionTime) > threshold: archDelayExpired = True except KeyError: self.logger.debug("Could not find status transition history for %s", wflow['RequestName']) return archDelayExpired def setArchivalDelayExpired(self, wflow): """ A function to set the 'IsArchivalDelayExpired' flag """ wflow['IsArchivalDelayExpired'] = self._checkArchDelayExpired(wflow) return wflow def archive(self, wflow): """ Move the workflow to the proper archived status after checking the full cleanup status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Make all the needed checks before trying to archive if not (wflow['IsClean'] or wflow['ForceArchive']): msg = "Not properly cleaned workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['TargetStatus']: msg = "Could not determine which archival status to target for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) if not wflow['IsLogDBClean']: msg = "LogDB records have not been cleaned for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['IsArchivalDelayExpired']: msg = "Archival delay period has not yet expired for workflow: %s." % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not self.msConfig['enableRealMode']: msg = "Real Run Mode not enabled." 
raise MSRuleCleanerArchivalSkip(msg) # Proceed with the actual archival: try: self.reqmgr2.updateRequestStatus(wflow['RequestName'], wflow['TargetStatus']) msg = "Successful status transition to: %s for workflow: %s" self.logger.info(msg, wflow['TargetStatus'], wflow['RequestName']) except Exception as ex: msg = "General Exception while trying status transition to: %s " % wflow['TargetStatus'] msg += "for workflow: %s : %s" % (wflow['RequestName'], str(ex)) raise MSRuleCleanerArchivalError(msg) return wflow def getMSOutputTransferInfo(self, wflow): """ Fetches the transfer information from the MSOutput REST interface for the given workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ headers = {'Accept': 'application/json'} params = {} url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'], wflow['RequestName']) try: res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res)['result'][0] transferInfo = data['transferDoc'] except Exception as ex: msg = "General exception while fetching TransferInfo from MSOutput for %s. " msg += "Error: %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) # Set Transfer status - information fetched from MSOutput only if transferInfo is not None and transferInfo['TransferStatus'] == 'done': wflow['TransferDone'] = True # Set Tape rules status - information fetched from Rucio (tape rule ids from MSOutput) if transferInfo is not None and transferInfo['OutputMap']: tapeRulesStatusList = [] # For setting 'TransferTape' = True we require either no tape rules for the # workflow have been created or all existing tape rules to be in status 'OK', # so every empty TapeRuleID we consider as completed. 
for mapRecord in transferInfo['OutputMap']: if not mapRecord['TapeRuleID']: continue rucioRule = self.rucio.getRule(mapRecord['TapeRuleID']) if not rucioRule: tapeRulesStatusList.append(False) msg = "Tape rule: %s not found for workflow: %s " msg += "Possible server side error." self.logger.error(msg, mapRecord['TapeRuleID'], wflow['RequestName']) continue if rucioRule['state'] == 'OK': tapeRulesStatusList.append(True) msg = "Tape rule: %s in final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) else: tapeRulesStatusList.append(False) msg = "Tape rule: %s in non final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) if all(tapeRulesStatusList): wflow['TransferTape'] = True return wflow def setParentDatasets(self, wflow): """ Used to resolve parent datasets for a workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ if wflow['InputDataset'] and wflow['IncludeParents']: childDataset = wflow['InputDataset'] parentDataset = findParent([childDataset], self.msConfig['dbsUrl']) # NOTE: If findParent() returned None then the DBS service failed to # resolve the request (it is considered an ERROR outside WMCore) if parentDataset.get(childDataset, None) is None: msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName']) raise MSRuleCleanerResolveParentError(msg) elif parentDataset: wflow['ParentDataset'] = [parentDataset[childDataset]] msg = "Found parent %s for input dataset %s in workflow: %s " self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName']) else: msg = "Could not find parent for input dataset: %s in workflows: %s" self.logger.error(msg, wflow['InputDataset'], wflow['RequestName']) return wflow def getRucioRules(self, wflow, gran, rucioAcct): """ Queries Rucio and builds the relevant list of blocklevel rules for the 
given workflow :param wflow: A MSRuleCleaner workflow representation :param gran: Data granularity to search for Rucio rules. Possible values: 'block' or 'container' :return: The workflow object """ currPline = wflow['PlineMarkers'][-1] # Create the container list to the rucio account map and set the checkGlobalLocks flag. mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"], self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup", "DataPileup", "ParentDataset"]} if rucioAcct == self.msConfig['rucioMStrAccount']: checkGlobalLocks = True else: checkGlobalLocks = False # Find all the data placement rules created by the components: for dataType in mapRuleType[rucioAcct]: dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]] for dataCont in dataList: if dataCont is None: continue self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont)) if checkGlobalLocks and dataCont in self.globalLocks: msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " msg += "RulesToClean list for both container and block level Rules for workflow: %s!" self.logger.info(msg, dataCont, wflow['RequestName']) continue if gran == 'container': for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s container-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) elif gran == 'block': try: blocks = self.rucio.getBlocksInContainer(dataCont) for block in blocks: for rule in self.rucio.listDataRules(block, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s block-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) except WMRucioDIDNotFoundException: msg = "Container: %s not found in Rucio for workflow: %s." 
self.logger.info(msg, dataCont, wflow['RequestName']) return wflow def cleanRucioRules(self, wflow): """ Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner workflow representation. And fills the relevant Cleanup Status. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # NOTE: The function should be called independently and sequentially from # The Input and the respective BlockLevel pipelines. # NOTE: The current functional pipeline is always the last one in the PlineMarkers list currPline = wflow['PlineMarkers'][-1] delResults = [] if self.msConfig['enableRealMode']: for rule in wflow['RulesToClean'][currPline]: self.logger.info("%s: Deleting ruleId: %s ", currPline, rule) delResult = self.rucio.deleteRule(rule) delResults.append(delResult) if not delResult: self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule) else: for rule in wflow['RulesToClean'][currPline]: delResults.append(True) self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule) # Set the cleanup flag: wflow['CleanupStatus'][currPline] = all(delResults) return wflow def getRequestRecords(self, reqStatus): """ Queries ReqMgr2 for requests in a given status. :param reqStatus: The status for the requests to be fetched from ReqMgr2 :return requests: A dictionary with all the workflows in the given status """ self.logger.info("Fetching requests in status: %s", reqStatus) result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True) if not result: requests = {} else: requests = result[0] self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) return requests
class LogDBTest(unittest.TestCase):
    """
    _LogDBTest_

    Unit tests for the LogDB API and its LogDBBackend/LogDBReport helpers,
    exercised against a CouchDB instance provisioned by TestInitCouchApp.
    """

    def setUp(self):
        """
        _setUp_

        Setup the database and logging connection.
        """
        logging.basicConfig()
        self.logger = logging.getLogger('LogDBTest')
        # self.logger.setLevel(logging.DEBUG)
        self.schema = []
        self.couchApps = ["LogDB"]
        self.testInit = TestInitCouchApp('LogDBTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection()
        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        dbName = 'logdb_t'
        self.testInit.setupCouch(dbName, *self.couchApps)
        url = "%s/%s" % (self.testInit.couchUrl, dbName)
        # Two agent instances sharing an identifier but using different
        # thread names, plus a user (DN-identified) instance.
        identifier = 'agentname'
        self.agent_inst = LogDB(url, identifier, logger=self.logger, thread_name="MainThread")
        self.agent_inst2 = LogDB(url, identifier, logger=self.logger, thread_name="Test")
        identifier = '/DC=org/DC=doegrids/OU=People/CN=First Last 123'
        self.user_inst = LogDB(url, identifier, logger=self.logger, thread_name="MainThread")
        self.report = LogDBReport(self.agent_inst)

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        self.testInit.tearDownCouch()

    def test_docid(self):
        "Test docid API"
        request = 'abs'
        mtype = self.agent_inst.backend.prefix('info')
        res = self.agent_inst.backend.docid(request, mtype)
        # The doc id is expected to be a hash over request/identifier/thread/mtype
        key = '--'.join([request, self.agent_inst.identifier, self.agent_inst.thread_name, mtype])
        khash = gen_hash(key)
        self.assertEqual(res, khash)

    def test_prefix(self):
        "Test prefix LogDBBackend API"
        # Agent instances prefix message types with 'agent-'; user instances do not
        self.assertEqual(self.agent_inst.agent, 1)
        mtype = self.agent_inst.backend.prefix('msg')
        self.assertEqual(mtype, 'agent-msg')
        self.assertEqual(self.user_inst.agent, 0)
        mtype = self.user_inst.backend.prefix('msg')
        self.assertEqual(mtype, 'msg')

    def test_apis(self):
        "Test LogDB APIs"
        request = 'abc'
        # if we post messages for the same request only last one should survive
        self.agent_inst.post(request, 'msg1')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg2')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg3')
        # time.sleep(1)
        self.agent_inst.post(request, 'msg4')
        # time.sleep(1)
        docs = self.agent_inst.get(request)
        self.assertEqual(len(docs), 1)
        self.assertEqual(docs[0]['msg'], 'msg4')
        # add message as user, user based messages will grow
        self.user_inst.post(request, 'doc1')
        # time.sleep(1)
        docs = self.user_inst.get(request)
        self.assertEqual(len(docs), 2)  # we have msg4 and doc1
        self.user_inst.post(request, 'doc1')  # we have msg2 and two doc1 messages
        # time.sleep(1)
        docs = self.user_inst.get(request)
        self.assertEqual(len(docs), 3)

    def test_cleanup(self):
        "Test clean-up LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'comment')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        all_docs = self.agent_inst.get(request)
        # NOTE(review): thr appears to be a threshold in seconds; a positive
        # value keeps recent docs, a negative one expires everything — confirm
        # against LogDBBackend.cleanup.
        self.agent_inst.backend.cleanup(thr=10)  # look into past
        past_docs = self.agent_inst.get(request)
        self.assertEqual(len(all_docs), len(past_docs))
        self.agent_inst.backend.cleanup(thr=-10)  # look into future
        docs = self.agent_inst.get(request)
        self.assertEqual(len(docs), 0)

    def test_get_all_requests(self):
        "Test get_all_requests LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        sum_docs = self.agent_inst.get(request)
        self.assertEqual(len(sum_docs), 3)  # since we have two info records
        request = 'abc2'
        self.agent_inst.post(request, 'msg1', 'info')
        all_docs = self.agent_inst.get_all_requests()
        self.assertEqual(len(all_docs), 2)  # we only have two distinct request names

    def test_delete(self):
        "Test delete LogDB API"
        request = 'abc'
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.delete(request)
        all_docs = self.agent_inst.get_all_requests()
        self.assertEqual(len(all_docs), 0)

    def test_get(self):
        "Test get LogDB API"
        request = 'abc'
        self.agent_inst2.post(request, 'msg1', 'info')  # doc with different thread name
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        docs = self.agent_inst.get(request)
        self.assertEqual(len(docs), 4)  # there should be 4 docs one for Test thread and 3 for MainThread
        docs = self.agent_inst2.get(request)
        self.assertEqual(len(docs), 4)  # this should result the same as above.
        docs1 = self.agent_inst.get(request, 'info')
        docs2 = self.agent_inst.get(request, 'info')
        # Both filtered queries should return docs for exactly one request name
        req1 = set([r['request'] for r in docs1])
        self.assertEqual(len(req1), 1)
        req2 = set([r['request'] for r in docs2])
        self.assertEqual(len(req2), 1)
        self.assertEqual(req1, req2)

    def test_thread_name(self):
        "Test thread name support by LogDB APIs"
        request = 'abc'
        self.agent_inst2.post(request, 'msg1', 'info')  # doc with different thread name
        self.agent_inst.post(request, 'msg1', 'info')
        self.agent_inst.post(request, 'msg2', 'info')
        self.agent_inst.post(request, 'msg3', 'warning')
        self.agent_inst.post(request, 'msg4', 'error')
        maindocs = self.agent_inst.get(request)
        self.assertEqual(len(maindocs), 4)  # since we have two info records in agent_inst
        thr_docs = self.agent_inst2.get(request)
        self.assertEqual(len(thr_docs), 4)  # number of docs in different thread
        req1 = set([r['request'] for r in maindocs])
        self.assertEqual(len(req1), 1)
        req2 = set([r['request'] for r in thr_docs])
        self.assertEqual(len(req2), 1)
        self.assertEqual(req1, req2)
        # Docs from both thread names are visible from either instance
        worker1 = set([r['thr'] for r in maindocs])
        self.assertEqual(worker1, set(["Test", "MainThread"]))
        worker2 = set([r['thr'] for r in thr_docs])
        self.assertEqual(worker2, set(["Test", "MainThread"]))
        docs = set([r['identifier'] for r in maindocs])
        self.assertEqual(docs, set([self.agent_inst.identifier]))
        docs = set([r['identifier'] for r in thr_docs])
        self.assertEqual(docs, set([self.agent_inst.identifier]))

    def test_checks(self):
        "Tests LogDB check/mtype functionality"
        request = 'abc'
        # An unknown message type via the high-level API returns an error
        # marker instead of raising
        res = self.agent_inst.post(request, 'msg', 'ingo')  # should be silent, i.o. no Exceptions
        self.assertEqual('post-error', res)
        # The backend APIs raise LogDBError for bad types / empty requests
        args = (request, 'msg', 'ingo')  # wrong type
        self.assertRaises(LogDBError, self.agent_inst.backend.agent_update, *args)
        args = (request, 'ingo')  # wrong type
        self.assertRaises(LogDBError, self.agent_inst.backend.check, *args)
        args = ('', 'msg', 'info')  # empty request string
        self.assertRaises(LogDBError, self.agent_inst.backend.agent_update, *args)
        args = ('', 'info')  # empty request string
        self.assertRaises(LogDBError, self.agent_inst.backend.check, *args)

    def test_report(self):
        "Test LogDBReport APIs"
        request = 'abc'
        # Sleep between posts so the timestamps differ and ordering is testable
        self.agent_inst.post(request, 'msg1', 'info')
        time.sleep(1)
        self.agent_inst.post(request, 'msg2', 'comment')
        time.sleep(1)
        self.agent_inst.post(request, 'msg3', 'warning')
        time.sleep(1)
        self.agent_inst.post(request, 'msg4', 'error')
        time.sleep(1)
        self.report.to_stdout(request)
        out = self.report.to_txt(request)
        print("\n### txt report\n", out)
        out = self.report.to_html(request)
        print("\n### html report\n", out)
        docs = self.agent_inst.get(request)
        tdocs = self.report.orderby(docs, order='ts')
        messages = [d['msg'] for d in tdocs]
        # Ordering by timestamp is newest-first
        self.assertEqual(messages, ['msg4', 'msg3', 'msg2', 'msg1'])

    def test_heartbeat(self):
        "Test heartbeat_report function"
        threadName = 'MainThread'
        failThread = 'FailThread'
        self.agent_inst.post(msg="Test heartbeat", mtype="info")
        time.sleep(1)
        self.agent_inst.post(msg="Test heartbeatError", mtype="error")
        report = self.agent_inst.heartbeat_report()
        self.assertEqual(report[threadName]['type'], 'agent-error')
        # A thread whose latest message is an error is reported as down
        report = self.agent_inst.wmstats_down_components_report([threadName])
        self.assertEqual(report['down_components'], [threadName])
        # A newer info message clears the down state
        self.agent_inst.post(msg="Test heartbeat", mtype="info")
        report = self.agent_inst.wmstats_down_components_report([threadName])
        self.assertEqual(report['down_components'], [])
        # A thread with no heartbeat at all is reported as down
        report = self.agent_inst.wmstats_down_components_report([threadName, failThread])
        self.assertEqual(report['down_components'], [failThread])
        self.agent_inst.post(msg="Test heartbeatError2", mtype="error")
        report = self.agent_inst.wmstats_down_components_report([threadName])
        self.assertEqual(report['down_components'], [threadName])
        # Deleting the error docs for this thread clears the down state again
        self.agent_inst.delete(mtype="error", this_thread=True)
        report = self.agent_inst.wmstats_down_components_report([threadName])
        self.assertEqual(report['down_components'], [])