def agentsSites(url):
    "Return list of sites known in CMS WMAgents"
    sites_ready_in_agent = set()
    headers = {'Accept': 'application/json'}
    params = {}
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    data = json.loads(res)
    agents = {}
    for r in [i['value'] for i in data['rows']]:
        team = r['agent_team']
        if team != 'production':
            continue
        agents.setdefault(team, []).append(r)
    for team, teamAgents in viewitems(agents):
        for agent in teamAgents:
            if agent['status'] != 'ok':
                continue
            for site, sinfo in viewitems(agent['WMBS_INFO']['thresholds']):
                if sinfo['state'] in ['Normal']:
                    sites_ready_in_agent.add(site)
    return sites_ready_in_agent
def renewRucioToken(rucioAuthUrl, userToken):
    """
    Provided a user Rucio token, check its lifetime and extend it by another hour
    :param rucioAuthUrl: url to the rucio authentication server
    :param userToken: the user Rucio token to be renewed
    :return: a datetime.datetime object with the new token lifetime
    """
    params = {}
    headers = {"X-Rucio-Auth-Token": userToken}
    url = '%s/auth/validate' % rucioAuthUrl
    logging.info("Renewing the Rucio token...")
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    try:
        # the response body is evaluated as-is; 'lifetime' carries the new expiration time
        newExpiration = eval(res)['lifetime']
    except Exception as exc:
        raise RuntimeError("Failed to renew Rucio token. Response: {} Error: {}".format(res, str(exc)))
    return newExpiration
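# Usage sketch (not from the original module): how renewRucioToken() above might be
# called; the auth server URL and token string are hypothetical placeholder values.
rucioAuthUrl = "https://rucio-auth.example.cern.ch"   # hypothetical auth endpoint
userToken = "fake-user-token"                         # hypothetical token string
newLifetime = renewRucioToken(rucioAuthUrl, userToken)
logging.info("Rucio token is now valid until: %s", newLifetime)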
class MicroServiceTest(unittest.TestCase):
    "Unit test for MicroService module"

    def setUp(self):
        "Setup MicroService for testing"
        self.app = ServiceManager()
        config = TestConfig
        manager = 'WMCore_t.Services_t.MicroService_t.MicroService_t.ServiceManager'
        config.views.data.manager = manager
        config.manager = manager
        mount = '/microservice'
        self.mgr = RequestHandler()
        self.port = config.main.port
        self.url = 'http://localhost:%s%s/data' % (self.port, mount)
        cherrypy.config["server.socket_port"] = self.port
        self.server = RestInterface(self.app, config, mount)
        cherrypy.tree.mount(self.server, mount)
        cherrypy.engine.start()

    def tearDown(self):
        "Tear down MicroService"
        cherrypy.engine.exit()
        cherrypy.engine.stop()

    def postRequest(self, params):
        "Perform POST request to our MicroService"
        headers = {'Content-type': 'application/json'}
        print("### post call %s params=%s headers=%s" % (self.url, params, headers))
        data = self.mgr.getdata(self.url, params=params, headers=headers,
                                verb='POST', cert=cert(), ckey=ckey())
        print("### post call data %s" % data)
        return json.loads(data)

    def test_getState(self):
        "Test function for getting state of the MicroService"
        url = '%s/status' % self.url
        data = self.mgr.getdata(url, params={})
        state = "bla"
        data = {"request": {"state": state}}
        self.postRequest(data)
        data = self.mgr.getdata(url, params={})
        data = json.loads(data)
        print("### url=%s, data=%s" % (url, data))
        for row in data['result']:
            if 'state' in row:
                self.assertEqual(state, row['state'])
def getSpec(self, request, reqSpecs=None):
    "Get request from workload cache"
    if reqSpecs and request['RequestName'] in reqSpecs:
        return reqSpecs[request['RequestName']]
    url = str('%s/%s/spec' % (self.msConfig['reqmgrCacheUrl'], request['RequestName']))
    mgr = RequestHandler()
    data = mgr.getdata(url, params={}, cert=cert(), ckey=ckey())
    return pickle.loads(data)
def findParent(dataset): "Helper function to find a parent of the dataset" url = '%s/datasetparents' % dbsUrl() params = {'dataset': dataset} headers = {'Accept': 'application/json'} mgr = RequestHandler() data = mgr.getdata(url, params=params, headers=headers, cert=cert(), ckey=ckey()) return [str(i['parent_dataset']) for i in json.loads(data)]
def getSpec(request, reqSpecs=None):
    "Get request from workload cache"
    if reqSpecs and request['RequestName'] in reqSpecs:
        return reqSpecs[request['RequestName']]
    url = str('%s/%s/spec' % (reqmgrCacheUrl(), request['RequestName']))
    mgr = RequestHandler()
    data = mgr.getdata(url, params={}, cert=cert(), ckey=ckey())
    return pickle.loads(data)
class MicroServiceTest(unittest.TestCase):
    "Unit test for MicroService module"

    def setUp(self):
        "Setup MicroService for testing"
        self.app = ServiceManager()
        config = TestConfig
        manager = 'WMCore_t.Services_t.MicroService_t.MicroService_t.ServiceManager'
        config.views.data.manager = manager
        config.manager = manager
        mount = '/microservice'
        self.mgr = RequestHandler()
        self.port = config.main.port
        self.url = 'http://localhost:%s%s/data' % (self.port, mount)
        cherrypy.config["server.socket_port"] = self.port
        self.server = RestApiHub(self.app, config, mount)
        cherrypy.tree.mount(self.server, mount)
        cherrypy.engine.start()

    def tearDown(self):
        "Tear down MicroService"
        cherrypy.engine.exit()
        cherrypy.engine.stop()

    def postRequest(self, params):
        "Perform POST request to our MicroService"
        headers = {'Content-type': 'application/json'}
        print("### post call %s params=%s headers=%s" % (self.url, params, headers))
        data = self.mgr.getdata(self.url, params=params, headers=headers,
                                verb='POST', cert=cert(), ckey=ckey())
        print("### post call data %s" % data)
        return json.loads(data)

    def test_getState(self):
        "Test function for getting state of the MicroService"
        url = '%s/status' % self.url
        data = self.mgr.getdata(url, params={})
        state = "bla"
        data = {"request": {"state": state}}
        self.postRequest(data)
        data = self.mgr.getdata(url, params={})
        data = json.loads(data)
        print("### url=%s, data=%s" % (url, data))
        for row in data['result']:
            if 'state' in row:
                self.assertEqual(state, row['state'])
def getNodes(kind):
    "Get list of PhEDEx nodes"
    params = {}
    headers = {'Accept': 'application/json'}
    url = '%s/nodes' % phedexUrl()
    mgr = RequestHandler()
    data = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    nodes = json.loads(data)['phedex']['node']
    return [node['name'] for node in nodes if node['kind'] == kind]
def getWorkflows(state):
    "Get list of workflows from ReqMgr2 data-service"
    url = '%s/data/request' % reqmgrUrl()
    headers = {'Accept': 'application/json'}
    params = {'status': state}
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    data = json.loads(res)
    return data.get('result', [])
def getWorkflow(requestName): "Get list of workflow info from ReqMgr2 data-service for given request name" headers = {'Accept': 'application/json'} params = {} url = '%s/data/request/%s' % (reqmgrUrl(), requestName) mgr = RequestHandler() res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res) return data.get('result', [])
def getNodesForId(phedexid):
    "Helper function to get nodes for given phedex id"
    url = '%s/requestlist' % phedexUrl()
    params = {'request': str(phedexid)}
    headers = {'Accept': 'application/json'}
    mgr = RequestHandler()
    data = mgr.getdata(url, params, headers, ckey=ckey(), cert=cert())
    items = json.loads(data)['phedex']['request']
    nodes = [n['name'] for i in items for n in i['node']]
    return list(set(nodes))
def getRequest(url, params):
    "Helper function to GET data from given URL"
    mgr = RequestHandler()
    headers = {'Accept': 'application/json'}
    verbose = 0
    if 'verbose' in params:
        verbose = params['verbose']
        del params['verbose']
    data = mgr.getdata(url, params, headers, ckey=ckey(), cert=cert(), verbose=verbose)
    return data
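# Usage sketch (not from the original module): fetching requests through getRequest()
# above; reqmgrUrl() is the same helper used elsewhere in these snippets, the status
# value is illustrative, and 'verbose' is popped from params before the HTTP call.
url = '%s/data/request' % reqmgrUrl()
params = {'status': 'assigned', 'verbose': 1}
rawData = getRequest(url, params)
result = json.loads(rawData).get('result', [])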
def getdata(url, params, headers=None):
    "Helper function to get data from the service"
    ckey, cert = getKeyCertFromEnv()
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey, cert=cert)
    return json.loads(res)
def postRequest(url, params):
    "Helper function to POST request to given URL"
    mgr = RequestHandler()
    headers = {'Accept': 'application/json'}
    verbose = 0
    if 'verbose' in params:
        verbose = params['verbose']
        del params['verbose']
    data = mgr.getdata(url, params, headers, ckey=ckey(), cert=cert(),
                       verb='POST', verbose=verbose)
    return data
def getDetoxQuota(url):
    "Helper function to fetch Detox quota information from the given URL"
    headers = {}
    params = {}
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    res = res.split('\n')
    return res
def findParent(dataset, dbsUrl):
    "Helper function to find a parent of the dataset"
    url = '%s/datasetparents' % dbsUrl
    params = {'dataset': dataset}
    headers = {'Accept': 'application/json'}
    mgr = RequestHandler()
    data = mgr.getdata(url, params=params, headers=headers, cert=cert(), ckey=ckey())
    return [str(i['parent_dataset']) for i in json.loads(data)]
def getNodeQueues():
    "Helper function to fetch nodes usage from PhEDEx data service"
    headers = {'Accept': 'application/json'}
    params = {}
    mgr = RequestHandler()
    url = '%s/nodeusagehistory' % phedexUrl()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    data = json.loads(res)
    ret = defaultdict(int)
    for node in data['phedex']['node']:
        for usage in node['usage']:
            ret[node['name']] += int(usage['miss_bytes'] / 1024.**4)  # convert bytes to TB (1024**4)
    return ret
def getWorkflow(requestName, reqMgrUrl):
    "Get list of workflow info from ReqMgr2 data-service for given request name"
    headers = {'Accept': 'application/json'}
    params = {}
    url = '%s/data/request/%s' % (reqMgrUrl, requestName)
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    data = json.loads(res)
    return data.get('result', [])
def workqueueRequests(state=None):
    "Helper function to get requests from WorkQueue"
    url = workqueueView('jobsByRequest')
    if state:
        pass  # we may need to use state when we'll query WorkQueue
    params = {}
    headers = {'Accept': 'application/json'}
    mgr = RequestHandler()
    data = mgr.getdata(url, params=params, headers=headers, cert=cert(), ckey=ckey())
    data = json.loads(data)
    rdict = {}
    for row in data.get('rows', []):
        rdict[row['key']] = row['value']
    return rdict
def _postRequest(self, url, params, verb='POST', verbose=0):
    "Helper function to POST request to given URL"
    mgr = RequestHandler(logger=self.logger)
    headers = copy(self.configDict['headers'])
    headers.update({"Authorization": self._token})
    try:
        data = mgr.getdata(url, params, headers, verb=verb, verbose=verbose)
        return json.loads(data)
    except Exception as exc:
        self.logger.error("Failed to retrieve data from MonIT. Error: %s", str(exc))
        return None
class AuxCacheUpdateTasks(CherryPyPeriodicTask):
    """
    Updates the auxiliary database documents periodically (i.e. TagCollector)
    """

    def __init__(self, rest, config):
        super(AuxCacheUpdateTasks, self).__init__(config)
        self.reqmgrAux = ReqMgrAux(config.reqmgr2_url, logger=self.logger)
        self.mgr = RequestHandler()

    def setConcurrentTasks(self, config):
        """
        sets the list of functions which will be run concurrently
        """
        self.concurrentTasks = [{'func': self.updateAuxiliarDocs, 'duration': config.tagCollectDuration}]

    def updateAuxiliarDocs(self, config):
        """
        Update the central couch database with auxiliary documents
        that need to be constantly updated whenever an update is
        made at the data source
        """
        self.logger.info("Updating auxiliary couch documents ...")
        self.reqmgrAux.populateCMSSWVersion(config.tagcollect_url, **config.tagcollect_args)
        try:
            data = self.mgr.getdata(config.unified_url, params={},
                                    headers={'Accept': 'application/json'})
            data = json.loads(data)
        except Exception as ex:
            msg = "Failed to retrieve unified configuration from github. Error: %s" % str(ex)
            msg += "\nRetrying again in the next cycle"
            self.logger.error(msg)
            return
        self.reqmgrAux.updateUnifiedConfig(data, docName="config")
def alterSubscription(phedexid, decision, comments, nodes=None):
    "Helper function to alter subscriptions for given phedex id and nodes"
    mgr = RequestHandler()
    headers = {'Accept': 'application/json'}
    nodes = nodes if nodes else getNodesForId(phedexid)
    params = {
        'decision': decision,
        'request': phedexid,
        'node': ','.join(nodes),
        'comments': comments
    }
    url = '%s/updaterequest' % phedexUrl()
    data = mgr.getdata(url, params, headers, ckey=ckey(), cert=cert(), verb='POST')
    result = json.loads(data)
    if not result:
        return False
    if 'already' in result:
        return True
    return result
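# Usage sketch (not from the original module): approving a transfer request via
# alterSubscription() above; the phedex id and comment are made-up values, and the
# destination nodes default to whatever getNodesForId() returns for that id.
outcome = alterSubscription(12345, 'approve', 'approved by ops sketch')
if outcome is True:
    print("Request 12345 was approved (or had already been processed)")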
def agentsSites(url):
    "Return list of sites known in CMS WMAgents"
    sites_ready_in_agent = set()
    headers = {'Accept': 'application/json'}
    params = {}
    mgr = RequestHandler()
    res = mgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert())
    data = json.loads(res)
    agents = {}
    for r in [i['value'] for i in data['rows']]:
        team = r['agent_team']
        if team != 'production':
            continue
        agents.setdefault(team, []).append(r)
    for team, teamAgents in agents.items():
        for agent in teamAgents:
            if agent['status'] != 'ok':
                continue
            # use items() instead of the python2-only iteritems()
            for site, sinfo in agent['WMBS_INFO']['thresholds'].items():
                if sinfo['state'] in ['Normal']:
                    sites_ready_in_agent.add(site)
    return sites_ready_in_agent
class MSRuleCleaner(MSCore): """ MSRuleCleaner.py class provides the logic used to clean the Rucio block level data placement rules created by WMAgent. """ def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSRuleCleaner module :param msConfig: micro service configuration """ super(MSRuleCleaner, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") self.msConfig.setdefault('enableRealMode', False) self.mode = "RealMode" if self.msConfig[ 'enableRealMode'] else "DryRunMode" self.emailAlert = EmailAlert(self.msConfig) self.curlMgr = RequestHandler() # Building all the Pipelines: pName = 'plineMSTrCont' self.plineMSTrCont = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.cleanRucioRules) ]) pName = 'plineMSTrBlock' self.plineMSTrBlock = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.cleanRucioRules) ]) pName = 'plineAgentCont' self.plineAgentCont = Pipeline( name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules) ]) pName = 'plineAgentBlock' self.plineAgentBlock = Pipeline( name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules) ]) pName = 'plineArchive' self.plineArchive = Pipeline(name=pName, funcLine=[ Functor(self.setPlineMarker, pName), Functor(self.setClean), Functor(self.archive) ]) # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [ self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont, self.plineAgentBlock ] # Building an auxiliary list of cleanup pipeline names only: self.cleanupPipeNames = [pline.name for pline in self.cleanuplines] # Building lists of pipelines related only to Agents or MStransferror self.agentlines = [self.plineAgentCont, self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, 'archived': 0} def resetCounters(self): """ A simple function for zeroing the cleaned and archived counters. """ for pline in self.cleanuplines: self.wfCounters['cleaned'][pline.name] = 0 self.wfCounters['archived'] = 0 def execute(self, reqStatus): """ Executes the whole ruleCleaner logic :return: summary """ # start threads in MSManager which should call this method summary = dict(RULECLEANER_REPORT) self.currThread = current_thread() self.currThreadIdent = self.currThread.name self.updateReportDict(summary, "thread_id", self.currThreadIdent) self.resetCounters() self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) # Build the list of workflows to work on: try: requestRecords = {} for status in reqStatus: requestRecords.update(self.getRequestRecords(status)) except Exception as err: # general error msg = "Unknown exception while fetching requests from ReqMgr2. 
Error: %s", str( err) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords try: totalNumRequests, cleanNumRequests, archivedNumRequests = self._execute( requestRecords) msg = "\nNumber of processed workflows: %s." msg += "\nNumber of properly cleaned workflows: %s." msg += "\nNumber of archived workflows: %s." self.logger.info(msg, totalNumRequests, cleanNumRequests, archivedNumRequests) self.updateReportDict(summary, "total_num_requests", totalNumRequests) self.updateReportDict(summary, "clean_num_requests", cleanNumRequests) self.updateReportDict(summary, "archived_num_requests", archivedNumRequests) except Exception as ex: msg = "Unknown exception while running MSRuleCleaner thread Error: %s" self.logger.exception(msg, str(ex)) self.updateReportDict(summary, "error", msg) return summary def _execute(self, reqRecords): """ Executes the MSRuleCleaner pipelines based on the workflow status :param reqList: A list of RequestRecords to work on :return: a tuple with: number of properly cleaned requests number of processed workflows number of archived workflows """ # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival # Pipelines are executed sequentially in the above order. # This way we assure ourselves that we archive only workflows # that have accomplished the needed cleanup cleanNumRequests = 0 totalNumRequests = 0 # Call the workflow dispatcher: for _, req in reqRecords.items(): wflow = MSRuleCleanerWflow(req) self._dispatchWflow(wflow) msg = "\n----------------------------------------------------------" msg += "\nMSRuleCleanerWflow: %s" msg += "\n----------------------------------------------------------" self.logger.debug(msg, pformat(wflow)) totalNumRequests += 1 if self._checkClean(wflow): cleanNumRequests += 1 # Report the counters: for pline in self.cleanuplines: msg = "Workflows cleaned by pipeline: %s: %d" self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name]) archivedNumRequests = self.wfCounters['archived'] self.logger.info("Workflows archived: %d", self.wfCounters['archived']) return totalNumRequests, cleanNumRequests, archivedNumRequests def _dispatchWflow(self, wflow): """ A function intended to dispatch a workflow (e.g based on its status) through one or more functional pipelines in case there is some more complicated logic involved in the order we execute them but not just a sequentially """ self.logger.debug("Dispatching workflow: %s", wflow['RequestName']) # NOTE: The following dispatch logic is a subject to be changed at any time # Resolve: # NOTE: First resolve any preliminary flags that will be needed further # in the logic of the _dispatcher() itself if wflow['RequestStatus'] == 'announced': self.getMSOutputTransferInfo(wflow) # Clean: # Do not clean any Resubmission, but still let them be archived if wflow['RequestType'] == 'Resubmission': wflow['ForceArchive'] = True msg = "Skipping cleanup step for workflow: %s - RequestType is %s." msg += " Will try to archive it directly." self.logger.info(msg, wflow['RequestName'], wflow['RequestType']) elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']: # NOTE: We do not check the ParentageResolved flag for these # workflows, but we do need to clean output data placement # rules from the agents for them for pline in self.agentlines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. 
" msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 elif wflow['RequestStatus'] == 'announced' and not wflow[ 'ParentageResolved']: # NOTE: We skip workflows which are not having 'ParentageResolved' # flag, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow[ 'RequestStatus'] == 'announced' and not wflow['TransferDone']: # NOTE: We skip workflows which have not yet finalised their TransferStatus # in MSOutput, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced': for pline in self.cleanuplines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 else: # We shouldn't be here: msg = "Skipping workflow: %s - " msg += "Does not fall under any of the defined categories." self.logger.error(msg, wflow['RequestName']) # Archive: try: self.plineArchive.run(wflow) self.wfCounters['archived'] += 1 except MSRuleCleanerArchival as ex: msg = "%s: Archival Error: %s. " msg += " Will retry again in the next cycle." self.logger.error(msg, wflow['PlineMarkers'][-1], ex.message()) except Exception as ex: msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex)) def setPlineMarker(self, wflow, pName): """ A function intended to mark which is the pipeline currently working on the workflow. It is supposed to be called always as a first function in the pipeline. :param wflow: A MSRuleCleaner workflow representation :param pName: The name of the functional pipeline :return wflow: """ # NOTE: The current functional pipeline MUST always be appended at the # end of the 'PlineMarkers' list # First get rid of the default: if not wflow['PlineMarkers']: wflow['PlineMarkers'] = [] # Then push our current value into the markers list: wflow['PlineMarkers'].append(pName) # Populate the list of flags to be used later: if pName not in wflow['RulesToClean']: if pName in self.cleanupPipeNames: wflow['RulesToClean'][pName] = [] if pName not in wflow['CleanupStatus']: if pName in self.cleanupPipeNames: wflow['CleanupStatus'][pName] = False return wflow def _checkClean(self, wflow): """ An auxiliary function used to only check the temporary cleanup status. It basically takes the pipelines registered in 'PlineMarkers' that have already worked on the workflow as a mask and applies this mask over the set of flags in the 'CleanupStatus' field and then reduces the result to a single bool value """ # NOTE: This is one of the few functions taking a workflow as an argument # but returning a bool, since it is an auxiliary function and is not # supposed to be called as a standalone function in a pipeline. 
# NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows # are also counted as properly cleaned and can trigger archival later # Build a list of bool flags based on the mask of PlineMarkers cleanFlagsList = [ wflow['CleanupStatus'][key] for key in wflow['PlineMarkers'] if key in wflow['CleanupStatus'].keys() ] # If no one have worked on the workflow set the clean status to false if not wflow['PlineMarkers']: cleanStatus = False # If we have a mask longer than the list of flags avoid false positives # because of the behavior explained above - `all([]) == True` elif not cleanFlagsList: cleanStatus = False # Figure out the final value else: cleanStatus = all(cleanFlagsList) return cleanStatus def setClean(self, wflow): """ A function to set the 'IsClean' flag based on the status from all the pipelines which have worked on the workflow (and have put their markers in the 'PlineMarkers' list) :param wflow: A MSRuleCleaner workflow representation :return wflow: """ wflow['IsClean'] = self._checkClean(wflow) return wflow def archive(self, wflow): """ Move the workflow to the proper archived status after checking the full cleanup status :param wflow: A MSRuleCleaner workflow representation :param archStatus: Target status to transition after archival :return wflow: """ # NOTE: check allowed status transitions with: # https://github.com/dmwm/WMCore/blob/5961d2229b1e548e58259c06af154f33bce36c68/src/python/WMCore/ReqMgr/DataStructs/RequestStatus.py#L171 if not (wflow['IsClean'] or wflow['ForceArchive']): msg = "Not properly cleaned workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchival(msg) # Check the available status transitions before we decide the final status targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get( wflow['RequestStatus'], []) self.logger.info("targetStatusList: %s", targetStatusList) return wflow def getMSOutputTransferInfo(self, wflow): """ Fetches the transfer information from the MSOutput REST interface for the given workflow. :param wflow: A MSRuleCleaner workflow representation :return wflow: """ headers = {'Accept': 'application/json'} params = {} url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'], wflow['RequestName']) try: res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res)['result'][0] transferInfo = data['transferDoc'] except Exception as ex: msg = "General exception while fetching TransferInfo from MSOutput for %s. " msg += "Error: %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) if transferInfo is not None and transferInfo[ 'TransferStatus'] == 'done': wflow['TransferDone'] = True return wflow def getRucioRules(self, wflow, gran, rucioAcct): """ Queries Rucio and builds the relevant list of blocklevel rules for the given workflow :param wflow: A MSRuleCleaner workflow representation :param gran: Data granularity to search for Rucio rules. 
Possible values: 'block' || 'container' :return: wflow """ currPline = wflow['PlineMarkers'][-1] # Find all the output placement rules created by the agents for dataCont in wflow['OutputDatasets']: if gran == 'container': for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) elif gran == 'block': try: blocks = self.rucio.getBlocksInContainer(dataCont) for block in blocks: for rule in self.rucio.listDataRules( block, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) except WMRucioDIDNotFoundException: msg = "Container: %s not found in Rucio for workflow: %s." self.logger.info(msg, dataCont, wflow['RequestName']) return wflow def cleanRucioRules(self, wflow): """ Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner workflow representation. And fills the relevant Cleanup Status. :param wflow: A MSRuleCleaner workflow representation :return: wflow """ # NOTE: The function should be called independently and sequentially from # The Input and the respective BlockLevel pipelines. # NOTE: The current functional pipeline is always the last one in the PlineMarkers list currPline = wflow['PlineMarkers'][-1] delResults = [] if self.msConfig['enableRealMode']: for rule in wflow['RulesToClean'][currPline]: self.logger.info("%s: Deleting ruleId: %s ", currPline, rule) delResult = self.rucio.deleteRule(rule) delResults.append(delResult) if not delResult: self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule) else: for rule in wflow['RulesToClean'][currPline]: delResults.append(True) self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule) # Set the cleanup flag: wflow['CleanupStatus'][currPline] = all(delResults) # ---------------------------------------------------------------------- # FIXME : To be removed once the plineMSTrBlock && plineMSTrCont are # developed if wflow['CleanupStatus'][currPline] in [ 'plineMSTrBlock', 'plineMSTrCont' ]: wflow['CleanupStatus'][currPline] = True # ---------------------------------------------------------------------- return wflow def getRequestRecords(self, reqStatus): """ Queries ReqMgr2 for requests in a given status. :param reqStatus: The status for the requests to be fetched from ReqMgr2 :return requests: A dictionary with all the workflows in the given status """ self.logger.info("Fetching requests in status: %s", reqStatus) result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True) if not result: requests = {} else: requests = result[0] self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) return requests
class MSRuleCleaner(MSCore): """ MSRuleCleaner.py class provides the logic used to clean the Rucio block level data placement rules created by WMAgent. """ def __init__(self, msConfig, logger=None): """ Runs the basic setup and initialization for the MSRuleCleaner module :param msConfig: micro service configuration """ super(MSRuleCleaner, self).__init__(msConfig, logger=logger) self.msConfig.setdefault("verbose", True) self.msConfig.setdefault("interval", 60) self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") self.msConfig.setdefault('enableRealMode', False) self.mode = "RealMode" if self.msConfig['enableRealMode'] else "DryRunMode" self.curlMgr = RequestHandler() self.targetStatusRegex = re.compile(r'.*archived') self.logDB = LogDB(self.msConfig["logDBUrl"], self.msConfig["logDBReporter"], logger=self.logger) self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger) # Building all the Pipelines: pName = 'plineMSTrCont' self.plineMSTrCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineMSTrBlock' self.plineMSTrBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.setParentDatasets), Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentCont' self.plineAgentCont = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'container', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules)]) pName = 'plineAgentBlock' self.plineAgentBlock = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.getRucioRules, 'block', self.msConfig['rucioWmaAccount']), Functor(self.cleanRucioRules)]) pName = 'plineArchive' self.plineArchive = Pipeline(name=pName, funcLine=[Functor(self.setPlineMarker, pName), Functor(self.findTargetStatus), Functor(self.setClean), Functor(self.setArchivalDelayExpired), Functor(self.setLogDBClean), Functor(self.archive)]) # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [self.plineMSTrCont, self.plineMSTrBlock, self.plineAgentCont, self.plineAgentBlock] # Building an auxiliary list of cleanup pipeline names only: self.cleanupPipeNames = [pline.name for pline in self.cleanuplines] # Building lists of pipelines related only to Agents or MStransferror self.agentlines = [self.plineAgentCont, self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, 'archived': {'normalArchived': 0, 'forceArchived': 0}} self.globalLocks = set() def getGlobalLocks(self): """ Fetches the list of 'globalLocks' from wmstats server and the list of 'parentLocks' from request manager. Stores/updates the unified set in the 'globalLocks' instance variable. Returns the resultant unified set. 
:return: A union set of the 'globalLocks' and the 'parentLocks' lists """ self.logger.info("Fetching globalLocks list from wmstats server.") try: globalLocks = set(self.wmstatsSvc.getGlobalLocks()) except Exception as ex: msg = "Failed to refresh global locks list for the current polling cycle. Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.logger.info("Fetching parentLocks list from reqmgr2 server.") try: parentLocks = set(self.reqmgr2.getParentLocks()) except Exception as ex: msg = "Failed to refresh parent locks list for the current poling cycle. Error: %s " msg += "Skipping this polling cycle." self.logger.error(msg, str(ex)) raise ex self.globalLocks = globalLocks | parentLocks def resetCounters(self): """ A simple function for zeroing the cleaned and archived counters. """ for pline in self.cleanuplines: self.wfCounters['cleaned'][pline.name] = 0 self.wfCounters['archived']['normalArchived'] = 0 self.wfCounters['archived']['forceArchived'] = 0 def execute(self, reqStatus): """ Executes the whole ruleCleaner logic :return: summary """ # start threads in MSManager which should call this method summary = dict(RULECLEANER_REPORT) self.currThread = current_thread() self.currThreadIdent = self.currThread.name self.updateReportDict(summary, "thread_id", self.currThreadIdent) self.resetCounters() self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) # Build the list of workflows to work on: try: requestRecords = {} for status in reqStatus: requestRecords.update(self.getRequestRecords(status)) except Exception as err: # general error msg = "Unknown exception while fetching requests from ReqMgr2. Error: %s", str(err) self.logger.exception(msg) self.updateReportDict(summary, "error", msg) # Call _execute() and feed the relevant pipeline with the objects popped from requestRecords try: self.getGlobalLocks() totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords) msg = "\nNumber of processed workflows: %s." msg += "\nNumber of properly cleaned workflows: %s." msg += "\nNumber of normally archived workflows: %s." msg += "\nNumber of force archived workflows: %s." self.logger.info(msg, totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests) self.updateReportDict(summary, "total_num_requests", totalNumRequests) self.updateReportDict(summary, "clean_num_requests", cleanNumRequests) self.updateReportDict(summary, "normal_archived_num_requests", normalArchivedNumRequests) self.updateReportDict(summary, "force_archived_num_requests", forceArchivedNumRequests) except Exception as ex: msg = "Unknown exception while running MSRuleCleaner thread Error: %s" self.logger.exception(msg, str(ex)) self.updateReportDict(summary, "error", msg) return summary def _execute(self, reqRecords): """ Executes the MSRuleCleaner pipelines based on the workflow status :param reqList: A list of RequestRecords to work on :return: a tuple with: number of properly cleaned requests number of processed workflows number of archived workflows """ # NOTE: The Input Cleanup, the Block Level Cleanup and the Archival # Pipelines are executed sequentially in the above order. 
# This way we assure ourselves that we archive only workflows # that have accomplished the needed cleanup cleanNumRequests = 0 totalNumRequests = 0 # Call the workflow dispatcher: for req in viewvalues(reqRecords): wflow = MSRuleCleanerWflow(req) self._dispatchWflow(wflow) msg = "\n----------------------------------------------------------" msg += "\nMSRuleCleanerWflow: %s" msg += "\n----------------------------------------------------------" self.logger.debug(msg, pformat(wflow)) totalNumRequests += 1 if self._checkClean(wflow): cleanNumRequests += 1 # Report the counters: for pline in self.cleanuplines: msg = "Workflows cleaned by pipeline: %s: %d" self.logger.info(msg, pline.name, self.wfCounters['cleaned'][pline.name]) normalArchivedNumRequests = self.wfCounters['archived']['normalArchived'] forceArchivedNumRequests = self.wfCounters['archived']['forceArchived'] self.logger.info("Workflows normally archived: %d", self.wfCounters['archived']['normalArchived']) self.logger.info("Workflows force archived: %d", self.wfCounters['archived']['forceArchived']) return totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests def _dispatchWflow(self, wflow): """ A function intended to dispatch a workflow (e.g based on its status) through one or more functional pipelines in case there is some more complicated logic involved in the order we execute them but not just a sequentially """ self.logger.debug("Dispatching workflow: %s", wflow['RequestName']) # NOTE: The following dispatch logic is a subject to be changed at any time # Resolve: # NOTE: First resolve any preliminary flags that will be needed further # in the logic of the _dispatcher() itself if wflow['RequestStatus'] == 'announced': self.getMSOutputTransferInfo(wflow) # Clean: # Do not clean any Resubmission, but still let them be archived if wflow['RequestType'] == 'Resubmission': wflow['ForceArchive'] = True msg = "Skipping cleanup step for workflow: %s - RequestType is %s." msg += " Will try to archive it directly." self.logger.info(msg, wflow['RequestName'], wflow['RequestType']) elif wflow['RequestStatus'] in ['rejected', 'aborted-completed']: # NOTE: We do not check the ParentageResolved flag for these # workflows, but we do need to clean output data placement # rules from the agents for them for pline in self.agentlines: try: pline.run(wflow) except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 elif wflow['RequestStatus'] == 'announced' and not wflow['ParentageResolved']: # NOTE: We skip workflows which are not having 'ParentageResolved' # flag, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'ParentageResolved' flag set to false." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferDone']: # NOTE: We skip workflows which have not yet finalised their TransferStatus # in MSOutput, but we still need some proper logging for them. msg = "Skipping workflow: %s - 'TransferStatus' is 'pending' or 'TransferInfo' is missing in MSOutput." msg += " Will retry again in the next cycle." 
self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced' and not wflow['TransferTape']: # NOTE: We skip workflows which have not yet finalised their tape transfers. # (i.e. even if a single output which is supposed to be covered # by a tape rule is in any of the following transient states: # {REPLICATING, STUCK, SUSPENDED, WAITING_APPROVAL}.) # We still need some proper logging for them. msg = "Skipping workflow: %s - tape transfers are not yet completed." msg += " Will retry again in the next cycle." self.logger.info(msg, wflow['RequestName']) elif wflow['RequestStatus'] == 'announced': for pline in self.cleanuplines: try: pline.run(wflow) except MSRuleCleanerResolveParentError as ex: msg = "%s: Parentage Resolve Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, pline.name, str(ex)) continue except Exception as ex: msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, pline.name, wflow['RequestName'], str(ex)) continue if wflow['CleanupStatus'][pline.name]: self.wfCounters['cleaned'][pline.name] += 1 else: # We shouldn't be here: msg = "Skipping workflow: %s - " msg += "Does not fall under any of the defined categories." self.logger.error(msg, wflow['RequestName']) # Archive: try: self.plineArchive.run(wflow) if wflow['ForceArchive']: self.wfCounters['archived']['forceArchived'] += 1 else: self.wfCounters['archived']['normalArchived'] += 1 except MSRuleCleanerArchivalSkip as ex: msg = "%s: Proper conditions not met: %s. " msg += "Skipping archival in the current cycle." self.logger.info(msg, wflow['PlineMarkers'][-1], str(ex)) except MSRuleCleanerArchivalError as ex: msg = "%s: Archival Error: %s. " msg += "Will retry again in the next cycle." self.logger.error(msg, wflow['PlineMarkers'][-1], str(ex)) except Exception as ex: msg = "%s General error from pipeline. Workflow: %s. Error: \n%s. " msg += "\nWill retry again in the next cycle." self.logger.exception(msg, wflow['PlineMarkers'][-1], wflow['RequestName'], str(ex)) def setPlineMarker(self, wflow, pName): """ A function intended to mark which is the pipeline currently working on the workflow. It is supposed to be called always as a first function in the pipeline. :param wflow: A MSRuleCleaner workflow representation :param pName: The name of the functional pipeline :return: The workflow object """ # NOTE: The current functional pipeline MUST always be appended at the # end of the 'PlineMarkers' list # First get rid of the default: if not wflow['PlineMarkers']: wflow['PlineMarkers'] = [] # Then push our current value into the markers list: wflow['PlineMarkers'].append(pName) # Populate the list of flags to be used later: if pName not in wflow['RulesToClean']: if pName in self.cleanupPipeNames: wflow['RulesToClean'][pName] = [] if pName not in wflow['CleanupStatus']: if pName in self.cleanupPipeNames: wflow['CleanupStatus'][pName] = False return wflow def _checkClean(self, wflow): """ An auxiliary function used to only check the temporary cleanup status. 
It basically takes the pipelines registered in 'PlineMarkers' that have already worked on the workflow as a mask and applies this mask over the set of flags in the 'CleanupStatus' field and then reduces the result to a single bool value """ # NOTE: This is one of the few functions taking a workflow as an argument # but returning a bool, since it is an auxiliary function and is not # supposed to be called as a standalone function in a pipeline. # NOTE: `all([]) == True`, ergo all the 'rejected' && 'aborted-completed' workflows # are also counted as properly cleaned and can trigger archival later # Build a list of bool flags based on the mask of PlineMarkers cleanFlagsList = [wflow['CleanupStatus'][key] for key in wflow['PlineMarkers'] if key in wflow['CleanupStatus']] # If no one have worked on the workflow set the clean status to false if not wflow['PlineMarkers']: cleanStatus = False # If we have a mask longer than the list of flags avoid false positives # because of the behavior explained above - `all([]) == True` elif not cleanFlagsList: cleanStatus = False # Figure out the final value else: cleanStatus = all(cleanFlagsList) return cleanStatus def setClean(self, wflow): """ A function to set the 'IsClean' flag based on the status from all the pipelines which have worked on the workflow (and have put their markers in the 'PlineMarkers' list) :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsClean'] = self._checkClean(wflow) return wflow def _checkLogDBClean(self, wflow): """ An auxiliary function used to only check the LogDB cleanup status. It makes a query to LogDB in order to verify there are no any records for the current workflow :param wflow: A MSRuleCleaner workflow representation :return: True if no records were found in LogDB about wflow """ cleanStatus = False logDBRecords = self.logDB.get(wflow['RequestName']) self.logger.debug("logDBRecords: %s", pformat(logDBRecords)) if not logDBRecords: cleanStatus = True return cleanStatus def setLogDBClean(self, wflow): """ A function to set the 'IsLogDBClean' flag based on the presence of any records in LogDB for the current workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ wflow['IsLogDBClean'] = self._checkLogDBClean(wflow) if not wflow['IsLogDBClean'] and wflow['IsArchivalDelayExpired']: wflow['IsLogDBClean'] = self._cleanLogDB(wflow) return wflow def _cleanLogDB(self, wflow): """ A function to be used for cleaning all the records related to a workflow in logDB. 
:param wflow: A MSRuleCleaner workflow representation :return: True if NO errors were encountered while deleting records from LogDB """ cleanStatus = False try: if self.msConfig['enableRealMode']: self.logger.info("Deleting %s records from LogDB WMStats...", wflow['RequestName']) res = self.logDB.delete(wflow['RequestName'], agent=False) if res == 'delete-error': msg = "Failed to delete logDB docs for wflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) cleanStatus = True else: self.logger.info("DRY-RUN: NOT Deleting %s records from LogDB WMStats...", wflow['RequestName']) except Exception as ex: msg = "General Exception while cleaning LogDB records for wflow: %s : %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) return cleanStatus def findTargetStatus(self, wflow): """ Find the proper targeted archival status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Check the available status transitions before we decide the final status targetStatusList = RequestStatus.REQUEST_STATE_TRANSITION.get(wflow['RequestStatus'], []) for status in targetStatusList: if self.targetStatusRegex.match(status): wflow['TargetStatus'] = status self.logger.debug("TargetStatus: %s", wflow['TargetStatus']) return wflow def _checkArchDelayExpired(self, wflow): """ A function to check Archival Expiration Delay based on the information returned by WMStatsServer regarding the time of the last request status transition :param wflow: MSRuleCleaner workflow representation :return: True if the archival delay have been expired """ archDelayExpired = False currentTime = int(time.time()) threshold = self.msConfig['archiveDelayHours'] * 3600 try: lastTransitionTime = wflow['RequestTransition'][-1]['UpdateTime'] if lastTransitionTime and (currentTime - lastTransitionTime) > threshold: archDelayExpired = True except KeyError: self.logger.debug("Could not find status transition history for %s", wflow['RequestName']) return archDelayExpired def setArchivalDelayExpired(self, wflow): """ A function to set the 'IsArchivalDelayExpired' flag """ wflow['IsArchivalDelayExpired'] = self._checkArchDelayExpired(wflow) return wflow def archive(self, wflow): """ Move the workflow to the proper archived status after checking the full cleanup status :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # Make all the needed checks before trying to archive if not (wflow['IsClean'] or wflow['ForceArchive']): msg = "Not properly cleaned workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['TargetStatus']: msg = "Could not determine which archival status to target for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalError(msg) if not wflow['IsLogDBClean']: msg = "LogDB records have not been cleaned for workflow: %s" % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not wflow['IsArchivalDelayExpired']: msg = "Archival delay period has not yet expired for workflow: %s." % wflow['RequestName'] raise MSRuleCleanerArchivalSkip(msg) if not self.msConfig['enableRealMode']: msg = "Real Run Mode not enabled." 
raise MSRuleCleanerArchivalSkip(msg) # Proceed with the actual archival: try: self.reqmgr2.updateRequestStatus(wflow['RequestName'], wflow['TargetStatus']) msg = "Successful status transition to: %s for workflow: %s" self.logger.info(msg, wflow['TargetStatus'], wflow['RequestName']) except Exception as ex: msg = "General Exception while trying status transition to: %s " % wflow['TargetStatus'] msg += "for workflow: %s : %s" % (wflow['RequestName'], str(ex)) raise MSRuleCleanerArchivalError(msg) return wflow def getMSOutputTransferInfo(self, wflow): """ Fetches the transfer information from the MSOutput REST interface for the given workflow. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ headers = {'Accept': 'application/json'} params = {} url = '%s/data/info?request=%s' % (self.msConfig['msOutputUrl'], wflow['RequestName']) try: res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) data = json.loads(res)['result'][0] transferInfo = data['transferDoc'] except Exception as ex: msg = "General exception while fetching TransferInfo from MSOutput for %s. " msg += "Error: %s" self.logger.exception(msg, wflow['RequestName'], str(ex)) # Set Transfer status - information fetched from MSOutput only if transferInfo is not None and transferInfo['TransferStatus'] == 'done': wflow['TransferDone'] = True # Set Tape rules status - information fetched from Rucio (tape rule ids from MSOutput) if transferInfo is not None and transferInfo['OutputMap']: tapeRulesStatusList = [] # For setting 'TransferTape' = True we require either no tape rules for the # workflow have been created or all existing tape rules to be in status 'OK', # so every empty TapeRuleID we consider as completed. for mapRecord in transferInfo['OutputMap']: if not mapRecord['TapeRuleID']: continue rucioRule = self.rucio.getRule(mapRecord['TapeRuleID']) if not rucioRule: tapeRulesStatusList.append(False) msg = "Tape rule: %s not found for workflow: %s " msg += "Possible server side error." self.logger.error(msg, mapRecord['TapeRuleID'], wflow['RequestName']) continue if rucioRule['state'] == 'OK': tapeRulesStatusList.append(True) msg = "Tape rule: %s in final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) else: tapeRulesStatusList.append(False) msg = "Tape rule: %s in non final state: %s for workflow: %s" self.logger.info(msg, mapRecord['TapeRuleID'], rucioRule['state'], wflow['RequestName']) if all(tapeRulesStatusList): wflow['TransferTape'] = True return wflow def setParentDatasets(self, wflow): """ Used to resolve parent datasets for a workflow. 
:param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ if wflow['InputDataset'] and wflow['IncludeParents']: childDataset = wflow['InputDataset'] parentDataset = findParent([childDataset], self.msConfig['dbsUrl']) # NOTE: If findParent() returned None then the DBS service failed to # resolve the request (it is considered an ERROR outside WMCore) if parentDataset.get(childDataset, None) is None: msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName']) raise MSRuleCleanerResolveParentError(msg) elif parentDataset: wflow['ParentDataset'] = [parentDataset[childDataset]] msg = "Found parent %s for input dataset %s in workflow: %s " self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName']) else: msg = "Could not find parent for input dataset: %s in workflows: %s" self.logger.error(msg, wflow['InputDataset'], wflow['RequestName']) return wflow def getRucioRules(self, wflow, gran, rucioAcct): """ Queries Rucio and builds the relevant list of blocklevel rules for the given workflow :param wflow: A MSRuleCleaner workflow representation :param gran: Data granularity to search for Rucio rules. Possible values: 'block' or 'container' :return: The workflow object """ currPline = wflow['PlineMarkers'][-1] # Create the container list to the rucio account map and set the checkGlobalLocks flag. mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"], self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup", "DataPileup", "ParentDataset"]} if rucioAcct == self.msConfig['rucioMStrAccount']: checkGlobalLocks = True else: checkGlobalLocks = False # Find all the data placement rules created by the components: for dataType in mapRuleType[rucioAcct]: dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]] for dataCont in dataList: if dataCont is None: continue self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont)) if checkGlobalLocks and dataCont in self.globalLocks: msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " msg += "RulesToClean list for both container and block level Rules for workflow: %s!" self.logger.info(msg, dataCont, wflow['RequestName']) continue if gran == 'container': for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s container-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) elif gran == 'block': try: blocks = self.rucio.getBlocksInContainer(dataCont) for block in blocks: for rule in self.rucio.listDataRules(block, account=rucioAcct): wflow['RulesToClean'][currPline].append(rule['id']) msg = "Found %s block-level rule to be deleted for container %s" self.logger.info(msg, rule['id'], dataCont) except WMRucioDIDNotFoundException: msg = "Container: %s not found in Rucio for workflow: %s." self.logger.info(msg, dataCont, wflow['RequestName']) return wflow def cleanRucioRules(self, wflow): """ Cleans all the Rules present in the field 'RulesToClean' in the MSRuleCleaner workflow representation. And fills the relevant Cleanup Status. :param wflow: A MSRuleCleaner workflow representation :return: The workflow object """ # NOTE: The function should be called independently and sequentially from # The Input and the respective BlockLevel pipelines. 
# NOTE: The current functional pipeline is always the last one in the PlineMarkers list currPline = wflow['PlineMarkers'][-1] delResults = [] if self.msConfig['enableRealMode']: for rule in wflow['RulesToClean'][currPline]: self.logger.info("%s: Deleting ruleId: %s ", currPline, rule) delResult = self.rucio.deleteRule(rule) delResults.append(delResult) if not delResult: self.logger.warning("%s: Failed to delete ruleId: %s ", currPline, rule) else: for rule in wflow['RulesToClean'][currPline]: delResults.append(True) self.logger.info("%s: DRY-RUN: Is about to delete ruleId: %s ", currPline, rule) # Set the cleanup flag: wflow['CleanupStatus'][currPline] = all(delResults) return wflow def getRequestRecords(self, reqStatus): """ Queries ReqMgr2 for requests in a given status. :param reqStatus: The status for the requests to be fetched from ReqMgr2 :return requests: A dictionary with all the workflows in the given status """ self.logger.info("Fetching requests in status: %s", reqStatus) result = self.reqmgr2.getRequestByStatus([reqStatus], detail=True) if not result: requests = {} else: requests = result[0] self.logger.info(' retrieved %s requests in status: %s', len(requests), reqStatus) return requests
class AlertManagerAPI(object):
    """
    A class used to send alerts via the MONIT AlertManager API
    """

    def __init__(self, alertManagerUrl, logger=None):
        self.alertManagerUrl = alertManagerUrl
        # sender's hostname is added as an annotation
        self.hostname = socket.gethostname()
        self.mgr = RequestHandler()
        self.ltz = LocalTimezone()
        self.headers = {"Content-Type": "application/json"}
        self.validSeverity = ["high", "medium", "low"]
        self.logger = logger if logger else logging.getLogger()

    def sendAlert(self, alertName, severity, summary, description, service,
                  tag="wmcore", endSecs=600, generatorURL=""):
        """
        :param alertName: a unique name for the alert
        :param severity: low, medium, high
        :param summary: a short description of the alert
        :param description: a longer informational message with details about the alert
        :param service: the name of the service firing an alert
        :param tag: a unique tag used to help route the alert
        :param endSecs: how many seconds until the alarm is silenced
        :param generatorURL: this URL will be sent to AlertManager and configured as a
            clickable "Source" link in the web interface

        AlertManager JSON format reference: https://www.prometheus.io/docs/alerting/latest/clients/
        [
          {
            "labels": {
              "alertname": "<requiredAlertName>",
              "<labelname>": "<labelvalue>",
              ...
            },
            "annotations": {
              "<labelname>": "<labelvalue>",
              ...
            },
            "startsAt": "<rfc3339>",  # optional, will be current time if not present
            "endsAt": "<rfc3339>",
            "generatorURL": "<generator_url>"  # optional
          },
        ]
        """
        if not self._isValidSeverity(severity):
            return False

        request = []
        alert = {}
        labels = {}
        annotations = {}

        # add labels
        labels["alertname"] = alertName
        labels["severity"] = severity
        labels["tag"] = tag
        labels["service"] = service
        alert["labels"] = labels

        # add annotations
        annotations["hostname"] = self.hostname
        annotations["summary"] = summary
        annotations["description"] = description
        alert["annotations"] = annotations

        # In python3 we won't need the LocalTimezone class
        # Will change to d = datetime.now().astimezone() + timedelta(seconds=endSecs)
        d = datetime.now(self.ltz) + timedelta(seconds=endSecs)
        alert["endsAt"] = d.isoformat("T")
        alert["generatorURL"] = generatorURL

        request.append(alert)
        # need to do this because pycurl_manager only accepts dict and encoded strings type
        params = json.dumps(request)

        res = self.mgr.getdata(self.alertManagerUrl, params=params,
                               headers=self.headers, verb='POST')
        return res

    def _isValidSeverity(self, severity):
        """
        Used to check if the severity of the alert matches the valid levels:
        low, medium, high
        :param severity: severity of the alert
        :return: True or False
        """
        if severity not in self.validSeverity:
            logging.critical("Alert submitted to AlertManagerAPI with invalid severity: %s", severity)
            return False
        return True
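# Usage sketch (not from the original module): sending a medium severity alert with
# AlertManagerAPI above; the AlertManager URL and all alert fields are hypothetical.
alertMgr = AlertManagerAPI("http://localhost:9093/api/v1/alerts", logger=logging.getLogger())
alertMgr.sendAlert(alertName="ms-example-failure",
                   severity="medium",
                   summary="Example service cycle failed",
                   description="Sketch alert showing the sendAlert() call signature",
                   service="example-service",
                   endSecs=600)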
class MicroServiceTest(unittest.TestCase):
    "Unit test for MicroService module"

    def setUp(self):
        "Setup MicroService for testing"
        self.managerName = "ServiceManager"
        config = TestConfig
        manager = 'WMCore_t.MicroService_t.MicroService_t.%s' % self.managerName
        config.views.data.manager = manager
        config.manager = manager
        mount = '/microservice/data'
        self.mgr = RequestHandler()
        self.port = config.main.port
        self.url = 'http://localhost:%s%s' % (self.port, mount)
        cherrypy.config["server.socket_port"] = self.port
        self.app = ServiceManager(config)
        self.server = RestApiHub(self.app, config, mount)
        cherrypy.tree.mount(self.server, mount)
        cherrypy.engine.start()

    def tearDown(self):
        "Tear down MicroService"
        cherrypy.engine.stop()
        cherrypy.engine.exit()

    def postRequest(self, apiName, params):
        "Perform POST request to our MicroService"
        headers = {'Content-type': 'application/json'}
        url = self.url + "/%s" % apiName
        data = self.mgr.getdata(url, params=params, headers=headers,
                                verb='POST', cert=cert(), ckey=ckey(), encode=True, decode=True)
        print("### post call data %s" % data)
        return data

    def testGetStatus(self):
        "Test function for getting status of the MicroService"
        api = "status"
        url = '%s/%s' % (self.url, api)
        params = {}
        data = self.mgr.getdata(url, params=params, encode=True, decode=True)
        self.assertEqual(data['result'][0]['microservice'], self.managerName)
        self.assertEqual(data['result'][0]['api'], api)
        params = {"service": "transferor"}
        data = self.mgr.getdata(url, params=params, encode=True, decode=True)
        self.assertEqual(data['result'][0]['microservice'], self.managerName)
        self.assertEqual(data['result'][0]['api'], api)

    def testGetInfo(self):
        "Test function for getting info from the MicroService"
        api = "status"
        url = '%s/%s' % (self.url, api)
        params = {}
        data = self.mgr.getdata(url, params=params, encode=True, decode=True)
        self.assertEqual(data['result'][0]['microservice'], self.managerName)
        self.assertEqual(data['result'][0]['api'], api)
        params = {"request": "fake_request_name"}
        data = self.mgr.getdata(url, params=params, encode=True, decode=True)
        self.assertEqual(data['result'][0]['microservice'], self.managerName)
        self.assertEqual(data['result'][0]['api'], api)

    def testPostCall(self):
        "Test function for posting data to the MicroService"
        api = "status"
        data = {"request": "fake_request_name"}
        data = self.postRequest(api, data)
        self.assertDictEqual(data['result'][0], {'status': 'OK', 'api': 'info'})